Update app.py
Browse files
app.py
CHANGED
@@ -133,9 +133,9 @@ async def get_transcript(youtube_url: str):
|
|
133 |
'skip_download': True,
|
134 |
'writesubtitles': True,
|
135 |
'writeautomaticsub': True,
|
136 |
-
'subtitleslangs': ['en'], #
|
137 |
-
'subtitlesformat': '
|
138 |
-
'outtmpl': '%(id)s', # Output template
|
139 |
'noplaylist': True,
|
140 |
'cookiefile': "firefox-cookies.txt",
|
141 |
}
|
@@ -146,38 +146,33 @@ async def get_transcript(youtube_url: str):
|
|
146 |
info = ydl.extract_info(youtube_url, download=False)
|
147 |
video_id = info['id']
|
148 |
|
149 |
-
#
|
150 |
-
sub_ext =
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
# Parse both VTT and SRT formats
|
167 |
-
lines = content.split('\n')
|
168 |
-
text_lines = []
|
169 |
-
for line in lines:
|
170 |
-
# Skip timestamps and metadata
|
171 |
-
if '-->' in line or line.strip().isdigit() or line.startswith('WEBVTT'):
|
172 |
-
continue
|
173 |
-
if line.strip():
|
174 |
-
text_lines.append(line.strip())
|
175 |
-
|
176 |
-
return {"transcript": ' '.join(text_lines)}
|
177 |
-
else:
|
178 |
-
return {"transcript": f"Found subtitles but file {subtitle_file} missing"}
|
179 |
else:
|
180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
|
182 |
except Exception as e:
|
183 |
raise HTTPException(status_code=500, detail=str(e))
|
|
|
133 |
'skip_download': True,
|
134 |
'writesubtitles': True,
|
135 |
'writeautomaticsub': True,
|
136 |
+
'subtitleslangs': ['en'], # Target language
|
137 |
+
'subtitlesformat': 'json3', # Force JSON format
|
138 |
+
'outtmpl': '%(id)s', # Output template
|
139 |
'noplaylist': True,
|
140 |
'cookiefile': "firefox-cookies.txt",
|
141 |
}
|
|
|
146 |
info = ydl.extract_info(youtube_url, download=False)
|
147 |
video_id = info['id']
|
148 |
|
149 |
+
# Get actual downloaded subtitle format
|
150 |
+
sub_ext = 'json3' # Since we're forcing json3 format
|
151 |
+
lang = ydl_opts['subtitleslangs'][0]
|
152 |
+
|
153 |
+
subtitle_file = f"{video_id}.{lang}.{sub_ext}"
|
154 |
+
if os.path.exists(subtitle_file):
|
155 |
+
with open(subtitle_file, 'r', encoding='utf-8') as f:
|
156 |
+
subs = json.load(f)
|
157 |
+
|
158 |
+
# Extract text from JSON3 format
|
159 |
+
text = ' '.join(
|
160 |
+
[event['segs'][0]['utf8']
|
161 |
+
for event in subs['events']
|
162 |
+
if 'segs' in event and event['segs']
|
163 |
+
]
|
164 |
+
)
|
165 |
+
return {"transcript": text}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
else:
|
167 |
+
# Fallback check for other possible formats
|
168 |
+
possible_exts = ['vtt', 'srt', 'ttml', 'json3']
|
169 |
+
for ext in possible_exts:
|
170 |
+
fallback_file = f"{video_id}.{lang}.{ext}"
|
171 |
+
if os.path.exists(fallback_file):
|
172 |
+
# Handle other formats if needed
|
173 |
+
return {"transcript": f"Found {ext} but parsing not implemented"}
|
174 |
+
|
175 |
+
return {"transcript": f"No subtitle file found for {video_id}"}
|
176 |
|
177 |
except Exception as e:
|
178 |
raise HTTPException(status_code=500, detail=str(e))
|