Chrunos committed on
Commit
e862404
·
verified ·
1 Parent(s): c8574d4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -31
app.py CHANGED
@@ -133,54 +133,51 @@ async def get_transcript(youtube_url: str):
133
  'skip_download': True,
134
  'writesubtitles': True,
135
  'writeautomaticsub': True,
136
- 'subtitleslangs': ['en'], # Adjust regex if needed, e.g., 'en.*'
137
- 'subtitlesformat': 'srt', # Use 'srt' for easier parsing
138
- 'outtmpl': '%(id)s', # Output template without ext to avoid conflicts
139
  'noplaylist': True,
140
  'cookiefile': "firefox-cookies.txt",
141
  }
142
 
143
- # Load cookies (ensure this function correctly sets up the cookie file)
144
  env_to_cookies_from_env("firefox-cookies.txt")
145
 
146
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
147
  info = ydl.extract_info(youtube_url, download=False)
148
  video_id = info['id']
149
 
150
- # Check for manually created subtitles
151
- subtitles = info.get('subtitles', {})
152
- # Check for auto-generated captions
153
- auto_subtitles = info.get('automatic_captions', {})
154
-
155
- transcript_lang = None
156
- # Prioritize manual subs
157
  for lang in ydl_opts['subtitleslangs']:
158
- if lang in subtitles:
159
- transcript_lang = lang
160
- break
161
- # Fallback to auto subs
162
- if not transcript_lang:
163
- for lang in ydl_opts['subtitleslangs']:
164
- if lang in auto_subtitles:
165
- transcript_lang = lang
166
  break
 
 
167
 
168
- if transcript_lang:
169
- # Look for the downloaded subtitle file
170
- subtitle_file = f"{video_id}.{transcript_lang}.srt"
171
  if os.path.exists(subtitle_file):
172
  with open(subtitle_file, 'r', encoding='utf-8') as f:
173
- srt_content = f.read()
174
- # Simple parsing to extract text (consider using a library like pysrt)
175
- text = ' '.join(line.strip() for line in srt_content.split('\n')
176
- if not line.strip().isdigit()
177
- and '-->' not in line
178
- and line.strip())
179
- return {'transcript': text}
 
 
 
 
 
 
180
  else:
181
- return {'transcript': 'Subtitle file not found'}
182
  else:
183
- return {'transcript': 'No transcript available'}
184
 
185
  except Exception as e:
186
  raise HTTPException(status_code=500, detail=str(e))
 
133
  'skip_download': True,
134
  'writesubtitles': True,
135
  'writeautomaticsub': True,
136
+ 'subtitleslangs': ['en'], # Try 'en.*' for all English variants
137
+ 'subtitlesformat': 'best', # Let yt-dlp choose best available format
138
+ 'outtmpl': '%(id)s', # Output template without extension
139
  'noplaylist': True,
140
  'cookiefile': "firefox-cookies.txt",
141
  }
142
 
 
143
  env_to_cookies_from_env("firefox-cookies.txt")
144
 
145
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
146
  info = ydl.extract_info(youtube_url, download=False)
147
  video_id = info['id']
148
 
149
+ # Find available subtitle format
150
+ sub_ext = None
 
 
 
 
 
151
  for lang in ydl_opts['subtitleslangs']:
152
+ for sub_type in ['subtitles', 'automatic_captions']:
153
+ subs = info.get(sub_type, {}).get(lang, [])
154
+ if subs:
155
+ sub_ext = subs[0].get('ext', 'vtt')
 
 
 
 
156
  break
157
+ if sub_ext:
158
+ break
159
 
160
+ if sub_ext:
161
+ subtitle_file = f"{video_id}.{lang}.{sub_ext}"
 
162
  if os.path.exists(subtitle_file):
163
  with open(subtitle_file, 'r', encoding='utf-8') as f:
164
+ content = f.read()
165
+
166
+ # Parse both VTT and SRT formats
167
+ lines = content.split('\n')
168
+ text_lines = []
169
+ for line in lines:
170
+ # Skip timestamps and metadata
171
+ if '-->' in line or line.strip().isdigit() or line.startswith('WEBVTT'):
172
+ continue
173
+ if line.strip():
174
+ text_lines.append(line.strip())
175
+
176
+ return {"transcript": ' '.join(text_lines)}
177
  else:
178
+ return {"transcript": f"Found subtitles but file {subtitle_file} missing"}
179
  else:
180
+ return {"transcript": "No subtitles available in requested languages"}
181
 
182
  except Exception as e:
183
  raise HTTPException(status_code=500, detail=str(e))