Chrunos committed on
Commit
f2946ca
·
verified ·
1 Parent(s): e862404

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -34
app.py CHANGED
@@ -133,9 +133,9 @@ async def get_transcript(youtube_url: str):
133
  'skip_download': True,
134
  'writesubtitles': True,
135
  'writeautomaticsub': True,
136
- 'subtitleslangs': ['en'], # Try 'en.*' for all English variants
137
- 'subtitlesformat': 'best', # Let yt-dlp choose best available format
138
- 'outtmpl': '%(id)s', # Output template without extension
139
  'noplaylist': True,
140
  'cookiefile': "firefox-cookies.txt",
141
  }
@@ -146,38 +146,33 @@ async def get_transcript(youtube_url: str):
146
  info = ydl.extract_info(youtube_url, download=False)
147
  video_id = info['id']
148
 
149
- # Find available subtitle format
150
- sub_ext = None
151
- for lang in ydl_opts['subtitleslangs']:
152
- for sub_type in ['subtitles', 'automatic_captions']:
153
- subs = info.get(sub_type, {}).get(lang, [])
154
- if subs:
155
- sub_ext = subs[0].get('ext', 'vtt')
156
- break
157
- if sub_ext:
158
- break
159
-
160
- if sub_ext:
161
- subtitle_file = f"{video_id}.{lang}.{sub_ext}"
162
- if os.path.exists(subtitle_file):
163
- with open(subtitle_file, 'r', encoding='utf-8') as f:
164
- content = f.read()
165
-
166
- # Parse both VTT and SRT formats
167
- lines = content.split('\n')
168
- text_lines = []
169
- for line in lines:
170
- # Skip timestamps and metadata
171
- if '-->' in line or line.strip().isdigit() or line.startswith('WEBVTT'):
172
- continue
173
- if line.strip():
174
- text_lines.append(line.strip())
175
-
176
- return {"transcript": ' '.join(text_lines)}
177
- else:
178
- return {"transcript": f"Found subtitles but file {subtitle_file} missing"}
179
  else:
180
- return {"transcript": "No subtitles available in requested languages"}
 
 
 
 
 
 
 
 
181
 
182
  except Exception as e:
183
  raise HTTPException(status_code=500, detail=str(e))
 
133
  'skip_download': True,
134
  'writesubtitles': True,
135
  'writeautomaticsub': True,
136
+ 'subtitleslangs': ['en'], # Target language
137
+ 'subtitlesformat': 'json3', # Force JSON format
138
+ 'outtmpl': '%(id)s', # Output template
139
  'noplaylist': True,
140
  'cookiefile': "firefox-cookies.txt",
141
  }
 
146
  info = ydl.extract_info(youtube_url, download=False)
147
  video_id = info['id']
148
 
149
+ # Get actual downloaded subtitle format
150
+ sub_ext = 'json3' # Since we're forcing json3 format
151
+ lang = ydl_opts['subtitleslangs'][0]
152
+
153
+ subtitle_file = f"{video_id}.{lang}.{sub_ext}"
154
+ if os.path.exists(subtitle_file):
155
+ with open(subtitle_file, 'r', encoding='utf-8') as f:
156
+ subs = json.load(f)
157
+
158
+ # Extract text from JSON3 format
159
+ text = ' '.join(
160
+ [event['segs'][0]['utf8']
161
+ for event in subs['events']
162
+ if 'segs' in event and event['segs']
163
+ ]
164
+ )
165
+ return {"transcript": text}
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  else:
167
+ # Fallback check for other possible formats
168
+ possible_exts = ['vtt', 'srt', 'ttml', 'json3']
169
+ for ext in possible_exts:
170
+ fallback_file = f"{video_id}.{lang}.{ext}"
171
+ if os.path.exists(fallback_file):
172
+ # Handle other formats if needed
173
+ return {"transcript": f"Found {ext} but parsing not implemented"}
174
+
175
+ return {"transcript": f"No subtitle file found for {video_id}"}
176
 
177
  except Exception as e:
178
  raise HTTPException(status_code=500, detail=str(e))