Chrunos committed on
Commit
942ffc3
·
verified ·
1 Parent(s): 26dfd2b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -29
app.py CHANGED
@@ -134,54 +134,93 @@ async def get_transcript(youtube_url: str):
134
  'skip_download': True,
135
  'writesubtitles': True,
136
  'writeautomaticsub': True,
137
- 'subtitleslangs': ['en'],
 
 
138
  'subtitlesformat': 'best',
139
- 'outtmpl': '%(id)s.%(ext)s',
 
140
  'noplaylist': True,
141
- 'cookiefile': "firefox-cookies.txt"
 
 
142
  }
 
143
  env_to_cookies_from_env("firefox-cookies.txt")
144
- # Show current directory structure before download
145
- logger.info(f"Current directory files (before): {os.listdir('.')}")
146
-
147
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
148
  info = ydl.extract_info(youtube_url, download=True)
149
  video_id = info['id']
150
- logger.info(f"Video ID: {video_id}")
151
 
 
 
 
 
 
 
 
 
152
 
153
- # Check actual downloaded files
154
- logger.info(f"Current directory files (after extraction): {os.listdir('.')}")
 
 
 
 
 
 
 
155
 
156
- # Search for subtitle files pattern
157
- subtitle_files = [f for f in os.listdir('.')
158
- if f.startswith(video_id) and ('en' in f)]
159
- logger.info(f"Potential subtitle files: {subtitle_files}")
160
 
161
  if subtitle_files:
162
- # Process the first found subtitle file
163
  subtitle_file = subtitle_files[0]
164
- logger.info(f"Processing subtitle file: {subtitle_file}")
165
 
166
  with open(subtitle_file, 'r', encoding='utf-8') as f:
167
  content = f.read()
168
-
169
- # Add format-specific parsing
170
  if subtitle_file.endswith('.json3'):
171
- import json
172
  subs = json.loads(content)
173
- text = ' '.join([e['segs'][0]['utf8'] for e in subs['events'] if e.get('segs')])
174
- elif subtitle_file.endswith('.vtt'):
175
- text = ' '.join(line.strip() for line in content.split('\n')
176
- if not line.startswith('WEBVTT')
177
- and '-->' not in line
178
- and not line.strip().isdigit())
 
 
 
 
 
 
179
  else:
180
- text = f"Unsupported format: {subtitle_file}"
181
-
182
- return {"transcript": text}
183
-
184
- return {"transcript": f"No subtitle files found for {video_id}"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
  except Exception as e:
187
  logger.error(f"Error: {str(e)}", exc_info=True)
 
134
  'skip_download': True,
135
  'writesubtitles': True,
136
  'writeautomaticsub': True,
137
+ # 扩展支持的语言列表(包含中英文)
138
+ 'subtitleslangs': ['en', 'zh', 'zh-Hans', 'zh-Hant', 'zh-CN', 'zh-TW', 'a.*'],
139
+ # 自动选择最佳字幕格式
140
  'subtitlesformat': 'best',
141
+ # 包含原始语言代码在文件名中
142
+ 'outtmpl': '%(id)s.%(ext)s',
143
  'noplaylist': True,
144
+ 'cookiefile': "firefox-cookies.txt",
145
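+ # NOTE(review): 'no-youtube-unavailable-videos' only controls whether unavailable videos are listed in YouTube playlists; it does not affect subtitle selection (and 'noplaylist' is already set) — confirm whether this option is needed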
146
+ 'compat_opts': {'no-youtube-unavailable-videos'},
147
  }
148
+
149
  env_to_cookies_from_env("firefox-cookies.txt")
150
+
 
 
151
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
152
  info = ydl.extract_info(youtube_url, download=True)
153
  video_id = info['id']
 
154
 
155
+ # 获取实际可用的字幕语言列表
156
+ available_subs = []
157
+ for sub_type in ['subtitles', 'automatic_captions']:
158
+ subs = info.get(sub_type, {})
159
+ available_subs.extend([
160
+ f"{lang}.{track[0]['ext']}"
161
+ for lang, track in subs.items()
162
+ ])
163
 
164
+ # 优先顺序:中文 > 英文 > 其他语言
165
+ priority_langs = ['zh', 'en']
166
+ subtitle_files = sorted(
167
+ [f for f in os.listdir('.') if f.startswith(video_id)],
168
+ key=lambda x: (
169
+ -max([x.find(lang) for lang in priority_langs]),
170
+ len(x)
171
+ )
172
+ )
173
 
174
+ logger.info(f"Detected subtitle files: {subtitle_files}")
 
 
 
175
 
176
  if subtitle_files:
 
177
  subtitle_file = subtitle_files[0]
178
+ logger.info(f"Selected subtitle: {subtitle_file}")
179
 
180
  with open(subtitle_file, 'r', encoding='utf-8') as f:
181
  content = f.read()
182
+
183
+ # 统一处理不同字幕格式
184
  if subtitle_file.endswith('.json3'):
 
185
  subs = json.loads(content)
186
+ text = ''.join(
187
+ e['segs'][0]['utf8']
188
+ for e in subs['events']
189
+ if e.get('segs')
190
+ )
191
+ elif subtitle_file.endswith(('.vtt', '.srt')):
192
+ text = re.sub(
193
+ r'\d{2}:\d{2}:\d{2}[\.,]\d{3}.*?\n',
194
+ '',
195
+ content
196
+ )
197
+ text = re.sub(r'<.*?>|{.*?}', '', text)
198
  else:
199
+ text = content # 原始内容
200
+
201
+ # 中文标点标准化
202
+ text = text.translate(str.maketrans({
203
+ ',': ',',
204
+ '。': '.',
205
+ ';': ';',
206
+ '!': '!',
207
+ '?': '?',
208
+ '“': '"',
209
+ '”': '"',
210
+ '‘': "'",
211
+ '’': "'"
212
+ }))
213
+
214
+ return {
215
+ "transcript": text,
216
+ "detected_language": subtitle_file.split('.')[-2],
217
+ "available_languages": available_subs
218
+ }
219
+
220
+ return {
221
+ "error": "No subtitles found",
222
+ "available_languages": available_subs
223
+ }
224
 
225
  except Exception as e:
226
  logger.error(f"Error: {str(e)}", exc_info=True)