Chrunos committed on
Commit
f8061b4
·
verified ·
1 Parent(s): 3d3b80f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -37
app.py CHANGED
@@ -134,14 +134,20 @@ async def get_transcript(youtube_url: str):
134
  'skip_download': True,
135
  'writesubtitles': True,
136
  'writeautomaticsub': True,
137
- # 同时支持中英文及变体
138
- 'subtitleslangs': ['zh*', 'en*'], # 包含所有中文和英语变体
 
 
 
 
 
139
  'subtitlesformat': 'best',
140
  'outtmpl': '%(id)s.%(ext)s',
141
  'noplaylist': True,
142
  'cookiefile': "firefox-cookies.txt",
143
- 'retries': 3,
144
- 'sleep_interval': 10, # 增加请求间隔
 
145
  'ignoreerrors': True,
146
  }
147
 
@@ -151,56 +157,72 @@ async def get_transcript(youtube_url: str):
151
  info = ydl.extract_info(youtube_url, download=True)
152
  video_id = info['id']
153
 
154
- # 语言优先级列表(中文 > 英文 > 其他)
155
- priority_order = [
156
- 'zh', 'zh-Hans', 'zh-Hant', 'zh-CN', 'zh-TW', # 中文
157
- 'en', 'en-US', 'en-GB', # 英文
158
- ]
159
 
160
- # 获取所有字幕文件并按优先级排序
161
- subtitle_files = []
 
 
 
 
 
 
 
 
162
  for f in os.listdir('.'):
163
  if f.startswith(video_id):
164
- lang_part = f.split('.')[-2]
165
- # 检查是否包含优先语言代码
166
- for lang in priority_order:
167
- if lang in lang_part:
168
- subtitle_files.append(f)
169
- break
170
-
171
- # 按优先级排序文件
172
- subtitle_files = sorted(
173
- subtitle_files,
174
- key=lambda x: min(
175
- priority_order.index(lang)
176
- for lang in priority_order
177
- if lang in x
178
- )
179
- )
180
-
181
- if subtitle_files:
182
- selected_file = subtitle_files[0]
183
- with open(selected_file, 'r', encoding='utf-8') as f:
184
  content = f.read()
185
 
186
- # 统一清理逻辑
187
  clean_text = re.sub(
188
- r'\d{2}:\d{2}:\d{2}[\.\,]\d{3}.*?(\n|$)|'
189
- r'<.*?>|{.*?}|^WEBVTT$',
 
190
  '',
191
  content,
192
  flags=re.MULTILINE
193
  ).strip()
194
 
 
 
 
195
  return {
196
  "transcript": clean_text,
197
- "detected_lang": selected_file.split('.')[-2]
 
198
  }
199
 
200
- return {"error": "No subtitles found"}
 
 
 
201
 
202
  except Exception as e:
203
- return {"error": str(e)}
 
 
 
 
 
 
 
 
204
 
205
 
206
 
 
134
  'skip_download': True,
135
  'writesubtitles': True,
136
  'writeautomaticsub': True,
137
+ # 精确指定中英文语言代码
138
+ 'subtitleslangs': [
139
+ 'zh', 'zh-Hans', 'zh-Hant', 'zh-CN', 'zh-TW', # 中文
140
+ 'en', 'en-US', 'en-GB', 'en-AU' # 英文
141
+ ],
142
+ # 允许自动生成字幕作为后备
143
+ 'subtitleslangs_automatic': True,
144
  'subtitlesformat': 'best',
145
  'outtmpl': '%(id)s.%(ext)s',
146
  'noplaylist': True,
147
  'cookiefile': "firefox-cookies.txt",
148
+ # 优化网络请求参数
149
+ 'retries': 5,
150
+ 'sleep_interval': 15,
151
  'ignoreerrors': True,
152
  }
153
 
 
157
  info = ydl.extract_info(youtube_url, download=True)
158
  video_id = info['id']
159
 
160
+ # 获取实际可用的字幕语言
161
+ available_langs = []
162
+ for sub_type in ['subtitles', 'automatic_captions']:
163
+ subs = info.get(sub_type, {})
164
+ available_langs.extend(subs.keys())
165
 
166
+ logger.info(f"Available subtitle languages: {list(set(available_langs))}")
167
+
168
+ # 构建优先级列表(中文 > 英文 > 其他)
169
+ priority_map = {
170
+ 'zh': 0, 'zh-Hans': 1, 'zh-Hant': 2, 'zh-CN': 3, 'zh-TW': 4,
171
+ 'en': 5, 'en-US': 6, 'en-GB': 7, 'en-AU': 8
172
+ }
173
+
174
+ # 查找最佳匹配字幕文件
175
+ best_sub = None
176
  for f in os.listdir('.'):
177
  if f.startswith(video_id):
178
+ parts = f.split('.')
179
+ if len(parts) >= 3:
180
+ lang_code = parts[-2]
181
+ # 处理复合语言代码(如 en-US)
182
+ base_lang = lang_code.split('-')[0]
183
+ priority = priority_map.get(base_lang, 99)
184
+ if not best_sub or priority < best_sub[1]:
185
+ best_sub = (f, priority)
186
+
187
+ if best_sub:
188
+ subtitle_file = best_sub[0]
189
+ with open(subtitle_file, 'r', encoding='utf-8') as f:
 
 
 
 
 
 
 
 
190
  content = f.read()
191
 
192
+ # 增强型文本清理
193
  clean_text = re.sub(
194
+ r'\d{2}:\d{2}:\d{2}[,.]\d{3}.*?(\n|$)|' # 时间戳
195
+ r'<[^>]+>|{[^}]+}|^\s*WEBVTT\s*$|' # HTML/样式标签
196
+ r'^\d+\s*$', # 序号行
197
  '',
198
  content,
199
  flags=re.MULTILINE
200
  ).strip()
201
 
202
+ # 合并重复的空行
203
+ clean_text = re.sub(r'\n{3,}', '\n\n', clean_text)
204
+
205
  return {
206
  "transcript": clean_text,
207
+ "detected_lang": subtitle_file.split('.')[-2],
208
+ "available_languages": available_langs
209
  }
210
 
211
+ return {
212
+ "error": "No subtitles available",
213
+ "available_languages": available_langs
214
+ }
215
 
216
  except Exception as e:
217
+ logger.error(f"Error: {str(e)}")
218
+ return {
219
+ "error": f"Processing failed: {str(e)}",
220
+ "advice": [
221
+ "Try adding '&tlang=en' to URL for translated subs",
222
+ "Check if cookies are still valid",
223
+ "Reduce request frequency if seeing 429 errors"
224
+ ]
225
+ }
226
 
227
 
228