Chrunos committed on
Commit
2df456e
·
verified ·
1 Parent(s): f8061b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -69
app.py CHANGED
@@ -132,23 +132,16 @@ async def get_transcript(youtube_url: str):
132
  try:
133
  ydl_opts = {
134
  'skip_download': True,
135
- 'writesubtitles': True,
136
- 'writeautomaticsub': True,
137
- # 精确指定中英文语言代码
138
- 'subtitleslangs': [
139
- 'zh', 'zh-Hans', 'zh-Hant', 'zh-CN', 'zh-TW', # 中文
140
- 'en', 'en-US', 'en-GB', 'en-AU' # 英文
141
- ],
142
- # 允许自动生成字幕作为后备
143
- 'subtitleslangs_automatic': True,
144
  'subtitlesformat': 'best',
145
  'outtmpl': '%(id)s.%(ext)s',
146
  'noplaylist': True,
147
  'cookiefile': "firefox-cookies.txt",
148
- # 优化网络请求参数
149
- 'retries': 5,
150
- 'sleep_interval': 15,
151
- 'ignoreerrors': True,
152
  }
153
 
154
  env_to_cookies_from_env("firefox-cookies.txt")
@@ -157,72 +150,35 @@ async def get_transcript(youtube_url: str):
157
  info = ydl.extract_info(youtube_url, download=True)
158
  video_id = info['id']
159
 
160
- # 获取实际可用的字幕语言
161
- available_langs = []
162
- for sub_type in ['subtitles', 'automatic_captions']:
163
- subs = info.get(sub_type, {})
164
- available_langs.extend(subs.keys())
165
 
166
- logger.info(f"Available subtitle languages: {list(set(available_langs))}")
167
-
168
- # 构建优先级列表(中文 > 英文 > 其他)
169
- priority_map = {
170
- 'zh': 0, 'zh-Hans': 1, 'zh-Hant': 2, 'zh-CN': 3, 'zh-TW': 4,
171
- 'en': 5, 'en-US': 6, 'en-GB': 7, 'en-AU': 8
172
- }
173
 
174
- # 查找最佳匹配字幕文件
175
- best_sub = None
176
- for f in os.listdir('.'):
177
- if f.startswith(video_id):
178
- parts = f.split('.')
179
- if len(parts) >= 3:
180
- lang_code = parts[-2]
181
- # 处理复合语言代码(如 en-US)
182
- base_lang = lang_code.split('-')[0]
183
- priority = priority_map.get(base_lang, 99)
184
- if not best_sub or priority < best_sub[1]:
185
- best_sub = (f, priority)
186
-
187
- if best_sub:
188
- subtitle_file = best_sub[0]
189
  with open(subtitle_file, 'r', encoding='utf-8') as f:
190
  content = f.read()
191
-
192
- # 增强型文本清理
193
- clean_text = re.sub(
194
- r'\d{2}:\d{2}:\d{2}[,.]\d{3}.*?(\n|$)|' # 时间戳
195
- r'<[^>]+>|{[^}]+}|^\s*WEBVTT\s*$|' # HTML/样式标签
196
- r'^\d+\s*$', # 序号行
197
- '',
198
- content,
199
- flags=re.MULTILINE
200
- ).strip()
201
-
202
- # 合并重复的空行
203
- clean_text = re.sub(r'\n{3,}', '\n\n', clean_text)
204
-
205
  return {
206
- "transcript": clean_text,
207
- "detected_lang": subtitle_file.split('.')[-2],
208
- "available_languages": available_langs
209
  }
210
-
211
  return {
212
- "error": "No subtitles available",
213
- "available_languages": available_langs
214
  }
215
 
216
  except Exception as e:
217
- logger.error(f"Error: {str(e)}")
218
- return {
219
- "error": f"Processing failed: {str(e)}",
220
- "advice": [
221
- "Try adding '&tlang=en' to URL for translated subs",
222
- "Check if cookies are still valid",
223
- "Reduce request frequency if seeing 429 errors"
224
- ]
225
- }
226
 
227
 
228
 
 
132
  try:
133
  ydl_opts = {
134
  'skip_download': True,
135
+ 'writesubtitles': True, # 仅下载手动上传的字幕
136
+ 'writeautomaticsub': False, # 禁用自动生成字幕
137
+ 'subtitleslangs': ['original'], # 关键参数:只获取原始语言
 
 
 
 
 
 
138
  'subtitlesformat': 'best',
139
  'outtmpl': '%(id)s.%(ext)s',
140
  'noplaylist': True,
141
  'cookiefile': "firefox-cookies.txt",
142
+ # 禁用所有翻译功能
143
+ 'postprocessors': [],
144
+ 'compat_opts': {'no-sub-translate'},
 
145
  }
146
 
147
  env_to_cookies_from_env("firefox-cookies.txt")
 
150
  info = ydl.extract_info(youtube_url, download=True)
151
  video_id = info['id']
152
 
153
+ # 获取原始语言代码(如视频是英语则返回en)
154
+ original_lang = info.get('original_lang') or 'en'
 
 
 
155
 
156
+ # 查找匹配原始语言的字幕文件
157
+ subtitle_pattern = f"{video_id}.{original_lang}.*"
158
+ subtitle_files = [f for f in os.listdir('.')
159
+ if re.match(subtitle_pattern, f)]
 
 
 
160
 
161
+ logger.info(f"原始语言字幕文件: {subtitle_files}")
162
+
163
+ if subtitle_files:
164
+ # 选择第一个匹配的文件
165
+ subtitle_file = subtitle_files[0]
 
 
 
 
 
 
 
 
 
 
166
  with open(subtitle_file, 'r', encoding='utf-8') as f:
167
  content = f.read()
168
+
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  return {
170
+ "transcript": content,
171
+ "language": original_lang,
172
+ "is_translated": False
173
  }
174
+
175
  return {
176
+ "error": "无原始语言字幕",
177
+ "available_langs": list(info.get('subtitles', {}).keys())
178
  }
179
 
180
  except Exception as e:
181
+ return {"error": str(e)}
 
 
 
 
 
 
 
 
182
 
183
 
184