Chrunos committed on
Commit
220037e
·
verified ·
1 Parent(s): 2df456e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -24
app.py CHANGED
@@ -132,16 +132,15 @@ async def get_transcript(youtube_url: str):
132
  try:
133
  ydl_opts = {
134
  'skip_download': True,
135
- 'writesubtitles': True, # 仅下载手动上传的字幕
136
- 'writeautomaticsub': False, # 禁用自动生成字幕
137
- 'subtitleslangs': ['original'], # 关键参数:只获取原始语言
138
  'subtitlesformat': 'best',
139
  'outtmpl': '%(id)s.%(ext)s',
140
  'noplaylist': True,
141
  'cookiefile': "firefox-cookies.txt",
142
- # 禁用所有翻译功能
143
- 'postprocessors': [],
144
- 'compat_opts': {'no-sub-translate'},
145
  }
146
 
147
  env_to_cookies_from_env("firefox-cookies.txt")
@@ -150,35 +149,55 @@ async def get_transcript(youtube_url: str):
150
  info = ydl.extract_info(youtube_url, download=True)
151
  video_id = info['id']
152
 
153
- # 获取原始语言代码(如视频是英语则返回en)
154
- original_lang = info.get('original_lang') or 'en'
155
-
156
- # 查找匹配原始语言的字幕文件
157
- subtitle_pattern = f"{video_id}.{original_lang}.*"
158
- subtitle_files = [f for f in os.listdir('.')
159
- if re.match(subtitle_pattern, f)]
 
 
 
160
 
161
- logger.info(f"原始语言字幕文件: {subtitle_files}")
 
 
 
 
 
 
162
 
163
  if subtitle_files:
164
- # 选择第一个匹配的文件
165
- subtitle_file = subtitle_files[0]
166
- with open(subtitle_file, 'r', encoding='utf-8') as f:
 
 
167
  content = f.read()
168
-
 
 
 
 
 
 
 
 
169
  return {
170
- "transcript": content,
171
  "language": original_lang,
172
- "is_translated": False
173
  }
174
-
175
  return {
176
- "error": "无原始语言字幕",
177
- "available_langs": list(info.get('subtitles', {}).keys())
 
178
  }
179
 
180
  except Exception as e:
181
- return {"error": str(e)}
182
 
183
 
184
 
 
132
  try:
133
  ydl_opts = {
134
  'skip_download': True,
135
+ 'writesubtitles': True,
136
+ 'writeautomaticsub': True, # 启用自动字幕作为备选
137
+ 'subtitleslangs': ['original'], # 优先原始语言
138
  'subtitlesformat': 'best',
139
  'outtmpl': '%(id)s.%(ext)s',
140
  'noplaylist': True,
141
  'cookiefile': "firefox-cookies.txt",
142
+ 'compat_opts': {'no-sub-translate'}, # 保持原始语言
143
+ 'retries': 3,
 
144
  }
145
 
146
  env_to_cookies_from_env("firefox-cookies.txt")
 
149
  info = ydl.extract_info(youtube_url, download=True)
150
  video_id = info['id']
151
 
152
+ # 精确获取原始语言(修复YouTube API返回不一致问题)
153
+ original_lang = info.get('original_lang')
154
+ if not original_lang:
155
+ original_lang = info.get('subtitles', {}).get('original', [{}])[0].get('language')
156
+ original_lang = original_lang or 'en'
157
+
158
+ # 扩展匹配模式(包含自动生成字幕)
159
+ subtitle_pattern = re.compile(
160
+ rf"^{re.escape(video_id)}\.(a\.)?{re.escape(original_lang)}\..+$"
161
+ )
162
 
163
+ # 查找所有可能匹配的文件
164
+ subtitle_files = [
165
+ f for f in os.listdir('.')
166
+ if subtitle_pattern.match(f)
167
+ ]
168
+
169
+ logger.info(f"匹配到的字幕文件: {subtitle_files}")
170
 
171
  if subtitle_files:
172
+ # 优先手动上传的字幕(非自动生成)
173
+ preferred_files = [f for f in subtitle_files if 'a.' not in f]
174
+ selected_file = preferred_files[0] if preferred_files else subtitle_files[0]
175
+
176
+ with open(selected_file, 'r', encoding='utf-8') as f:
177
  content = f.read()
178
+
179
+ # 清理字幕内容
180
+ clean_content = re.sub(
181
+ r'^\d+\n\d{2}:\d{2}:\d{2}[\.,]\d{3} --> \d{2}:\d{2}:\d{2}[\.,]\d{3}\n',
182
+ '',
183
+ content,
184
+ flags=re.MULTILINE
185
+ ).strip()
186
+
187
  return {
188
+ "transcript": clean_content,
189
  "language": original_lang,
190
+ "sub_type": "manual" if 'a.' not in selected_file else "auto-generated"
191
  }
192
+
193
  return {
194
+ "error": "未找到原始语言字幕",
195
+ "available_langs": list(info.get('subtitles', {}).keys())
196
+ + list(info.get('automatic_captions', {}).keys())
197
  }
198
 
199
  except Exception as e:
200
+ return {"error": f"处理失败: {str(e)}"}
201
 
202
 
203