Chrunos committed on
Commit
3913791
·
verified ·
1 Parent(s): 220037e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -42
app.py CHANGED
@@ -133,14 +133,13 @@ async def get_transcript(youtube_url: str):
133
  ydl_opts = {
134
  'skip_download': True,
135
  'writesubtitles': True,
136
- 'writeautomaticsub': True, # 启用自动字幕作为备选
137
- 'subtitleslangs': ['original'], # 优先原始语言
138
  'subtitlesformat': 'best',
139
  'outtmpl': '%(id)s.%(ext)s',
140
  'noplaylist': True,
141
  'cookiefile': "firefox-cookies.txt",
142
- 'compat_opts': {'no-sub-translate'}, # 保持原始语言
143
- 'retries': 3,
144
  }
145
 
146
  env_to_cookies_from_env("firefox-cookies.txt")
@@ -149,55 +148,43 @@ async def get_transcript(youtube_url: str):
149
  info = ydl.extract_info(youtube_url, download=True)
150
  video_id = info['id']
151
 
152
- # 精确获取原始语言(修复YouTube API返回不一致问题)
153
- original_lang = info.get('original_lang')
154
- if not original_lang:
155
- original_lang = info.get('subtitles', {}).get('original', [{}])[0].get('language')
156
- original_lang = original_lang or 'en'
157
-
158
- # 扩展匹配模式(包含自动生成字幕)
159
- subtitle_pattern = re.compile(
160
- rf"^{re.escape(video_id)}\.(a\.)?{re.escape(original_lang)}\..+$"
161
- )
162
 
163
- # 查找所有可能匹配的文件
164
- subtitle_files = [
165
- f for f in os.listdir('.')
166
- if subtitle_pattern.match(f)
167
- ]
168
-
169
- logger.info(f"匹配到的字幕文件: {subtitle_files}")
170
-
171
- if subtitle_files:
172
- # 优先手动上传的字幕(非自动生成)
173
- preferred_files = [f for f in subtitle_files if 'a.' not in f]
174
- selected_file = preferred_files[0] if preferred_files else subtitle_files[0]
175
-
176
- with open(selected_file, 'r', encoding='utf-8') as f:
 
 
177
  content = f.read()
178
-
179
- # 清理字幕内容
180
  clean_content = re.sub(
181
- r'^\d+\n\d{2}:\d{2}:\d{2}[\.,]\d{3} --> \d{2}:\d{2}:\d{2}[\.,]\d{3}\n',
182
  '',
183
- content,
184
- flags=re.MULTILINE
185
  ).strip()
186
-
187
  return {
188
  "transcript": clean_content,
189
- "language": original_lang,
190
- "sub_type": "manual" if 'a.' not in selected_file else "auto-generated"
191
  }
192
 
193
- return {
194
- "error": "未找到原始语言字幕",
195
- "available_langs": list(info.get('subtitles', {}).keys())
196
- + list(info.get('automatic_captions', {}).keys())
197
- }
198
 
199
  except Exception as e:
200
- return {"error": f"处理失败: {str(e)}"}
201
 
202
 
203
 
 
133
  ydl_opts = {
134
  'skip_download': True,
135
  'writesubtitles': True,
136
+ 'writeautomaticsub': True,
137
+ 'subtitleslangs': ['all'], # 获取所有字幕
138
  'subtitlesformat': 'best',
139
  'outtmpl': '%(id)s.%(ext)s',
140
  'noplaylist': True,
141
  'cookiefile': "firefox-cookies.txt",
142
+ 'ignoreerrors': True,
 
143
  }
144
 
145
  env_to_cookies_from_env("firefox-cookies.txt")
 
148
  info = ydl.extract_info(youtube_url, download=True)
149
  video_id = info['id']
150
 
151
+ # 合并手动和自动字幕列表
152
+ all_subs = list(info.get('subtitles', {}).keys()) + list(info.get('automatic_captions', {}).keys())
 
 
 
 
 
 
 
 
153
 
154
+ # 过滤有效语言代码(排除翻译字幕如en-zh-Hant)
155
+ valid_langs = [lang for lang in all_subs if '-' not in lang.split('.')[-1]]
156
+
157
+ if not valid_langs:
158
+ return {"error": "无可用字幕"}
159
+
160
+ # 直接取第一个有效语言
161
+ target_lang = valid_langs[0]
162
+
163
+ # 构建字幕文件名模式
164
+ subtitle_file = f"{video_id}.{target_lang}.vtt" # 优先vtt格式
165
+ if not os.path.exists(subtitle_file):
166
+ subtitle_file = f"{video_id}.{target_lang}.srt"
167
+
168
+ if os.path.exists(subtitle_file):
169
+ with open(subtitle_file, 'r', encoding='utf-8') as f:
170
  content = f.read()
171
+
172
+ # 基础清理
173
  clean_content = re.sub(
174
+ r'\d{2}:\d{2}:\d{2}[,.]\d{3}.*?\n',
175
  '',
176
+ content
 
177
  ).strip()
178
+
179
  return {
180
  "transcript": clean_content,
181
+ "detected_lang": target_lang
 
182
  }
183
 
184
+ return {"error": "字幕文件未找到"}
 
 
 
 
185
 
186
  except Exception as e:
187
+ return {"error": str(e)}
188
 
189
 
190