Chrunos committed on
Commit
888ac00
·
verified ·
1 Parent(s): 3913791

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -28
app.py CHANGED
@@ -132,59 +132,73 @@ async def get_transcript(youtube_url: str):
132
  try:
133
  ydl_opts = {
134
  'skip_download': True,
135
- 'writesubtitles': True,
136
- 'writeautomaticsub': True,
137
- 'subtitleslangs': ['all'], # 获取所有字幕
138
  'subtitlesformat': 'best',
139
  'outtmpl': '%(id)s.%(ext)s',
140
  'noplaylist': True,
141
  'cookiefile': "firefox-cookies.txt",
142
  'ignoreerrors': True,
 
 
 
 
 
 
143
  }
144
 
145
  env_to_cookies_from_env("firefox-cookies.txt")
146
 
147
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
148
- info = ydl.extract_info(youtube_url, download=True)
 
149
  video_id = info['id']
150
 
151
- # 合并手动和自动字幕列表
152
- all_subs = list(info.get('subtitles', {}).keys()) + list(info.get('automatic_captions', {}).keys())
 
 
 
 
 
153
 
154
- # 过滤有效语言代码(排除翻译字幕如en-zh-Hant)
155
- valid_langs = [lang for lang in all_subs if '-' not in lang.split('.')[-1]]
156
 
157
- if not valid_langs:
158
- return {"error": "无可用字幕"}
159
-
160
- # 直接取第一个有效语言
161
- target_lang = valid_langs[0]
162
-
163
- # 构建字幕文件名模式
164
- subtitle_file = f"{video_id}.{target_lang}.vtt" # 优先vtt格式
165
- if not os.path.exists(subtitle_file):
166
- subtitle_file = f"{video_id}.{target_lang}.srt"
167
 
168
- if os.path.exists(subtitle_file):
169
- with open(subtitle_file, 'r', encoding='utf-8') as f:
170
  content = f.read()
171
 
172
- # 基础清理
173
  clean_content = re.sub(
174
- r'\d{2}:\d{2}:\d{2}[,.]\d{3}.*?\n',
175
- '',
176
- content
 
177
  ).strip()
178
 
179
  return {
180
  "transcript": clean_content,
181
- "detected_lang": target_lang
 
182
  }
183
-
184
- return {"error": "字幕文件未找到"}
 
 
 
185
 
186
  except Exception as e:
187
- return {"error": str(e)}
188
 
189
 
190
 
 
132
  try:
133
  ydl_opts = {
134
  'skip_download': True,
135
+ 'writesubtitles': True, # 仅下载手动上传的字幕
136
+ 'writeautomaticsub': False, # 禁用自动生成字幕
137
+ 'subtitleslangs': ['orig'], # 关键参数:只获取原始语言
138
  'subtitlesformat': 'best',
139
  'outtmpl': '%(id)s.%(ext)s',
140
  'noplaylist': True,
141
  'cookiefile': "firefox-cookies.txt",
142
  'ignoreerrors': True,
143
+ # 精准控制参数
144
+ 'postprocessors': [],
145
+ 'compat_opts': [
146
+ 'no-youtube-unavailable-videos',
147
+ 'no-sub-translate'
148
+ ],
149
  }
150
 
151
  env_to_cookies_from_env("firefox-cookies.txt")
152
 
153
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
154
+ # 精确控制下载行为
155
+ info = ydl.extract_info(youtube_url, download=False) # 先获取元数据
156
  video_id = info['id']
157
 
158
+ # 直接构建原始语言字幕文件名
159
+ original_lang = info.get('original_lang') or 'en'
160
+ expected_files = [
161
+ f"{video_id}.{original_lang}.vtt",
162
+ f"{video_id}.{original_lang}.srt",
163
+ f"{video_id}.{original_lang}.json3"
164
+ ]
165
 
166
+ # 检查本地是否已有字幕文件
167
+ existing_subs = [f for f in expected_files if os.path.exists(f)]
168
 
169
+ if not existing_subs:
170
+ # 精准下载单个字幕文件
171
+ ydl.params.update({'writesubtitles': True})
172
+ ydl.download([youtube_url])
173
+
174
+ # 重新检查下载结果
175
+ existing_subs = [f for f in expected_files if os.path.exists(f)]
 
 
 
176
 
177
+ if existing_subs:
178
+ with open(existing_subs[0], 'r', encoding='utf-8') as f:
179
  content = f.read()
180
 
181
+ # 基础清理(保持原格式)
182
  clean_content = re.sub(
183
+ r'^\d+\n\d{2}:\d{2}:\d{2}[\.,]\d{3} --> \d{2}:\d{2}:\d{2}[\.,]\d{3}\n',
184
+ '',
185
+ content,
186
+ flags=re.MULTILINE
187
  ).strip()
188
 
189
  return {
190
  "transcript": clean_content,
191
+ "language": original_lang,
192
+ "source": "original"
193
  }
194
+
195
+ return {
196
+ "error": "无原始语言字幕",
197
+ "available": bool(info.get('subtitles'))
198
+ }
199
 
200
  except Exception as e:
201
+ return {"error": f"处理失败: {str(e)}"}
202
 
203
 
204