Chrunos committed on
Commit
3d3b80f
·
verified ·
1 Parent(s): 942ffc3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -66
app.py CHANGED
@@ -134,16 +134,15 @@ async def get_transcript(youtube_url: str):
134
  'skip_download': True,
135
  'writesubtitles': True,
136
  'writeautomaticsub': True,
137
- # 扩展支持的语言列表(包含中英文)
138
- 'subtitleslangs': ['en', 'zh', 'zh-Hans', 'zh-Hant', 'zh-CN', 'zh-TW', 'a.*'],
139
- # 自动选择最佳字幕格式
140
  'subtitlesformat': 'best',
141
- # 包含原始语言代码在文件名中
142
- 'outtmpl': '%(id)s.%(ext)s',
143
  'noplaylist': True,
144
  'cookiefile': "firefox-cookies.txt",
145
- # 添加中文字幕兼容性参数
146
- 'compat_opts': {'no-youtube-unavailable-videos'},
 
147
  }
148
 
149
  env_to_cookies_from_env("firefox-cookies.txt")
@@ -152,79 +151,56 @@ async def get_transcript(youtube_url: str):
152
  info = ydl.extract_info(youtube_url, download=True)
153
  video_id = info['id']
154
 
155
- # 获取实际可用的字幕语言列表
156
- available_subs = []
157
- for sub_type in ['subtitles', 'automatic_captions']:
158
- subs = info.get(sub_type, {})
159
- available_subs.extend([
160
- f"{lang}.{track[0]['ext']}"
161
- for lang, track in subs.items()
162
- ])
163
 
164
- # 优先顺序:中文 > 英文 > 其他语言
165
- priority_langs = ['zh', 'en']
 
 
 
 
 
 
 
 
 
 
166
  subtitle_files = sorted(
167
- [f for f in os.listdir('.') if f.startswith(video_id)],
168
- key=lambda x: (
169
- -max([x.find(lang) for lang in priority_langs]),
170
- len(x)
 
171
  )
172
  )
173
 
174
- logger.info(f"Detected subtitle files: {subtitle_files}")
175
-
176
  if subtitle_files:
177
- subtitle_file = subtitle_files[0]
178
- logger.info(f"Selected subtitle: {subtitle_file}")
179
-
180
- with open(subtitle_file, 'r', encoding='utf-8') as f:
181
  content = f.read()
182
 
183
- # 统一处理不同字幕格式
184
- if subtitle_file.endswith('.json3'):
185
- subs = json.loads(content)
186
- text = ''.join(
187
- e['segs'][0]['utf8']
188
- for e in subs['events']
189
- if e.get('segs')
190
- )
191
- elif subtitle_file.endswith(('.vtt', '.srt')):
192
- text = re.sub(
193
- r'\d{2}:\d{2}:\d{2}[\.,]\d{3}.*?\n',
194
- '',
195
- content
196
- )
197
- text = re.sub(r'<.*?>|{.*?}', '', text)
198
- else:
199
- text = content # 原始内容
200
-
201
- # 中文标点标准化
202
- text = text.translate(str.maketrans({
203
- ',': ',',
204
- '。': '.',
205
- ';': ';',
206
- '!': '!',
207
- '?': '?',
208
- '“': '"',
209
- '”': '"',
210
- '‘': "'",
211
- '’': "'"
212
- }))
213
 
214
  return {
215
- "transcript": text,
216
- "detected_language": subtitle_file.split('.')[-2],
217
- "available_languages": available_subs
218
  }
219
 
220
- return {
221
- "error": "No subtitles found",
222
- "available_languages": available_subs
223
- }
224
 
225
  except Exception as e:
226
- logger.error(f"Error: {str(e)}", exc_info=True)
227
- raise HTTPException(status_code=500, detail=str(e))
228
 
229
 
230
 
 
134
  'skip_download': True,
135
  'writesubtitles': True,
136
  'writeautomaticsub': True,
137
+ # 同时支持中英文及变体
138
+ 'subtitleslangs': ['zh*', 'en*'], # 包含所有中文和英语变体
 
139
  'subtitlesformat': 'best',
140
+ 'outtmpl': '%(id)s.%(ext)s',
 
141
  'noplaylist': True,
142
  'cookiefile': "firefox-cookies.txt",
143
+ 'retries': 3,
144
+ 'sleep_interval': 10, # 增加请求间隔
145
+ 'ignoreerrors': True,
146
  }
147
 
148
  env_to_cookies_from_env("firefox-cookies.txt")
 
151
  info = ydl.extract_info(youtube_url, download=True)
152
  video_id = info['id']
153
 
154
+ # 语言优先级列表(中文 > 英文 > 其他)
155
+ priority_order = [
156
+ 'zh', 'zh-Hans', 'zh-Hant', 'zh-CN', 'zh-TW', # 中文
157
+ 'en', 'en-US', 'en-GB', # 英文
158
+ ]
 
 
 
159
 
160
+ # 获取所有字幕文件并按优先级排序
161
+ subtitle_files = []
162
+ for f in os.listdir('.'):
163
+ if f.startswith(video_id):
164
+ lang_part = f.split('.')[-2]
165
+ # 检查是否包含优先语言代码
166
+ for lang in priority_order:
167
+ if lang in lang_part:
168
+ subtitle_files.append(f)
169
+ break
170
+
171
+ # 按优先级排序文件
172
  subtitle_files = sorted(
173
+ subtitle_files,
174
+ key=lambda x: min(
175
+ priority_order.index(lang)
176
+ for lang in priority_order
177
+ if lang in x
178
  )
179
  )
180
 
 
 
181
  if subtitle_files:
182
+ selected_file = subtitle_files[0]
183
+ with open(selected_file, 'r', encoding='utf-8') as f:
 
 
184
  content = f.read()
185
 
186
+ # 统一清理逻辑
187
+ clean_text = re.sub(
188
+ r'\d{2}:\d{2}:\d{2}[\.\,]\d{3}.*?(\n|$)|'
189
+ r'<.*?>|{.*?}|^WEBVTT$',
190
+ '',
191
+ content,
192
+ flags=re.MULTILINE
193
+ ).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
  return {
196
+ "transcript": clean_text,
197
+ "detected_lang": selected_file.split('.')[-2]
 
198
  }
199
 
200
+ return {"error": "No subtitles found"}
 
 
 
201
 
202
  except Exception as e:
203
+ return {"error": str(e)}
 
204
 
205
 
206