Update app.py
Browse files
app.py
CHANGED
@@ -134,14 +134,20 @@ async def get_transcript(youtube_url: str):
|
|
134 |
'skip_download': True,
|
135 |
'writesubtitles': True,
|
136 |
'writeautomaticsub': True,
|
137 |
-
#
|
138 |
-
'subtitleslangs': [
|
|
|
|
|
|
|
|
|
|
|
139 |
'subtitlesformat': 'best',
|
140 |
'outtmpl': '%(id)s.%(ext)s',
|
141 |
'noplaylist': True,
|
142 |
'cookiefile': "firefox-cookies.txt",
|
143 |
-
|
144 |
-
'
|
|
|
145 |
'ignoreerrors': True,
|
146 |
}
|
147 |
|
@@ -151,56 +157,72 @@ async def get_transcript(youtube_url: str):
|
|
151 |
info = ydl.extract_info(youtube_url, download=True)
|
152 |
video_id = info['id']
|
153 |
|
154 |
-
#
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
|
160 |
-
|
161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
for f in os.listdir('.'):
|
163 |
if f.startswith(video_id):
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
for lang in priority_order
|
177 |
-
if lang in x
|
178 |
-
)
|
179 |
-
)
|
180 |
-
|
181 |
-
if subtitle_files:
|
182 |
-
selected_file = subtitle_files[0]
|
183 |
-
with open(selected_file, 'r', encoding='utf-8') as f:
|
184 |
content = f.read()
|
185 |
|
186 |
-
#
|
187 |
clean_text = re.sub(
|
188 |
-
r'\d{2}:\d{2}:\d{2}[
|
189 |
-
r'
|
|
|
190 |
'',
|
191 |
content,
|
192 |
flags=re.MULTILINE
|
193 |
).strip()
|
194 |
|
|
|
|
|
|
|
195 |
return {
|
196 |
"transcript": clean_text,
|
197 |
-
"detected_lang":
|
|
|
198 |
}
|
199 |
|
200 |
-
return {
|
|
|
|
|
|
|
201 |
|
202 |
except Exception as e:
|
203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
|
205 |
|
206 |
|
|
|
134 |
'skip_download': True,
|
135 |
'writesubtitles': True,
|
136 |
'writeautomaticsub': True,
|
137 |
+
# Explicitly list the Chinese and English subtitle language codes
|
138 |
+
'subtitleslangs': [
|
139 |
+
'zh', 'zh-Hans', 'zh-Hant', 'zh-CN', 'zh-TW', # Chinese
|
140 |
+
'en', 'en-US', 'en-GB', 'en-AU' # English
|
141 |
+
],
|
142 |
+
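# Allow auto-generated subtitles as a fallback.
# NOTE(review): 'subtitleslangs_automatic' does not appear to be a documented yt-dlp option; 'writeautomaticsub' above already requests automatic captions — confirm this key has any effect.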
|
143 |
+
'subtitleslangs_automatic': True,
|
144 |
'subtitlesformat': 'best',
|
145 |
'outtmpl': '%(id)s.%(ext)s',
|
146 |
'noplaylist': True,
|
147 |
'cookiefile': "firefox-cookies.txt",
|
148 |
+
# Network request tuning
|
149 |
+
'retries': 5,
|
150 |
+
'sleep_interval': 15,
|
151 |
'ignoreerrors': True,
|
152 |
}
|
153 |
|
|
|
157 |
info = ydl.extract_info(youtube_url, download=True)
|
158 |
video_id = info['id']
|
159 |
|
160 |
+
# Collect the subtitle languages actually available for this video
|
161 |
+
available_langs = []
|
162 |
+
for sub_type in ['subtitles', 'automatic_captions']:
|
163 |
+
subs = info.get(sub_type, {})
|
164 |
+
available_langs.extend(subs.keys())
|
165 |
|
166 |
+
logger.info(f"Available subtitle languages: {list(set(available_langs))}")
|
167 |
+
|
168 |
+
# Build the priority map (Chinese > English > other)
|
169 |
+
priority_map = {
|
170 |
+
'zh': 0, 'zh-Hans': 1, 'zh-Hant': 2, 'zh-CN': 3, 'zh-TW': 4,
|
171 |
+
'en': 5, 'en-US': 6, 'en-GB': 7, 'en-AU': 8
|
172 |
+
}
|
173 |
+
|
174 |
+
# Find the best-matching subtitle file
|
175 |
+
best_sub = None
|
176 |
for f in os.listdir('.'):
|
177 |
if f.startswith(video_id):
|
178 |
+
parts = f.split('.')
|
179 |
+
if len(parts) >= 3:
|
180 |
+
lang_code = parts[-2]
|
181 |
+
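# Reduce compound language codes (e.g. en-US) to their base language.
# NOTE(review): the lookup below uses only the base language, so the compound keys in priority_map ('zh-Hans', 'en-US', ...) are never matched.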
|
182 |
+
base_lang = lang_code.split('-')[0]
|
183 |
+
priority = priority_map.get(base_lang, 99)
|
184 |
+
if not best_sub or priority < best_sub[1]:
|
185 |
+
best_sub = (f, priority)
|
186 |
+
|
187 |
+
if best_sub:
|
188 |
+
subtitle_file = best_sub[0]
|
189 |
+
with open(subtitle_file, 'r', encoding='utf-8') as f:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
content = f.read()
|
191 |
|
192 |
+
# Enhanced text cleanup
|
193 |
clean_text = re.sub(
|
194 |
+
r'\d{2}:\d{2}:\d{2}[,.]\d{3}.*?(\n|$)|' # timestamp lines
|
195 |
+
r'<[^>]+>|{[^}]+}|^\s*WEBVTT\s*$|' # HTML tags, {...} style blocks, WEBVTT header line
|
196 |
+
r'^\d+\s*$', # cue sequence-number lines
|
197 |
'',
|
198 |
content,
|
199 |
flags=re.MULTILINE
|
200 |
).strip()
|
201 |
|
202 |
+
# Collapse runs of three or more newlines into a single blank line
|
203 |
+
clean_text = re.sub(r'\n{3,}', '\n\n', clean_text)
|
204 |
+
|
205 |
return {
|
206 |
"transcript": clean_text,
|
207 |
+
"detected_lang": subtitle_file.split('.')[-2],
|
208 |
+
"available_languages": available_langs
|
209 |
}
|
210 |
|
211 |
+
return {
|
212 |
+
"error": "No subtitles available",
|
213 |
+
"available_languages": available_langs
|
214 |
+
}
|
215 |
|
216 |
except Exception as e:
|
217 |
+
logger.error(f"Error: {str(e)}")
|
218 |
+
return {
|
219 |
+
"error": f"Processing failed: {str(e)}",
|
220 |
+
"advice": [
|
221 |
+
"Try adding '&tlang=en' to URL for translated subs",
|
222 |
+
"Check if cookies are still valid",
|
223 |
+
"Reduce request frequency if seeing 429 errors"
|
224 |
+
]
|
225 |
+
}
|
226 |
|
227 |
|
228 |
|