Update app.py
Browse files
app.py
CHANGED
@@ -133,14 +133,13 @@ async def get_transcript(youtube_url: str):
|
|
133 |
ydl_opts = {
|
134 |
'skip_download': True,
|
135 |
'writesubtitles': True,
|
136 |
-
'writeautomaticsub': True,
|
137 |
-
'subtitleslangs': ['
|
138 |
'subtitlesformat': 'best',
|
139 |
'outtmpl': '%(id)s.%(ext)s',
|
140 |
'noplaylist': True,
|
141 |
'cookiefile': "firefox-cookies.txt",
|
142 |
-
'
|
143 |
-
'retries': 3,
|
144 |
}
|
145 |
|
146 |
env_to_cookies_from_env("firefox-cookies.txt")
|
@@ -149,55 +148,43 @@ async def get_transcript(youtube_url: str):
|
|
149 |
info = ydl.extract_info(youtube_url, download=True)
|
150 |
video_id = info['id']
|
151 |
|
152 |
-
#
|
153 |
-
|
154 |
-
if not original_lang:
|
155 |
-
original_lang = info.get('subtitles', {}).get('original', [{}])[0].get('language')
|
156 |
-
original_lang = original_lang or 'en'
|
157 |
-
|
158 |
-
# 扩展匹配模式(包含自动生成字幕)
|
159 |
-
subtitle_pattern = re.compile(
|
160 |
-
rf"^{re.escape(video_id)}\.(a\.)?{re.escape(original_lang)}\..+$"
|
161 |
-
)
|
162 |
|
163 |
-
#
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
|
|
|
|
177 |
content = f.read()
|
178 |
-
|
179 |
-
#
|
180 |
clean_content = re.sub(
|
181 |
-
r'
|
182 |
'',
|
183 |
-
content
|
184 |
-
flags=re.MULTILINE
|
185 |
).strip()
|
186 |
-
|
187 |
return {
|
188 |
"transcript": clean_content,
|
189 |
-
"
|
190 |
-
"sub_type": "manual" if 'a.' not in selected_file else "auto-generated"
|
191 |
}
|
192 |
|
193 |
-
return {
|
194 |
-
"error": "未找到原始语言字幕",
|
195 |
-
"available_langs": list(info.get('subtitles', {}).keys())
|
196 |
-
+ list(info.get('automatic_captions', {}).keys())
|
197 |
-
}
|
198 |
|
199 |
except Exception as e:
|
200 |
-
return {"error":
|
201 |
|
202 |
|
203 |
|
|
|
133 |
ydl_opts = {
|
134 |
'skip_download': True,
|
135 |
'writesubtitles': True,
|
136 |
+
'writeautomaticsub': True,
|
137 |
+
'subtitleslangs': ['all'], # 获取所有字幕
|
138 |
'subtitlesformat': 'best',
|
139 |
'outtmpl': '%(id)s.%(ext)s',
|
140 |
'noplaylist': True,
|
141 |
'cookiefile': "firefox-cookies.txt",
|
142 |
+
'ignoreerrors': True,
|
|
|
143 |
}
|
144 |
|
145 |
env_to_cookies_from_env("firefox-cookies.txt")
|
|
|
148 |
info = ydl.extract_info(youtube_url, download=True)
|
149 |
video_id = info['id']
|
150 |
|
151 |
+
# Merge the manual and automatic subtitle language lists
|
152 |
+
all_subs = list(info.get('subtitles', {}).keys()) + list(info.get('automatic_captions', {}).keys())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
|
154 |
+
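# Keep only plain language codes (drops translated tracks such as en-zh-Hant; note this also drops any other hyphenated code, e.g. pt-BR)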
|
155 |
+
valid_langs = [lang for lang in all_subs if '-' not in lang.split('.')[-1]]
|
156 |
+
|
157 |
+
if not valid_langs:
|
158 |
+
return {"error": "无可用字幕"}
|
159 |
+
|
160 |
+
# Simply take the first valid language
|
161 |
+
target_lang = valid_langs[0]
|
162 |
+
|
163 |
+
# Build the expected subtitle filename
|
164 |
+
subtitle_file = f"{video_id}.{target_lang}.vtt" # 优先vtt格式
|
165 |
+
if not os.path.exists(subtitle_file):
|
166 |
+
subtitle_file = f"{video_id}.{target_lang}.srt"
|
167 |
+
|
168 |
+
if os.path.exists(subtitle_file):
|
169 |
+
with open(subtitle_file, 'r', encoding='utf-8') as f:
|
170 |
content = f.read()
|
171 |
+
|
172 |
+
# Basic cleanup: remove timestamp lines
|
173 |
clean_content = re.sub(
|
174 |
+
r'\d{2}:\d{2}:\d{2}[,.]\d{3}.*?\n',
|
175 |
'',
|
176 |
+
content
|
|
|
177 |
).strip()
|
178 |
+
|
179 |
return {
|
180 |
"transcript": clean_content,
|
181 |
+
"detected_lang": target_lang
|
|
|
182 |
}
|
183 |
|
184 |
+
return {"error": "字幕文件未找到"}
|
|
|
|
|
|
|
|
|
185 |
|
186 |
except Exception as e:
|
187 |
+
return {"error": str(e)}
|
188 |
|
189 |
|
190 |
|