Update app.py
Browse files
app.py
CHANGED
@@ -132,23 +132,16 @@ async def get_transcript(youtube_url: str):
|
|
132 |
try:
|
133 |
ydl_opts = {
|
134 |
'skip_download': True,
|
135 |
-
'writesubtitles': True,
|
136 |
-
'writeautomaticsub':
|
137 |
-
#
|
138 |
-
'subtitleslangs': [
|
139 |
-
'zh', 'zh-Hans', 'zh-Hant', 'zh-CN', 'zh-TW', # 中文
|
140 |
-
'en', 'en-US', 'en-GB', 'en-AU' # 英文
|
141 |
-
],
|
142 |
-
# 允许自动生成字幕作为后备
|
143 |
-
'subtitleslangs_automatic': True,
|
144 |
'subtitlesformat': 'best',
|
145 |
'outtmpl': '%(id)s.%(ext)s',
|
146 |
'noplaylist': True,
|
147 |
'cookiefile': "firefox-cookies.txt",
|
148 |
-
#
|
149 |
-
'
|
150 |
-
'
|
151 |
-
'ignoreerrors': True,
|
152 |
}
|
153 |
|
154 |
env_to_cookies_from_env("firefox-cookies.txt")
|
@@ -157,72 +150,35 @@ async def get_transcript(youtube_url: str):
|
|
157 |
info = ydl.extract_info(youtube_url, download=True)
|
158 |
video_id = info['id']
|
159 |
|
160 |
-
#
|
161 |
-
|
162 |
-
for sub_type in ['subtitles', 'automatic_captions']:
|
163 |
-
subs = info.get(sub_type, {})
|
164 |
-
available_langs.extend(subs.keys())
|
165 |
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
'zh': 0, 'zh-Hans': 1, 'zh-Hant': 2, 'zh-CN': 3, 'zh-TW': 4,
|
171 |
-
'en': 5, 'en-US': 6, 'en-GB': 7, 'en-AU': 8
|
172 |
-
}
|
173 |
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
if len(parts) >= 3:
|
180 |
-
lang_code = parts[-2]
|
181 |
-
# 处理复合语言代码(如 en-US)
|
182 |
-
base_lang = lang_code.split('-')[0]
|
183 |
-
priority = priority_map.get(base_lang, 99)
|
184 |
-
if not best_sub or priority < best_sub[1]:
|
185 |
-
best_sub = (f, priority)
|
186 |
-
|
187 |
-
if best_sub:
|
188 |
-
subtitle_file = best_sub[0]
|
189 |
with open(subtitle_file, 'r', encoding='utf-8') as f:
|
190 |
content = f.read()
|
191 |
-
|
192 |
-
# 增强型文本清理
|
193 |
-
clean_text = re.sub(
|
194 |
-
r'\d{2}:\d{2}:\d{2}[,.]\d{3}.*?(\n|$)|' # 时间戳
|
195 |
-
r'<[^>]+>|{[^}]+}|^\s*WEBVTT\s*$|' # HTML/样式标签
|
196 |
-
r'^\d+\s*$', # 序号行
|
197 |
-
'',
|
198 |
-
content,
|
199 |
-
flags=re.MULTILINE
|
200 |
-
).strip()
|
201 |
-
|
202 |
-
# 合并重复的空行
|
203 |
-
clean_text = re.sub(r'\n{3,}', '\n\n', clean_text)
|
204 |
-
|
205 |
return {
|
206 |
-
"transcript":
|
207 |
-
"
|
208 |
-
"
|
209 |
}
|
210 |
-
|
211 |
return {
|
212 |
-
"error": "
|
213 |
-
"
|
214 |
}
|
215 |
|
216 |
except Exception as e:
|
217 |
-
|
218 |
-
return {
|
219 |
-
"error": f"Processing failed: {str(e)}",
|
220 |
-
"advice": [
|
221 |
-
"Try adding '&tlang=en' to URL for translated subs",
|
222 |
-
"Check if cookies are still valid",
|
223 |
-
"Reduce request frequency if seeing 429 errors"
|
224 |
-
]
|
225 |
-
}
|
226 |
|
227 |
|
228 |
|
|
|
132 |
try:
|
133 |
ydl_opts = {
|
134 |
'skip_download': True,
|
135 |
+
'writesubtitles': True, # 仅下载手动上传的字幕
|
136 |
+
'writeautomaticsub': False, # 禁用自动生成字幕
|
137 |
+
'subtitleslangs': ['original'], # 关键参数:只获取原始语言
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
'subtitlesformat': 'best',
|
139 |
'outtmpl': '%(id)s.%(ext)s',
|
140 |
'noplaylist': True,
|
141 |
'cookiefile': "firefox-cookies.txt",
|
142 |
+
# 禁用所有翻译功能
|
143 |
+
'postprocessors': [],
|
144 |
+
'compat_opts': {'no-sub-translate'},
|
|
|
145 |
}
|
146 |
|
147 |
env_to_cookies_from_env("firefox-cookies.txt")
|
|
|
150 |
info = ydl.extract_info(youtube_url, download=True)
|
151 |
video_id = info['id']
|
152 |
|
153 |
+
# 获取原始语言代码(如视频是英语则返回en)
|
154 |
+
original_lang = info.get('original_lang') or 'en'
|
|
|
|
|
|
|
155 |
|
156 |
+
# 查找匹配原始语言的字幕文件
|
157 |
+
subtitle_pattern = f"{video_id}.{original_lang}.*"
|
158 |
+
subtitle_files = [f for f in os.listdir('.')
|
159 |
+
if re.match(subtitle_pattern, f)]
|
|
|
|
|
|
|
160 |
|
161 |
+
logger.info(f"原始语言字幕文件: {subtitle_files}")
|
162 |
+
|
163 |
+
if subtitle_files:
|
164 |
+
# 选择第一个匹配的文件
|
165 |
+
subtitle_file = subtitle_files[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
with open(subtitle_file, 'r', encoding='utf-8') as f:
|
167 |
content = f.read()
|
168 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
return {
|
170 |
+
"transcript": content,
|
171 |
+
"language": original_lang,
|
172 |
+
"is_translated": False
|
173 |
}
|
174 |
+
|
175 |
return {
|
176 |
+
"error": "无原始语言字幕",
|
177 |
+
"available_langs": list(info.get('subtitles', {}).keys())
|
178 |
}
|
179 |
|
180 |
except Exception as e:
|
181 |
+
return {"error": str(e)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
|
183 |
|
184 |
|