Update app.py
Browse files
app.py
CHANGED
@@ -132,59 +132,73 @@ async def get_transcript(youtube_url: str):
|
|
132 |
try:
|
133 |
ydl_opts = {
|
134 |
'skip_download': True,
|
135 |
-
'writesubtitles': True,
|
136 |
-
'writeautomaticsub':
|
137 |
-
'subtitleslangs': ['
|
138 |
'subtitlesformat': 'best',
|
139 |
'outtmpl': '%(id)s.%(ext)s',
|
140 |
'noplaylist': True,
|
141 |
'cookiefile': "firefox-cookies.txt",
|
142 |
'ignoreerrors': True,
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
}
|
144 |
|
145 |
env_to_cookies_from_env("firefox-cookies.txt")
|
146 |
|
147 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
148 |
-
|
|
|
149 |
video_id = info['id']
|
150 |
|
151 |
-
#
|
152 |
-
|
|
|
|
|
|
|
|
|
|
|
153 |
|
154 |
-
#
|
155 |
-
|
156 |
|
157 |
-
if not
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
subtitle_file = f"{video_id}.{target_lang}.vtt" # 优先vtt格式
|
165 |
-
if not os.path.exists(subtitle_file):
|
166 |
-
subtitle_file = f"{video_id}.{target_lang}.srt"
|
167 |
|
168 |
-
if
|
169 |
-
with open(
|
170 |
content = f.read()
|
171 |
|
172 |
-
#
|
173 |
clean_content = re.sub(
|
174 |
-
r'\d{2}:\d{2}:\d{2}[
|
175 |
-
'',
|
176 |
-
content
|
|
|
177 |
).strip()
|
178 |
|
179 |
return {
|
180 |
"transcript": clean_content,
|
181 |
-
"
|
|
|
182 |
}
|
183 |
-
|
184 |
-
return {
|
|
|
|
|
|
|
185 |
|
186 |
except Exception as e:
|
187 |
-
return {"error": str(e)}
|
188 |
|
189 |
|
190 |
|
|
|
132 |
try:
|
133 |
ydl_opts = {
|
134 |
'skip_download': True,
|
135 |
+
'writesubtitles': True, # 仅下载手动上传的字幕
|
136 |
+
'writeautomaticsub': False, # 禁用自动生成字幕
|
137 |
+
'subtitleslangs': ['orig'], # 关键参数:只获取原始语言
|
138 |
'subtitlesformat': 'best',
|
139 |
'outtmpl': '%(id)s.%(ext)s',
|
140 |
'noplaylist': True,
|
141 |
'cookiefile': "firefox-cookies.txt",
|
142 |
'ignoreerrors': True,
|
143 |
+
# 精准控制参数
|
144 |
+
'postprocessors': [],
|
145 |
+
'compat_opts': [
|
146 |
+
'no-youtube-unavailable-videos',
|
147 |
+
'no-sub-translate'
|
148 |
+
],
|
149 |
}
|
150 |
|
151 |
env_to_cookies_from_env("firefox-cookies.txt")
|
152 |
|
153 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
154 |
+
# 精确控制下载行为
|
155 |
+
info = ydl.extract_info(youtube_url, download=False) # 先获取元数据
|
156 |
video_id = info['id']
|
157 |
|
158 |
+
# 直接构建原始语言字幕文件名
|
159 |
+
original_lang = info.get('original_lang') or 'en'
|
160 |
+
expected_files = [
|
161 |
+
f"{video_id}.{original_lang}.vtt",
|
162 |
+
f"{video_id}.{original_lang}.srt",
|
163 |
+
f"{video_id}.{original_lang}.json3"
|
164 |
+
]
|
165 |
|
166 |
+
# 检查本地是否已有字幕文件
|
167 |
+
existing_subs = [f for f in expected_files if os.path.exists(f)]
|
168 |
|
169 |
+
if not existing_subs:
|
170 |
+
# 精准下载单个字幕文件
|
171 |
+
ydl.params.update({'writesubtitles': True})
|
172 |
+
ydl.download([youtube_url])
|
173 |
+
|
174 |
+
# 重新检查下载结果
|
175 |
+
existing_subs = [f for f in expected_files if os.path.exists(f)]
|
|
|
|
|
|
|
176 |
|
177 |
+
if existing_subs:
|
178 |
+
with open(existing_subs[0], 'r', encoding='utf-8') as f:
|
179 |
content = f.read()
|
180 |
|
181 |
+
# 基础清理(保持原格式)
|
182 |
clean_content = re.sub(
|
183 |
+
r'^\d+\n\d{2}:\d{2}:\d{2}[\.,]\d{3} --> \d{2}:\d{2}:\d{2}[\.,]\d{3}\n',
|
184 |
+
'',
|
185 |
+
content,
|
186 |
+
flags=re.MULTILINE
|
187 |
).strip()
|
188 |
|
189 |
return {
|
190 |
"transcript": clean_content,
|
191 |
+
"language": original_lang,
|
192 |
+
"source": "original"
|
193 |
}
|
194 |
+
|
195 |
+
return {
|
196 |
+
"error": "无原始语言字幕",
|
197 |
+
"available": bool(info.get('subtitles'))
|
198 |
+
}
|
199 |
|
200 |
except Exception as e:
|
201 |
+
return {"error": f"处理失败: {str(e)}"}
|
202 |
|
203 |
|
204 |
|