Update app.py
Browse files
app.py
CHANGED
@@ -134,16 +134,15 @@ async def get_transcript(youtube_url: str):
|
|
134 |
'skip_download': True,
|
135 |
'writesubtitles': True,
|
136 |
'writeautomaticsub': True,
|
137 |
-
#
|
138 |
-
'subtitleslangs': ['
|
139 |
-
# Automatically select the best subtitle format
|
140 |
'subtitlesformat': 'best',
|
141 |
-
|
142 |
-
'outtmpl': '%(id)s.%(ext)s',
|
143 |
'noplaylist': True,
|
144 |
'cookiefile': "firefox-cookies.txt",
|
145 |
-
|
146 |
-
'
|
|
|
147 |
}
|
148 |
|
149 |
env_to_cookies_from_env("firefox-cookies.txt")
|
@@ -152,79 +151,56 @@ async def get_transcript(youtube_url: str):
|
|
152 |
info = ydl.extract_info(youtube_url, download=True)
|
153 |
video_id = info['id']
|
154 |
|
155 |
-
#
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
f"{lang}.{track[0]['ext']}"
|
161 |
-
for lang, track in subs.items()
|
162 |
-
])
|
163 |
|
164 |
-
#
|
165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
subtitle_files = sorted(
|
167 |
-
|
168 |
-
key=lambda x: (
|
169 |
-
|
170 |
-
|
|
|
171 |
)
|
172 |
)
|
173 |
|
174 |
-
logger.info(f"Detected subtitle files: {subtitle_files}")
|
175 |
-
|
176 |
if subtitle_files:
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
with open(subtitle_file, 'r', encoding='utf-8') as f:
|
181 |
content = f.read()
|
182 |
|
183 |
-
#
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
elif subtitle_file.endswith(('.vtt', '.srt')):
|
192 |
-
text = re.sub(
|
193 |
-
r'\d{2}:\d{2}:\d{2}[\.,]\d{3}.*?\n',
|
194 |
-
'',
|
195 |
-
content
|
196 |
-
)
|
197 |
-
text = re.sub(r'<.*?>|{.*?}', '', text)
|
198 |
-
else:
|
199 |
-
text = content # 原始内容
|
200 |
-
|
201 |
-
# Normalize Chinese punctuation to ASCII equivalents
|
202 |
-
text = text.translate(str.maketrans({
|
203 |
-
',': ',',
|
204 |
-
'。': '.',
|
205 |
-
';': ';',
|
206 |
-
'!': '!',
|
207 |
-
'?': '?',
|
208 |
-
'“': '"',
|
209 |
-
'”': '"',
|
210 |
-
'‘': "'",
|
211 |
-
'’': "'"
|
212 |
-
}))
|
213 |
|
214 |
return {
|
215 |
-
"transcript":
|
216 |
-
"
|
217 |
-
"available_languages": available_subs
|
218 |
}
|
219 |
|
220 |
-
return {
|
221 |
-
"error": "No subtitles found",
|
222 |
-
"available_languages": available_subs
|
223 |
-
}
|
224 |
|
225 |
except Exception as e:
|
226 |
-
|
227 |
-
raise HTTPException(status_code=500, detail=str(e))
|
228 |
|
229 |
|
230 |
|
|
|
134 |
'skip_download': True,
|
135 |
'writesubtitles': True,
|
136 |
'writeautomaticsub': True,
|
137 |
+
# Support both Chinese and English, including their variants
|
138 |
+
'subtitleslangs': ['zh*', 'en*'], # 包含所有中文和英语变体
|
|
|
139 |
'subtitlesformat': 'best',
|
140 |
+
'outtmpl': '%(id)s.%(ext)s',
|
|
|
141 |
'noplaylist': True,
|
142 |
'cookiefile': "firefox-cookies.txt",
|
143 |
+
'retries': 3,
|
144 |
+
'sleep_interval': 10, # 增加请求间隔
|
145 |
+
'ignoreerrors': True,
|
146 |
}
|
147 |
|
148 |
env_to_cookies_from_env("firefox-cookies.txt")
|
|
|
151 |
info = ydl.extract_info(youtube_url, download=True)
|
152 |
video_id = info['id']
|
153 |
|
154 |
+
# Language priority list (Chinese > English); other languages are not selected
|
155 |
+
priority_order = [
|
156 |
+
'zh', 'zh-Hans', 'zh-Hant', 'zh-CN', 'zh-TW', # 中文
|
157 |
+
'en', 'en-US', 'en-GB', # 英文
|
158 |
+
]
|
|
|
|
|
|
|
159 |
|
160 |
+
# Collect all subtitle files and sort them by priority
|
161 |
+
subtitle_files = []
|
162 |
+
for f in os.listdir('.'):
|
163 |
+
if f.startswith(video_id):
|
164 |
+
lang_part = f.split('.')[-2]
|
165 |
+
# Check whether the language part contains a prioritized language code
|
166 |
+
for lang in priority_order:
|
167 |
+
if lang in lang_part:
|
168 |
+
subtitle_files.append(f)
|
169 |
+
break
|
170 |
+
|
171 |
+
# Sort the files by priority
|
172 |
subtitle_files = sorted(
|
173 |
+
subtitle_files,
|
174 |
+
key=lambda x: min(
|
175 |
+
priority_order.index(lang)
|
176 |
+
for lang in priority_order
|
177 |
+
if lang in x
|
178 |
)
|
179 |
)
|
180 |
|
|
|
|
|
181 |
if subtitle_files:
|
182 |
+
selected_file = subtitle_files[0]
|
183 |
+
with open(selected_file, 'r', encoding='utf-8') as f:
|
|
|
|
|
184 |
content = f.read()
|
185 |
|
186 |
+
# Unified cleanup logic
|
187 |
+
clean_text = re.sub(
|
188 |
+
r'\d{2}:\d{2}:\d{2}[\.\,]\d{3}.*?(\n|$)|'
|
189 |
+
r'<.*?>|{.*?}|^WEBVTT$',
|
190 |
+
'',
|
191 |
+
content,
|
192 |
+
flags=re.MULTILINE
|
193 |
+
).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
|
195 |
return {
|
196 |
+
"transcript": clean_text,
|
197 |
+
"detected_lang": selected_file.split('.')[-2]
|
|
|
198 |
}
|
199 |
|
200 |
+
return {"error": "No subtitles found"}
|
|
|
|
|
|
|
201 |
|
202 |
except Exception as e:
|
203 |
+
return {"error": str(e)}
|
|
|
204 |
|
205 |
|
206 |
|