Update app.py
Browse files
app.py
CHANGED
@@ -134,54 +134,93 @@ async def get_transcript(youtube_url: str):
|
|
134 |
'skip_download': True,
|
135 |
'writesubtitles': True,
|
136 |
'writeautomaticsub': True,
|
137 |
-
|
|
|
|
|
138 |
'subtitlesformat': 'best',
|
139 |
-
|
|
|
140 |
'noplaylist': True,
|
141 |
-
'cookiefile': "firefox-cookies.txt"
|
|
|
|
|
142 |
}
|
|
|
143 |
env_to_cookies_from_env("firefox-cookies.txt")
|
144 |
-
|
145 |
-
logger.info(f"Current directory files (before): {os.listdir('.')}")
|
146 |
-
|
147 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
148 |
info = ydl.extract_info(youtube_url, download=True)
|
149 |
video_id = info['id']
|
150 |
-
logger.info(f"Video ID: {video_id}")
|
151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
|
153 |
-
#
|
154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
|
156 |
-
|
157 |
-
subtitle_files = [f for f in os.listdir('.')
|
158 |
-
if f.startswith(video_id) and ('en' in f)]
|
159 |
-
logger.info(f"Potential subtitle files: {subtitle_files}")
|
160 |
|
161 |
if subtitle_files:
|
162 |
-
# Process the first found subtitle file
|
163 |
subtitle_file = subtitle_files[0]
|
164 |
-
logger.info(f"
|
165 |
|
166 |
with open(subtitle_file, 'r', encoding='utf-8') as f:
|
167 |
content = f.read()
|
168 |
-
|
169 |
-
#
|
170 |
if subtitle_file.endswith('.json3'):
|
171 |
-
import json
|
172 |
subs = json.loads(content)
|
173 |
-
text = '
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
else:
|
180 |
-
text =
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
|
186 |
except Exception as e:
|
187 |
logger.error(f"Error: {str(e)}", exc_info=True)
|
|
|
134 |
'skip_download': True,
|
135 |
'writesubtitles': True,
|
136 |
'writeautomaticsub': True,
|
137 |
+
# 扩展支持的语言列表(包含中英文)
|
138 |
+
'subtitleslangs': ['en', 'zh', 'zh-Hans', 'zh-Hant', 'zh-CN', 'zh-TW', 'a.*'],
|
139 |
+
# 自动选择最佳字幕格式
|
140 |
'subtitlesformat': 'best',
|
141 |
+
# 包含原始语言代码在文件名中
|
142 |
+
'outtmpl': '%(id)s.%(ext)s',
|
143 |
'noplaylist': True,
|
144 |
+
'cookiefile': "firefox-cookies.txt",
|
145 |
+
# 添加中文字幕兼容性参数
|
146 |
+
'compat_opts': {'no-youtube-unavailable-videos'},
|
147 |
}
|
148 |
+
|
149 |
env_to_cookies_from_env("firefox-cookies.txt")
|
150 |
+
|
|
|
|
|
151 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
152 |
info = ydl.extract_info(youtube_url, download=True)
|
153 |
video_id = info['id']
|
|
|
154 |
|
155 |
+
# 获取实际可用的字幕语言列表
|
156 |
+
available_subs = []
|
157 |
+
for sub_type in ['subtitles', 'automatic_captions']:
|
158 |
+
subs = info.get(sub_type, {})
|
159 |
+
available_subs.extend([
|
160 |
+
f"{lang}.{track[0]['ext']}"
|
161 |
+
for lang, track in subs.items()
|
162 |
+
])
|
163 |
|
164 |
+
# 优先顺序:中文 > 英文 > 其他语言
|
165 |
+
priority_langs = ['zh', 'en']
|
166 |
+
subtitle_files = sorted(
|
167 |
+
[f for f in os.listdir('.') if f.startswith(video_id)],
|
168 |
+
key=lambda x: (
|
169 |
+
-max([x.find(lang) for lang in priority_langs]),
|
170 |
+
len(x)
|
171 |
+
)
|
172 |
+
)
|
173 |
|
174 |
+
logger.info(f"Detected subtitle files: {subtitle_files}")
|
|
|
|
|
|
|
175 |
|
176 |
if subtitle_files:
|
|
|
177 |
subtitle_file = subtitle_files[0]
|
178 |
+
logger.info(f"Selected subtitle: {subtitle_file}")
|
179 |
|
180 |
with open(subtitle_file, 'r', encoding='utf-8') as f:
|
181 |
content = f.read()
|
182 |
+
|
183 |
+
# 统一处理不同字幕格式
|
184 |
if subtitle_file.endswith('.json3'):
|
|
|
185 |
subs = json.loads(content)
|
186 |
+
text = ''.join(
|
187 |
+
e['segs'][0]['utf8']
|
188 |
+
for e in subs['events']
|
189 |
+
if e.get('segs')
|
190 |
+
)
|
191 |
+
elif subtitle_file.endswith(('.vtt', '.srt')):
|
192 |
+
text = re.sub(
|
193 |
+
r'\d{2}:\d{2}:\d{2}[\.,]\d{3}.*?\n',
|
194 |
+
'',
|
195 |
+
content
|
196 |
+
)
|
197 |
+
text = re.sub(r'<.*?>|{.*?}', '', text)
|
198 |
else:
|
199 |
+
text = content # 原始内容
|
200 |
+
|
201 |
+
# 中文标点标准化
|
202 |
+
text = text.translate(str.maketrans({
|
203 |
+
',': ',',
|
204 |
+
'。': '.',
|
205 |
+
';': ';',
|
206 |
+
'!': '!',
|
207 |
+
'?': '?',
|
208 |
+
'“': '"',
|
209 |
+
'”': '"',
|
210 |
+
'‘': "'",
|
211 |
+
'’': "'"
|
212 |
+
}))
|
213 |
+
|
214 |
+
return {
|
215 |
+
"transcript": text,
|
216 |
+
"detected_language": subtitle_file.split('.')[-2],
|
217 |
+
"available_languages": available_subs
|
218 |
+
}
|
219 |
+
|
220 |
+
return {
|
221 |
+
"error": "No subtitles found",
|
222 |
+
"available_languages": available_subs
|
223 |
+
}
|
224 |
|
225 |
except Exception as e:
|
226 |
logger.error(f"Error: {str(e)}", exc_info=True)
|