Update app.py
Browse files
app.py
CHANGED
@@ -132,16 +132,15 @@ async def get_transcript(youtube_url: str):
|
|
132 |
try:
|
133 |
ydl_opts = {
|
134 |
'skip_download': True,
|
135 |
-
'writesubtitles': True,
|
136 |
-
'writeautomaticsub':
|
137 |
-
'subtitleslangs': ['original'], #
|
138 |
'subtitlesformat': 'best',
|
139 |
'outtmpl': '%(id)s.%(ext)s',
|
140 |
'noplaylist': True,
|
141 |
'cookiefile': "firefox-cookies.txt",
|
142 |
-
#
|
143 |
-
'
|
144 |
-
'compat_opts': {'no-sub-translate'},
|
145 |
}
|
146 |
|
147 |
env_to_cookies_from_env("firefox-cookies.txt")
|
@@ -150,35 +149,55 @@ async def get_transcript(youtube_url: str):
|
|
150 |
info = ydl.extract_info(youtube_url, download=True)
|
151 |
video_id = info['id']
|
152 |
|
153 |
-
#
|
154 |
-
original_lang = info.get('original_lang')
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
|
|
|
|
|
|
160 |
|
161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
|
163 |
if subtitle_files:
|
164 |
-
#
|
165 |
-
|
166 |
-
|
|
|
|
|
167 |
content = f.read()
|
168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
return {
|
170 |
-
"transcript":
|
171 |
"language": original_lang,
|
172 |
-
"
|
173 |
}
|
174 |
-
|
175 |
return {
|
176 |
-
"error": "
|
177 |
-
"available_langs": list(info.get('subtitles', {}).keys())
|
|
|
178 |
}
|
179 |
|
180 |
except Exception as e:
|
181 |
-
return {"error": str(e)}
|
182 |
|
183 |
|
184 |
|
|
|
132 |
try:
|
133 |
ydl_opts = {
|
134 |
'skip_download': True,
|
135 |
+
'writesubtitles': True,
|
136 |
+
'writeautomaticsub': True, # 启用自动字幕作为备选
|
137 |
+
'subtitleslangs': ['original'], # 优先原始语言
|
138 |
'subtitlesformat': 'best',
|
139 |
'outtmpl': '%(id)s.%(ext)s',
|
140 |
'noplaylist': True,
|
141 |
'cookiefile': "firefox-cookies.txt",
|
142 |
+
'compat_opts': {'no-sub-translate'}, # 保持原始语言
|
143 |
+
'retries': 3,
|
|
|
144 |
}
|
145 |
|
146 |
env_to_cookies_from_env("firefox-cookies.txt")
|
|
|
149 |
info = ydl.extract_info(youtube_url, download=True)
|
150 |
video_id = info['id']
|
151 |
|
152 |
+
# 精确获取原始语言(修复YouTube API返回不一致问题)
|
153 |
+
original_lang = info.get('original_lang')
|
154 |
+
if not original_lang:
|
155 |
+
original_lang = info.get('subtitles', {}).get('original', [{}])[0].get('language')
|
156 |
+
original_lang = original_lang or 'en'
|
157 |
+
|
158 |
+
# 扩展匹配模式(包含自动生成字幕)
|
159 |
+
subtitle_pattern = re.compile(
|
160 |
+
rf"^{re.escape(video_id)}\.(a\.)?{re.escape(original_lang)}\..+$"
|
161 |
+
)
|
162 |
|
163 |
+
# 查找所有可能匹配的文件
|
164 |
+
subtitle_files = [
|
165 |
+
f for f in os.listdir('.')
|
166 |
+
if subtitle_pattern.match(f)
|
167 |
+
]
|
168 |
+
|
169 |
+
logger.info(f"匹配到的字幕文件: {subtitle_files}")
|
170 |
|
171 |
if subtitle_files:
|
172 |
+
# 优先手动上传的字幕(非自动生成)
|
173 |
+
preferred_files = [f for f in subtitle_files if 'a.' not in f]
|
174 |
+
selected_file = preferred_files[0] if preferred_files else subtitle_files[0]
|
175 |
+
|
176 |
+
with open(selected_file, 'r', encoding='utf-8') as f:
|
177 |
content = f.read()
|
178 |
+
|
179 |
+
# 清理字幕内容
|
180 |
+
clean_content = re.sub(
|
181 |
+
r'^\d+\n\d{2}:\d{2}:\d{2}[\.,]\d{3} --> \d{2}:\d{2}:\d{2}[\.,]\d{3}\n',
|
182 |
+
'',
|
183 |
+
content,
|
184 |
+
flags=re.MULTILINE
|
185 |
+
).strip()
|
186 |
+
|
187 |
return {
|
188 |
+
"transcript": clean_content,
|
189 |
"language": original_lang,
|
190 |
+
"sub_type": "manual" if 'a.' not in selected_file else "auto-generated"
|
191 |
}
|
192 |
+
|
193 |
return {
|
194 |
+
"error": "未找到原始语言字幕",
|
195 |
+
"available_langs": list(info.get('subtitles', {}).keys())
|
196 |
+
+ list(info.get('automatic_captions', {}).keys())
|
197 |
}
|
198 |
|
199 |
except Exception as e:
|
200 |
+
return {"error": f"处理失败: {str(e)}"}
|
201 |
|
202 |
|
203 |
|