Update app.py
Browse files
app.py
CHANGED
@@ -128,77 +128,81 @@ async def get_video_url(youtube_url: str):
|
|
128 |
|
129 |
|
130 |
@app.get("/script")
|
131 |
-
async def get_transcript(youtube_url: str):
|
132 |
try:
|
|
|
133 |
ydl_opts = {
|
134 |
'skip_download': True,
|
135 |
-
'writesubtitles': True,
|
136 |
-
'writeautomaticsub':
|
137 |
-
'subtitleslangs': ['
|
138 |
'subtitlesformat': 'best',
|
139 |
'outtmpl': '%(id)s.%(ext)s',
|
140 |
'noplaylist': True,
|
141 |
-
'cookiefile': "firefox-cookies.txt"
|
142 |
-
'ignoreerrors': True,
|
143 |
-
# Fine-grained control parameters
|
144 |
-
'postprocessors': [],
|
145 |
-
'compat_opts': [
|
146 |
-
'no-youtube-unavailable-videos',
|
147 |
-
'no-sub-translate'
|
148 |
-
],
|
149 |
}
|
150 |
-
|
151 |
env_to_cookies_from_env("firefox-cookies.txt")
|
152 |
-
|
|
|
|
|
153 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
154 |
-
|
155 |
-
info = ydl.extract_info(youtube_url, download=False) # 先获取元数据
|
156 |
video_id = info['id']
|
|
|
157 |
|
158 |
-
#
|
159 |
-
|
160 |
-
expected_files = [
|
161 |
-
f"{video_id}.{original_lang}.vtt",
|
162 |
-
f"{video_id}.{original_lang}.srt",
|
163 |
-
f"{video_id}.{original_lang}.json3"
|
164 |
-
]
|
165 |
|
166 |
-
#
|
167 |
-
|
|
|
|
|
|
|
168 |
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
|
|
176 |
|
177 |
-
if
|
178 |
-
|
|
|
|
|
|
|
|
|
179 |
content = f.read()
|
180 |
|
181 |
-
#
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
|
189 |
-
return {
|
190 |
-
"transcript": clean_content,
|
191 |
-
"language": original_lang,
|
192 |
-
"source": "original"
|
193 |
-
}
|
194 |
|
195 |
-
return {
|
196 |
-
"error": "无原始语言字幕",
|
197 |
-
"available": bool(info.get('subtitles'))
|
198 |
-
}
|
199 |
-
|
200 |
except Exception as e:
|
201 |
-
|
|
|
202 |
|
203 |
|
204 |
|
|
|
128 |
|
129 |
|
130 |
# Extensions of the subtitle files this endpoint looks for next to the
# '%(id)s.%(ext)s' output template.
_SUBTITLE_EXTENSIONS = ('.vtt', '.srt', '.ttml', '.json3')


def _list_subtitle_files(video_id):
    """Return the subtitle files in the current directory that belong to *video_id*.

    Names are sorted so the choice is deterministic, with '.ttml' last because
    this module has no parser for it.
    """
    prefix = f"{video_id}."
    found = [
        name for name in os.listdir('.')
        if name.startswith(prefix) and name.endswith(_SUBTITLE_EXTENSIONS)
    ]
    return sorted(found, key=lambda name: (name.endswith('.ttml'), name))


def _filter_by_language(subtitle_files, language):
    """Return the files whose language component equals *language* or is a regional variant of it."""
    matching = []
    for name in subtitle_files:
        # Files are named '<id>.<lang>.<ext>', so the language is the
        # second-to-last dot-separated component.
        lang_part = name.split('.')[-2]
        if lang_part == language or lang_part.startswith(f"{language}-"):
            matching.append(name)
    return matching


def _subtitle_to_text(subtitle_file, content):
    """Convert the raw *content* of *subtitle_file* to plain text.

    The format is chosen from the file extension. Formats without a parser
    yield the same "Unsupported format: ..." placeholder as before.
    """
    if subtitle_file.endswith('.json3'):
        import json  # local import: the file's top-level imports are not visible here

        pieces = []
        for event in json.loads(content).get('events', []):
            # An event can carry several segments (one per word in automatic
            # captions); reading only the first one would drop the rest.
            joined = ''.join(seg.get('utf8', '') for seg in event.get('segs') or []).strip()
            if joined:
                pieces.append(joined)
        return ' '.join(pieces)

    is_vtt = subtitle_file.endswith('.vtt')
    if is_vtt or subtitle_file.endswith('.srt'):
        kept = []
        for raw_line in content.split('\n'):
            line = raw_line.strip()
            # Skip blank lines, cue counters and timestamp lines.
            if not line or line.isdigit() or '-->' in line:
                continue
            if is_vtt and raw_line.startswith('WEBVTT'):
                continue
            kept.append(line)
        return ' '.join(kept)

    return f"Unsupported format: {subtitle_file}"


def _remove_files(paths):
    """Best-effort removal of *paths*; a failure is logged and never raised."""
    for path in paths:
        try:
            os.remove(path)
        except OSError as exc:
            logger.warning(f"Could not remove subtitle file {path}: {exc}")


@app.get("/script")
async def get_transcript(youtube_url: str, language: str = None):
    """Download the subtitles of a video and return them as plain text.

    Args:
        youtube_url: URL handed to yt-dlp.
        language: Optional subtitle language code. When omitted, all available
            subtitle languages are requested and the first file found is used.
            When given but no matching file exists, any subtitle file of the
            video is used instead.

    Returns:
        A dict with "transcript" (the extracted text, or a message when no
        subtitle file was found) and "language" (taken from the file name, or
        "none" when nothing was found).

    Raises:
        HTTPException: status 500 with the error message for any failure.
    """
    cookie_path = "firefox-cookies.txt"
    try:
        # If no specific language is requested, ask for every available subtitle.
        ydl_opts = {
            'skip_download': True,
            'writesubtitles': True,
            'writeautomaticsub': True,
            'subtitleslangs': ['all'] if not language else [language],
            'subtitlesformat': 'best',
            'outtmpl': '%(id)s.%(ext)s',
            'noplaylist': True,
            'cookiefile': cookie_path,
        }
        # Presumably materializes the cookie file from the environment before
        # yt-dlp reads it -- TODO confirm against the helper's definition.
        env_to_cookies_from_env(cookie_path)
        logger.info(f"Current directory files (before): {os.listdir('.')}")

        # NOTE(review): extract_info is a synchronous call inside an async
        # handler, so the event loop is blocked while it runs.
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=True)
        video_id = info['id']
        logger.info(f"Video ID: {video_id}")
        logger.info(f"Current directory files (after extraction): {os.listdir('.')}")

        downloaded = _list_subtitle_files(video_id)
        try:
            # Prefer the requested language; otherwise take any subtitle file.
            subtitle_files = _filter_by_language(downloaded, language) if language else []
            if not subtitle_files:
                subtitle_files = downloaded
            logger.info(f"Potential subtitle files: {subtitle_files}")

            if not subtitle_files:
                return {"transcript": f"No subtitle files found for {video_id}", "language": "none"}

            subtitle_file = subtitle_files[0]
            logger.info(f"Processing subtitle file: {subtitle_file}")

            # utf-8-sig also accepts plain UTF-8 and strips a leading BOM, which
            # would otherwise defeat the header and counter checks on line one.
            with open(subtitle_file, 'r', encoding='utf-8-sig') as f:
                content = f.read()

            text = _subtitle_to_text(subtitle_file, content)
            return {"transcript": text, "language": subtitle_file.split('.')[-2] if '.' in subtitle_file else "unknown"}
        finally:
            # Delete what this request downloaded so files do not pile up and
            # later requests cannot pick up stale subtitles.
            _remove_files(downloaded)
    except Exception as e:
        logger.error(f"Error: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
206 |
|
207 |
|
208 |
|