Chrunos committed on
Commit
0651ec2
·
verified ·
1 Parent(s): 888ac00

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -52
app.py CHANGED
@@ -128,77 +128,81 @@ async def get_video_url(youtube_url: str):
128
 
129
 
130
  @app.get("/script")
131
- async def get_transcript(youtube_url: str):
132
  try:
 
133
  ydl_opts = {
134
  'skip_download': True,
135
- 'writesubtitles': True, # 仅下载手动上传的字幕
136
- 'writeautomaticsub': False, # 禁用自动生成字幕
137
- 'subtitleslangs': ['orig'], # 关键参数:只获取原始语言
138
  'subtitlesformat': 'best',
139
  'outtmpl': '%(id)s.%(ext)s',
140
  'noplaylist': True,
141
- 'cookiefile': "firefox-cookies.txt",
142
- 'ignoreerrors': True,
143
- # 精准控制参数
144
- 'postprocessors': [],
145
- 'compat_opts': [
146
- 'no-youtube-unavailable-videos',
147
- 'no-sub-translate'
148
- ],
149
  }
150
-
151
  env_to_cookies_from_env("firefox-cookies.txt")
152
-
 
 
153
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
154
- # 精确控制下载行为
155
- info = ydl.extract_info(youtube_url, download=False) # 先获取元数据
156
  video_id = info['id']
 
157
 
158
- # 直接构建原始语言字幕文件名
159
- original_lang = info.get('original_lang') or 'en'
160
- expected_files = [
161
- f"{video_id}.{original_lang}.vtt",
162
- f"{video_id}.{original_lang}.srt",
163
- f"{video_id}.{original_lang}.json3"
164
- ]
165
 
166
- # 检查本地是否已有字幕文件
167
- existing_subs = [f for f in expected_files if os.path.exists(f)]
 
 
 
168
 
169
- if not existing_subs:
170
- # 精准下载单个字幕文件
171
- ydl.params.update({'writesubtitles': True})
172
- ydl.download([youtube_url])
173
-
174
- # 重新检查下载结果
175
- existing_subs = [f for f in expected_files if os.path.exists(f)]
 
176
 
177
- if existing_subs:
178
- with open(existing_subs[0], 'r', encoding='utf-8') as f:
 
 
 
 
179
  content = f.read()
180
 
181
- # 基础清理(保持原格式)
182
- clean_content = re.sub(
183
- r'^\d+\n\d{2}:\d{2}:\d{2}[\.,]\d{3} --> \d{2}:\d{2}:\d{2}[\.,]\d{3}\n',
184
- '',
185
- content,
186
- flags=re.MULTILINE
187
- ).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
- return {
190
- "transcript": clean_content,
191
- "language": original_lang,
192
- "source": "original"
193
- }
194
 
195
- return {
196
- "error": "无原始语言字幕",
197
- "available": bool(info.get('subtitles'))
198
- }
199
-
200
  except Exception as e:
201
- return {"error": f"处理失败: {str(e)}"}
 
202
 
203
 
204
 
 
128
 
129
 
130
  @app.get("/script")
131
+ async def get_transcript(youtube_url: str, language: str = None):
132
  try:
133
+ # If no specific language is requested, we'll try to get any available subtitle
134
  ydl_opts = {
135
  'skip_download': True,
136
+ 'writesubtitles': True,
137
+ 'writeautomaticsub': True,
138
+ 'subtitleslangs': ['all'] if not language else [language],
139
  'subtitlesformat': 'best',
140
  'outtmpl': '%(id)s.%(ext)s',
141
  'noplaylist': True,
142
+ 'cookiefile': "firefox-cookies.txt"
 
 
 
 
 
 
 
143
  }
 
144
  env_to_cookies_from_env("firefox-cookies.txt")
145
+ # Show current directory structure before download
146
+ logger.info(f"Current directory files (before): {os.listdir('.')}")
147
+
148
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
149
+ info = ydl.extract_info(youtube_url, download=True)
 
150
  video_id = info['id']
151
+ logger.info(f"Video ID: {video_id}")
152
 
153
+ # Check actual downloaded files
154
+ logger.info(f"Current directory files (after extraction): {os.listdir('.')}")
 
 
 
 
 
155
 
156
+ # First see if we can find a subtitle file for the requested language
157
+ subtitle_files = []
158
+ if language:
159
+ subtitle_files = [f for f in os.listdir('.')
160
+ if f.startswith(video_id) and (language in f)]
161
 
162
+ # If no specific language requested or no files found for requested language,
163
+ # get any subtitle file for this video
164
+ if not subtitle_files:
165
+ subtitle_files = [f for f in os.listdir('.')
166
+ if f.startswith(video_id) and
167
+ any(f.endswith(ext) for ext in ['.vtt', '.srt', '.ttml', '.json3'])]
168
+
169
+ logger.info(f"Potential subtitle files: {subtitle_files}")
170
 
171
+ if subtitle_files:
172
+ # Process the first found subtitle file
173
+ subtitle_file = subtitle_files[0]
174
+ logger.info(f"Processing subtitle file: {subtitle_file}")
175
+
176
+ with open(subtitle_file, 'r', encoding='utf-8') as f:
177
  content = f.read()
178
 
179
+ # Add format-specific parsing
180
+ if subtitle_file.endswith('.json3'):
181
+ import json
182
+ subs = json.loads(content)
183
+ text = ' '.join([e['segs'][0]['utf8'] for e in subs['events'] if e.get('segs')])
184
+ elif subtitle_file.endswith('.vtt'):
185
+ text = ' '.join(line.strip() for line in content.split('\n')
186
+ if not line.startswith('WEBVTT')
187
+ and '-->' not in line
188
+ and not line.strip().isdigit()
189
+ and line.strip())
190
+ elif subtitle_file.endswith('.srt'):
191
+ # Simple SRT parsing - skip timestamps and numbers
192
+ lines = []
193
+ for line in content.split('\n'):
194
+ if not line.strip().isdigit() and '-->' not in line and line.strip():
195
+ lines.append(line.strip())
196
+ text = ' '.join(lines)
197
+ else:
198
+ text = f"Unsupported format: {subtitle_file}"
199
 
200
+ return {"transcript": text, "language": subtitle_file.split('.')[-2] if '.' in subtitle_file else "unknown"}
 
 
 
 
201
 
202
+ return {"transcript": f"No subtitle files found for {video_id}", "language": "none"}
 
 
 
 
203
  except Exception as e:
204
+ logger.error(f"Error: {str(e)}", exc_info=True)
205
+ raise HTTPException(status_code=500, detail=str(e))
206
 
207
 
208