Chrunos committed on
Commit
4b467a4
·
verified ·
1 Parent(s): f2946ca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -29
app.py CHANGED
@@ -126,6 +126,7 @@ async def get_video_url(youtube_url: str):
126
 
127
 
128
 
 
129
  @app.get("/script")
130
  async def get_transcript(youtube_url: str):
131
  try:
@@ -133,48 +134,61 @@ async def get_transcript(youtube_url: str):
133
  'skip_download': True,
134
  'writesubtitles': True,
135
  'writeautomaticsub': True,
136
- 'subtitleslangs': ['en'], # Target language
137
- 'subtitlesformat': 'json3', # Force JSON format
138
- 'outtmpl': '%(id)s', # Output template
139
  'noplaylist': True,
140
  'cookiefile': "firefox-cookies.txt",
 
141
  }
142
 
143
- env_to_cookies_from_env("firefox-cookies.txt")
144
-
 
145
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
146
  info = ydl.extract_info(youtube_url, download=False)
147
  video_id = info['id']
148
-
149
- # Get actual downloaded subtitle format
150
- sub_ext = 'json3' # Since we're forcing json3 format
151
- lang = ydl_opts['subtitleslangs'][0]
 
152
 
153
- subtitle_file = f"{video_id}.{lang}.{sub_ext}"
154
- if os.path.exists(subtitle_file):
 
 
 
 
 
 
 
 
 
 
 
155
  with open(subtitle_file, 'r', encoding='utf-8') as f:
156
- subs = json.load(f)
157
 
158
- # Extract text from JSON3 format
159
- text = ' '.join(
160
- [event['segs'][0]['utf8']
161
- for event in subs['events']
162
- if 'segs' in event and event['segs']
163
- ]
164
- )
165
- return {"transcript": text}
166
- else:
167
- # Fallback check for other possible formats
168
- possible_exts = ['vtt', 'srt', 'ttml', 'json3']
169
- for ext in possible_exts:
170
- fallback_file = f"{video_id}.{lang}.{ext}"
171
- if os.path.exists(fallback_file):
172
- # Handle other formats if needed
173
- return {"transcript": f"Found {ext} but parsing not implemented"}
174
 
175
- return {"transcript": f"No subtitle file found for {video_id}"}
 
 
176
 
177
  except Exception as e:
 
178
  raise HTTPException(status_code=500, detail=str(e))
179
 
180
 
 
126
 
127
 
128
 
129
+
130
  @app.get("/script")
131
  async def get_transcript(youtube_url: str):
132
  try:
 
134
  'skip_download': True,
135
  'writesubtitles': True,
136
  'writeautomaticsub': True,
137
+ 'subtitleslangs': ['en'],
138
+ 'subtitlesformat': 'best',
139
+ 'outtmpl': '%(id)s.%(ext)s',
140
  'noplaylist': True,
141
  'cookiefile': "firefox-cookies.txt",
142
+ 'logger': logger,
143
  }
144
 
145
+ # Show current directory structure before download
146
+ logger.info(f"Current directory files (before): {os.listdir('.')}")
147
+
148
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
149
  info = ydl.extract_info(youtube_url, download=False)
150
  video_id = info['id']
151
+ logger.info(f"Video ID: {video_id}")
152
+
153
+ # Log available subtitle information
154
+ logger.info(f"Subtitles available: {info.get('subtitles')}")
155
+ logger.info(f"Auto subtitles available: {info.get('automatic_captions')}")
156
 
157
+ # Check actual downloaded files
158
+ logger.info(f"Current directory files (after extraction): {os.listdir('.')}")
159
+
160
+ # Search for subtitle files pattern
161
+ subtitle_files = [f for f in os.listdir('.')
162
+ if f.startswith(video_id) and ('en' in f)]
163
+ logger.info(f"Potential subtitle files: {subtitle_files}")
164
+
165
+ if subtitle_files:
166
+ # Process the first found subtitle file
167
+ subtitle_file = subtitle_files[0]
168
+ logger.info(f"Processing subtitle file: {subtitle_file}")
169
+
170
  with open(subtitle_file, 'r', encoding='utf-8') as f:
171
+ content = f.read()
172
 
173
+ # Add format-specific parsing
174
+ if subtitle_file.endswith('.json3'):
175
+ import json
176
+ subs = json.loads(content)
177
+ text = ' '.join([e['segs'][0]['utf8'] for e in subs['events'] if e.get('segs'))
178
+ elif subtitle_file.endswith('.vtt'):
179
+ text = ' '.join(line.strip() for line in content.split('\n')
180
+ if not line.startswith('WEBVTT')
181
+ and '-->' not in line
182
+ and not line.strip().isdigit())
183
+ else:
184
+ text = f"Unsupported format: {subtitle_file}"
 
 
 
 
185
 
186
+ return {"transcript": text}
187
+
188
+ return {"transcript": f"No subtitle files found for {video_id}"}
189
 
190
  except Exception as e:
191
+ logger.error(f"Error: {str(e)}", exc_info=True)
192
  raise HTTPException(status_code=500, detail=str(e))
193
 
194