Update app.py
Browse files
app.py
CHANGED
@@ -126,6 +126,7 @@ async def get_video_url(youtube_url: str):
|
|
126 |
|
127 |
|
128 |
|
|
|
129 |
@app.get("/script")
|
130 |
async def get_transcript(youtube_url: str):
|
131 |
try:
|
@@ -133,48 +134,61 @@ async def get_transcript(youtube_url: str):
|
|
133 |
'skip_download': True,
|
134 |
'writesubtitles': True,
|
135 |
'writeautomaticsub': True,
|
136 |
-
'subtitleslangs': ['en'],
|
137 |
-
'subtitlesformat': '
|
138 |
-
'outtmpl': '%(id)s',
|
139 |
'noplaylist': True,
|
140 |
'cookiefile': "firefox-cookies.txt",
|
|
|
141 |
}
|
142 |
|
143 |
-
|
144 |
-
|
|
|
145 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
146 |
info = ydl.extract_info(youtube_url, download=False)
|
147 |
video_id = info['id']
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
|
|
152 |
|
153 |
-
|
154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
with open(subtitle_file, 'r', encoding='utf-8') as f:
|
156 |
-
|
157 |
|
158 |
-
#
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
fallback_file = f"{video_id}.{lang}.{ext}"
|
171 |
-
if os.path.exists(fallback_file):
|
172 |
-
# Handle other formats if needed
|
173 |
-
return {"transcript": f"Found {ext} but parsing not implemented"}
|
174 |
|
175 |
-
return {"transcript":
|
|
|
|
|
176 |
|
177 |
except Exception as e:
|
|
|
178 |
raise HTTPException(status_code=500, detail=str(e))
|
179 |
|
180 |
|
|
|
@app.get("/script")
async def get_transcript(youtube_url: str):
    """Return the English transcript of a YouTube video as plain text.

    Uses yt-dlp to fetch only the subtitle track (uploaded or auto-generated
    English captions), then parses the resulting ``.json3`` or ``.vtt`` file
    from the working directory into a single space-joined string.

    Args:
        youtube_url: Full URL of the YouTube video.

    Returns:
        dict: ``{"transcript": <text>}``; a descriptive message is returned
        instead of captions when no subtitle file is found or the format is
        unsupported.

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    try:
        ydl_opts = {
            'skip_download': True,      # never download the media itself
            'writesubtitles': True,
            'writeautomaticsub': True,
            'subtitleslangs': ['en'],
            'subtitlesformat': 'best',
            'outtmpl': '%(id)s.%(ext)s',
            'noplaylist': True,
            'cookiefile': "firefox-cookies.txt",
            'logger': logger,
        }

        # Show current directory structure before download
        logger.info(f"Current directory files (before): {os.listdir('.')}")

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # download=True is required for the subtitle files to actually be
            # written to disk; 'skip_download' above still suppresses the
            # video/audio download itself. With download=False nothing is
            # written and the file search below can never succeed.
            info = ydl.extract_info(youtube_url, download=True)
            video_id = info['id']
            logger.info(f"Video ID: {video_id}")

            # Log available subtitle information
            logger.info(f"Subtitles available: {info.get('subtitles')}")
            logger.info(f"Auto subtitles available: {info.get('automatic_captions')}")

            # Check actual downloaded files
            logger.info(f"Current directory files (after extraction): {os.listdir('.')}")

            # Search for subtitle files written for this video in English
            subtitle_files = [f for f in os.listdir('.')
                              if f.startswith(video_id) and ('en' in f)]
            logger.info(f"Potential subtitle files: {subtitle_files}")

            if subtitle_files:
                # Process the first found subtitle file
                subtitle_file = subtitle_files[0]
                logger.info(f"Processing subtitle file: {subtitle_file}")

                with open(subtitle_file, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Format-specific parsing
                if subtitle_file.endswith('.json3'):
                    import json
                    subs = json.loads(content)
                    # Join every segment of every caption event — an event
                    # may carry more than one seg, and a seg may lack 'utf8'.
                    text = ' '.join(
                        seg.get('utf8', '')
                        for event in subs['events'] if event.get('segs')
                        for seg in event['segs']
                    )
                elif subtitle_file.endswith('.vtt'):
                    # Strip the WEBVTT header, cue timing lines and bare cue
                    # numbers; keep only caption text.
                    text = ' '.join(line.strip() for line in content.split('\n')
                                    if not line.startswith('WEBVTT')
                                    and '-->' not in line
                                    and not line.strip().isdigit())
                else:
                    text = f"Unsupported format: {subtitle_file}"

                return {"transcript": text}

            return {"transcript": f"No subtitle files found for {video_id}"}

    except Exception as e:
        logger.error(f"Error: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))