Chrunos committed on
Commit
60e52f6
·
verified ·
1 Parent(s): 2dc08f2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -18
app.py CHANGED
@@ -168,24 +168,48 @@ async def get_transcript(youtube_url: str):
168
 
169
  # Add format-specific parsing
170
  if subtitle_file.endswith('.json3'):
171
- import json
172
- subs = json.loads(content)
173
- text = ' '.join([e['segs'][0]['utf8'] for e in subs['events'] if e.get('segs')])
174
- elif subtitle_file.endswith('.vtt'):
175
- text = ' '.join(line.strip() for line in content.split('\n')
176
- if not line.startswith('WEBVTT')
177
- and '-->' not in line
178
- and not line.strip().isdigit())
179
- else:
180
- text = f"Unsupported format: {subtitle_file}"
181
-
182
- return {"transcript": text}
183
-
184
- return {"transcript": f"No subtitle files found for {video_id}"}
185
-
186
- except Exception as e:
187
- logger.error(f"Error: {str(e)}", exc_info=True)
188
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
 
191
 
 
168
 
169
  # Add format-specific parsing
170
  if subtitle_file.endswith('.json3'):
171
+ import json
172
+ subs = json.loads(content)
173
+ # Extract text segments and clean duplicates
174
+ segments = []
175
+ seen = set()
176
+ for event in subs['events']:
177
+ if 'segs' in event and event['segs']:
178
+ text = event['segs'][0]['utf8'].strip()
179
+ if text and text not in seen:
180
+ segments.append(text)
181
+ seen.add(text)
182
+ transcript = ' '.join(segments)
183
+
184
+ elif subtitle_file.endswith('.vtt'):
185
+ # Parse VTT format
186
+ transcript = []
187
+ current_text = ''
188
+ for line in content.split('\n'):
189
+ if '-->' in line or line.strip().isdigit() or line.startswith('WEBVTT'):
190
+ if current_text:
191
+ transcript.append(current_text.strip())
192
+ current_text = ''
193
+ continue
194
+ if line.strip() and not line.startswith('NOTE'):
195
+ current_text += ' ' + line.strip()
196
+ transcript = ' '.join(list(dict.fromkeys(transcript))) # Remove duplicates while preserving order
197
+
198
+ else:
199
+ transcript = "Unsupported subtitle format"
200
+
201
+ # Post-process formatting
202
+ cleaned_transcript = (
203
+ transcript.replace(" ", " ") # Remove double spaces
204
+ .replace("hi ", "") # Remove residual VTT artifacts
205
+ .replace("Kind: captions Language: en", "")
206
+ .strip()
207
+ )
208
+
209
+ return {"transcript": cleaned_transcript}
210
+
211
+ except Exception as e:
212
+ raise HTTPException(status_code=500, detail=str(e))
213
 
214
 
215