Chrunos committed on
Commit
26dfd2b
·
verified ·
1 Parent(s): 7cc734b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -36
app.py CHANGED
@@ -170,45 +170,21 @@ async def get_transcript(youtube_url: str):
170
  if subtitle_file.endswith('.json3'):
171
  import json
172
  subs = json.loads(content)
173
- # Extract text segments and clean duplicates
174
- segments = []
175
- seen = set()
176
- for event in subs['events']:
177
- if 'segs' in event and event['segs']:
178
- text = event['segs'][0]['utf8'].strip()
179
- if text and text not in seen:
180
- segments.append(text)
181
- seen.add(text)
182
- transcript = ' '.join(segments)
183
-
184
  elif subtitle_file.endswith('.vtt'):
185
- # Parse VTT format
186
- transcript = []
187
- current_text = ''
188
- for line in content.split('\n'):
189
- if '-->' in line or line.strip().isdigit() or line.startswith('WEBVTT'):
190
- if current_text:
191
- transcript.append(current_text.strip())
192
- current_text = ''
193
- continue
194
- if line.strip() and not line.startswith('NOTE'):
195
- current_text += ' ' + line.strip()
196
- transcript = ' '.join(list(dict.fromkeys(transcript))) # Remove duplicates while preserving order
197
-
198
  else:
199
- transcript = "Unsupported subtitle format"
200
-
201
- # Post-process formatting
202
- cleaned_transcript = (
203
- transcript.replace(" ", " ") # Remove double spaces
204
- .replace("hi ", "") # Remove residual VTT artifacts
205
- .replace("Kind: captions Language: en", "")
206
- .strip()
207
- )
208
-
209
- return {"transcript": cleaned_transcript}
210
-
211
  except Exception as e:
 
212
  raise HTTPException(status_code=500, detail=str(e))
213
 
214
 
 
170
  if subtitle_file.endswith('.json3'):
171
  import json
172
  subs = json.loads(content)
173
+ text = ' '.join([e['segs'][0]['utf8'] for e in subs['events'] if e.get('segs')])
 
 
 
 
 
 
 
 
 
 
174
  elif subtitle_file.endswith('.vtt'):
175
+ text = ' '.join(line.strip() for line in content.split('\n')
176
+ if not line.startswith('WEBVTT')
177
+ and '-->' not in line
178
+ and not line.strip().isdigit())
 
 
 
 
 
 
 
 
 
179
  else:
180
+ text = f"Unsupported format: {subtitle_file}"
181
+
182
+ return {"transcript": text}
183
+
184
+ return {"transcript": f"No subtitle files found for {video_id}"}
185
+
 
 
 
 
 
 
186
  except Exception as e:
187
+ logger.error(f"Error: {str(e)}", exc_info=True)
188
  raise HTTPException(status_code=500, detail=str(e))
189
 
190