Chrunos committed on
Commit
cc7aba2
·
verified ·
1 Parent(s): 60e52f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -42
app.py CHANGED
@@ -168,48 +168,48 @@ async def get_transcript(youtube_url: str):
168
 
169
  # Add format-specific parsing
170
  if subtitle_file.endswith('.json3'):
171
- import json
172
- subs = json.loads(content)
173
- # Extract text segments and clean duplicates
174
- segments = []
175
- seen = set()
176
- for event in subs['events']:
177
- if 'segs' in event and event['segs']:
178
- text = event['segs'][0]['utf8'].strip()
179
- if text and text not in seen:
180
- segments.append(text)
181
- seen.add(text)
182
- transcript = ' '.join(segments)
183
-
184
- elif subtitle_file.endswith('.vtt'):
185
- # Parse VTT format
186
- transcript = []
187
- current_text = ''
188
- for line in content.split('\n'):
189
- if '-->' in line or line.strip().isdigit() or line.startswith('WEBVTT'):
190
- if current_text:
191
- transcript.append(current_text.strip())
192
- current_text = ''
193
- continue
194
- if line.strip() and not line.startswith('NOTE'):
195
- current_text += ' ' + line.strip()
196
- transcript = ' '.join(list(dict.fromkeys(transcript))) # Remove duplicates while preserving order
197
-
198
- else:
199
- transcript = "Unsupported subtitle format"
200
-
201
- # Post-process formatting
202
- cleaned_transcript = (
203
- transcript.replace(" ", " ") # Remove double spaces
204
- .replace("hi ", "") # Remove residual VTT artifacts
205
- .replace("Kind: captions Language: en", "")
206
- .strip()
207
- )
208
-
209
- return {"transcript": cleaned_transcript}
210
-
211
- except Exception as e:
212
- raise HTTPException(status_code=500, detail=str(e))
213
 
214
 
215
 
 
168
 
169
  # Add format-specific parsing
170
  if subtitle_file.endswith('.json3'):
171
+ import json
172
+ subs = json.loads(content)
173
+ # Extract text segments and clean duplicates
174
+ segments = []
175
+ seen = set()
176
+ for event in subs['events']:
177
+ if 'segs' in event and event['segs']:
178
+ text = event['segs'][0]['utf8'].strip()
179
+ if text and text not in seen:
180
+ segments.append(text)
181
+ seen.add(text)
182
+ transcript = ' '.join(segments)
183
+
184
+ elif subtitle_file.endswith('.vtt'):
185
+ # Parse VTT format
186
+ transcript = []
187
+ current_text = ''
188
+ for line in content.split('\n'):
189
+ if '-->' in line or line.strip().isdigit() or line.startswith('WEBVTT'):
190
+ if current_text:
191
+ transcript.append(current_text.strip())
192
+ current_text = ''
193
+ continue
194
+ if line.strip() and not line.startswith('NOTE'):
195
+ current_text += ' ' + line.strip()
196
+ transcript = ' '.join(list(dict.fromkeys(transcript))) # Remove duplicates while preserving order
197
+
198
+ else:
199
+ transcript = "Unsupported subtitle format"
200
+
201
+ # Post-process formatting
202
+ cleaned_transcript = (
203
+ transcript.replace(" ", " ") # Remove double spaces
204
+ .replace("hi ", "") # Remove residual VTT artifacts
205
+ .replace("Kind: captions Language: en", "")
206
+ .strip()
207
+ )
208
+
209
+ return {"transcript": cleaned_transcript}
210
+
211
+ except Exception as e:
212
+ raise HTTPException(status_code=500, detail=str(e))
213
 
214
 
215