Spaces:

Chrunos
/

zams

Running

App Files Community

Chrunos committed on Mar 22

Commit

cc7aba2

verified ·

1 Parent(s): 60e52f6

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -42

app.py CHANGED Viewed

@@ -168,48 +168,48 @@ async def get_transcript(youtube_url: str):
                 # Add format-specific parsing
                 if subtitle_file.endswith('.json3'):
-                import json
-                subs = json.loads(content)
-                # Extract text segments and clean duplicates
-                segments = []
-                seen = set()
-                for event in subs['events']:
-                    if 'segs' in event and event['segs']:
-                        text = event['segs'][0]['utf8'].strip()
-                        if text and text not in seen:
-                            segments.append(text)
-                            seen.add(text)
-                transcript = ' '.join(segments)
-            elif subtitle_file.endswith('.vtt'):
-                # Parse VTT format
-                transcript = []
-                current_text = ''
-                for line in content.split('\n'):
-                    if '-->' in line or line.strip().isdigit() or line.startswith('WEBVTT'):
-                        if current_text:
-                            transcript.append(current_text.strip())
-                            current_text = ''
-                        continue
-                    if line.strip() and not line.startswith('NOTE'):
-                        current_text += ' ' + line.strip()
-                transcript = ' '.join(list(dict.fromkeys(transcript)))  # Remove duplicates while preserving order
-            else:
-                transcript = "Unsupported subtitle format"
-            # Post-process formatting
-            cleaned_transcript = (
-                transcript.replace("  ", " ")  # Remove double spaces
-                .replace("hi ", "")  # Remove residual VTT artifacts
-                .replace("Kind: captions Language: en", "")
-                .strip()
-            )
-            return {"transcript": cleaned_transcript}
-        except Exception as e:
-            raise HTTPException(status_code=500, detail=str(e))

                 # Add format-specific parsing
                 if subtitle_file.endswith('.json3'):
+                    import json
+                    subs = json.loads(content)
+                    # Extract text segments and clean duplicates
+                    segments = []
+                    seen = set()
+                    for event in subs['events']:
+                        if 'segs' in event and event['segs']:
+                            text = event['segs'][0]['utf8'].strip()
+                            if text and text not in seen:
+                                segments.append(text)
+                                seen.add(text)
+                    transcript = ' '.join(segments)
+                elif subtitle_file.endswith('.vtt'):
+                    # Parse VTT format
+                    transcript = []
+                    current_text = ''
+                    for line in content.split('\n'):
+                        if '-->' in line or line.strip().isdigit() or line.startswith('WEBVTT'):
+                            if current_text:
+                                transcript.append(current_text.strip())
+                                current_text = ''
+                            continue
+                        if line.strip() and not line.startswith('NOTE'):
+                            current_text += ' ' + line.strip()
+                    transcript = ' '.join(list(dict.fromkeys(transcript)))  # Remove duplicates while preserving order
+                else:
+                    transcript = "Unsupported subtitle format"
+                # Post-process formatting
+                cleaned_transcript = (
+                    transcript.replace("  ", " ")  # Remove double spaces
+                    .replace("hi ", "")  # Remove residual VTT artifacts
+                    .replace("Kind: captions Language: en", "")
+                    .strip()
+                )
+                return {"transcript": cleaned_transcript}
+            except Exception as e:
+                raise HTTPException(status_code=500, detail=str(e))