Spaces:

Chrunos
/

mmmm

Running

App Files Files Community

Chrunos committed on Mar 22

Commit

f2946ca

verified ·

1 Parent(s): e862404

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -34

app.py CHANGED Viewed

@@ -133,9 +133,9 @@ async def get_transcript(youtube_url: str):
             'skip_download': True,
             'writesubtitles': True,
             'writeautomaticsub': True,
-            'subtitleslangs': ['en'],  # Try 'en.*' for all English variants
-            'subtitlesformat': 'best',  # Let yt-dlp choose best available format
-            'outtmpl': '%(id)s',        # Output template without extension
             'noplaylist': True,
             'cookiefile': "firefox-cookies.txt",
         }
@@ -146,38 +146,33 @@ async def get_transcript(youtube_url: str):
             info = ydl.extract_info(youtube_url, download=False)
             video_id = info['id']
-            # Find available subtitle format
-            sub_ext = None
-            for lang in ydl_opts['subtitleslangs']:
-                for sub_type in ['subtitles', 'automatic_captions']:
-                    subs = info.get(sub_type, {}).get(lang, [])
-                    if subs:
-                        sub_ext = subs[0].get('ext', 'vtt')
-                        break
-                if sub_ext:
-                    break
-            if sub_ext:
-                subtitle_file = f"{video_id}.{lang}.{sub_ext}"
-                if os.path.exists(subtitle_file):
-                    with open(subtitle_file, 'r', encoding='utf-8') as f:
-                        content = f.read()
-                    # Parse both VTT and SRT formats
-                    lines = content.split('\n')
-                    text_lines = []
-                    for line in lines:
-                        # Skip timestamps and metadata
-                        if '-->' in line or line.strip().isdigit() or line.startswith('WEBVTT'):
-                            continue
-                        if line.strip():
-                            text_lines.append(line.strip())
-                    return {"transcript": ' '.join(text_lines)}
-                else:
-                    return {"transcript": f"Found subtitles but file {subtitle_file} missing"}
             else:
-                return {"transcript": "No subtitles available in requested languages"}
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))

             'skip_download': True,
             'writesubtitles': True,
             'writeautomaticsub': True,
+            'subtitleslangs': ['en'],  # Target language
+            'subtitlesformat': 'json3',  # Force JSON format
+            'outtmpl': '%(id)s',        # Output template
             'noplaylist': True,
             'cookiefile': "firefox-cookies.txt",
         }
             info = ydl.extract_info(youtube_url, download=False)
             video_id = info['id']
+            # Get actual downloaded subtitle format
+            sub_ext = 'json3'  # Since we're forcing json3 format
+            lang = ydl_opts['subtitleslangs'][0]
+            subtitle_file = f"{video_id}.{lang}.{sub_ext}"
+            if os.path.exists(subtitle_file):
+                with open(subtitle_file, 'r', encoding='utf-8') as f:
+                    subs = json.load(f)
+                # Extract text from JSON3 format
+                text = ' '.join(
+                    [event['segs'][0]['utf8']
+                     for event in subs['events']
+                     if 'segs' in event and event['segs']
+                    ]
+                )
+                return {"transcript": text}
             else:
+                # Fallback check for other possible formats
+                possible_exts = ['vtt', 'srt', 'ttml', 'json3']
+                for ext in possible_exts:
+                    fallback_file = f"{video_id}.{lang}.{ext}"
+                    if os.path.exists(fallback_file):
+                        # Handle other formats if needed
+                        return {"transcript": f"Found {ext} but parsing not implemented"}
+                return {"transcript": f"No subtitle file found for {video_id}"}
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))