Spaces:

Chrunos
/

mmmm

Running

App Files Community

Chrunos committed on Mar 22

Commit

4b467a4

verified ·

1 Parent(s): f2946ca

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -29

app.py CHANGED Viewed

@@ -126,6 +126,7 @@ async def get_video_url(youtube_url: str):
 @app.get("/script")
 async def get_transcript(youtube_url: str):
     try:
@@ -133,48 +134,61 @@ async def get_transcript(youtube_url: str):
             'skip_download': True,
             'writesubtitles': True,
             'writeautomaticsub': True,
-            'subtitleslangs': ['en'],  # Target language
-            'subtitlesformat': 'json3',  # Force JSON format
-            'outtmpl': '%(id)s',        # Output template
             'noplaylist': True,
             'cookiefile': "firefox-cookies.txt",
         }
-        env_to_cookies_from_env("firefox-cookies.txt")
         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
             info = ydl.extract_info(youtube_url, download=False)
             video_id = info['id']
-            # Get actual downloaded subtitle format
-            sub_ext = 'json3'  # Since we're forcing json3 format
-            lang = ydl_opts['subtitleslangs'][0]
-            subtitle_file = f"{video_id}.{lang}.{sub_ext}"
-            if os.path.exists(subtitle_file):
                 with open(subtitle_file, 'r', encoding='utf-8') as f:
-                    subs = json.load(f)
-                # Extract text from JSON3 format
-                text = ' '.join(
-                    [event['segs'][0]['utf8']
-                     for event in subs['events']
-                     if 'segs' in event and event['segs']
-                    ]
-                )
-                return {"transcript": text}
-            else:
-                # Fallback check for other possible formats
-                possible_exts = ['vtt', 'srt', 'ttml', 'json3']
-                for ext in possible_exts:
-                    fallback_file = f"{video_id}.{lang}.{ext}"
-                    if os.path.exists(fallback_file):
-                        # Handle other formats if needed
-                        return {"transcript": f"Found {ext} but parsing not implemented"}
-                return {"transcript": f"No subtitle file found for {video_id}"}
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))

 @app.get("/script")
 async def get_transcript(youtube_url: str):
     try:
             'skip_download': True,
             'writesubtitles': True,
             'writeautomaticsub': True,
+            'subtitleslangs': ['en'],
+            'subtitlesformat': 'best',
+            'outtmpl': '%(id)s.%(ext)s',
             'noplaylist': True,
             'cookiefile': "firefox-cookies.txt",
+            'logger': logger,
         }
+        # Show current directory structure before download
+        logger.info(f"Current directory files (before): {os.listdir('.')}")
         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
             info = ydl.extract_info(youtube_url, download=False)
             video_id = info['id']
+            logger.info(f"Video ID: {video_id}")
+            # Log available subtitle information
+            logger.info(f"Subtitles available: {info.get('subtitles')}")
+            logger.info(f"Auto subtitles available: {info.get('automatic_captions')}")
+            # Check actual downloaded files
+            logger.info(f"Current directory files (after extraction): {os.listdir('.')}")
+            # Search for subtitle files pattern
+            subtitle_files = [f for f in os.listdir('.')
+                             if f.startswith(video_id) and ('en' in f)]
+            logger.info(f"Potential subtitle files: {subtitle_files}")
+            if subtitle_files:
+                # Process the first found subtitle file
+                subtitle_file = subtitle_files[0]
+                logger.info(f"Processing subtitle file: {subtitle_file}")
                 with open(subtitle_file, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                # Add format-specific parsing
+                if subtitle_file.endswith('.json3'):
+                    import json
+                    subs = json.loads(content)
+                    text = ' '.join([e['segs'][0]['utf8'] for e in subs['events'] if e.get('segs'))
+                elif subtitle_file.endswith('.vtt'):
+                    text = ' '.join(line.strip() for line in content.split('\n')
+                           if not line.startswith('WEBVTT')
+                           and '-->' not in line
+                           and not line.strip().isdigit())
+                else:
+                    text = f"Unsupported format: {subtitle_file}"
+                return {"transcript": text}
+            return {"transcript": f"No subtitle files found for {video_id}"}
     except Exception as e:
+        logger.error(f"Error: {str(e)}", exc_info=True)
         raise HTTPException(status_code=500, detail=str(e))