Spaces:

Chrunos
/

mmmm

Running

App Files Files Community

Chrunos committed on Mar 22

Commit

0651ec2

verified ·

1 Parent(s): 888ac00

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -52

app.py CHANGED Viewed

@@ -128,77 +128,81 @@ async def get_video_url(youtube_url: str):
 @app.get("/script")
-async def get_transcript(youtube_url: str):
-    # Previous /script handler (the '-' lines are removed by this commit).
-    # Looks for a subtitle file named "<video id>.<original_lang>.<vtt|srt|json3>"
-    # (original_lang falls back to 'en'), asks yt-dlp to download when none is
-    # present, and returns the file text with SRT-style "index + timestamp" cue
-    # headers stripped. Failures are returned as an {"error": ...} dict, not raised.
     try:
         ydl_opts = {
             'skip_download': True,
-            'writesubtitles': True,       # only download manually uploaded subtitles
-            'writeautomaticsub': False,   # disable auto-generated subtitles
-            'subtitleslangs': ['orig'],   # key option: fetch only the original language
             'subtitlesformat': 'best',
             'outtmpl': '%(id)s.%(ext)s',
             'noplaylist': True,
-            'cookiefile': "firefox-cookies.txt",
-            'ignoreerrors': True,
-            # fine-grained control options
-            'postprocessors': [],
-            'compat_opts': [
-                'no-youtube-unavailable-videos',
-                'no-sub-translate'
-            ],
         }
         env_to_cookies_from_env("firefox-cookies.txt")
         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            # control the download behaviour precisely
-            info = ydl.extract_info(youtube_url, download=False)  # fetch metadata first
             video_id = info['id']
-            # build the original-language subtitle file names directly
-            original_lang = info.get('original_lang') or 'en'
-            expected_files = [
-                f"{video_id}.{original_lang}.vtt",
-                f"{video_id}.{original_lang}.srt",
-                f"{video_id}.{original_lang}.json3"
-            ]
-            # check whether a subtitle file already exists locally
-            existing_subs = [f for f in expected_files if os.path.exists(f)]
-            if not existing_subs:
-                # download just the one subtitle file
-                ydl.params.update({'writesubtitles': True})
-                ydl.download([youtube_url])
-            # re-check the download result
-            existing_subs = [f for f in expected_files if os.path.exists(f)]
-            if existing_subs:
-                with open(existing_subs[0], 'r', encoding='utf-8') as f:
                     content = f.read()
-                # basic cleanup (keep the original format)
-                clean_content = re.sub(
-                    r'^\d+\n\d{2}:\d{2}:\d{2}[\.,]\d{3} --> \d{2}:\d{2}:\d{2}[\.,]\d{3}\n',
-                    '',
-                    content,
-                    flags=re.MULTILINE
-                ).strip()
-                return {
-                    "transcript": clean_content,
-                    "language": original_lang,
-                    "source": "original"
-                }
-            return {
-                "error": "无原始语言字幕",
-                "available": bool(info.get('subtitles'))
-            }
     except Exception as e:
-        return {"error": f"处理失败: {str(e)}"}

 @app.get("/script")
+async def get_transcript(youtube_url: str, language: str = None):
+    """Download a video's subtitles with yt-dlp and return them as plain text.
+
+    Args:
+        youtube_url: URL handed to yt-dlp.
+        language: Optional subtitle language code. When omitted, all
+            available subtitle tracks are requested and the first subtitle
+            file (in sorted name order) is used.
+
+    Returns:
+        A dict with "transcript" (the flattened subtitle text, an
+        "Unsupported format" note, or a "No subtitle files found" message)
+        and "language" (the language tag taken from the chosen file name,
+        "unknown", or "none").
+
+    Raises:
+        HTTPException: status 500 carrying the error message on any failure.
+    """
     try:
+        # If no specific language is requested, we'll try to get any available subtitle
         ydl_opts = {
             'skip_download': True,
+            'writesubtitles': True,
+            'writeautomaticsub': True,
+            'subtitleslangs': ['all'] if not language else [language],
             'subtitlesformat': 'best',
             'outtmpl': '%(id)s.%(ext)s',
             'noplaylist': True,
+            'cookiefile': "firefox-cookies.txt"
         }
         env_to_cookies_from_env("firefox-cookies.txt")
+        # Show current directory structure before download
+        logger.info(f"Current directory files (before): {os.listdir('.')}")
         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(youtube_url, download=True)
             video_id = info['id']
+            logger.info(f"Video ID: {video_id}")
+            # Check actual downloaded files
+            logger.info(f"Current directory files (after extraction): {os.listdir('.')}")
+            # Only "<id>.<...>.<subtitle ext>" files are candidates. sorted() makes the
+            # pick deterministic because os.listdir() returns names in arbitrary order.
+            subtitle_exts = ('.vtt', '.srt', '.ttml', '.json3')
+            candidates = sorted(f for f in os.listdir('.')
+                                if f.startswith(f"{video_id}.") and f.endswith(subtitle_exts))
+            # First see if we can find a subtitle file for the requested language.
+            # Match the "<id>.<language>." prefix rather than a bare substring, so a
+            # code like "en" cannot match inside the video id or another tag.
+            subtitle_files = []
+            if language:
+                subtitle_files = [f for f in candidates
+                                  if f.startswith(f"{video_id}.{language}.")]
+            # If no specific language requested or no files found for requested language,
+            # get any subtitle file for this video
+            if not subtitle_files:
+                subtitle_files = candidates
+            logger.info(f"Potential subtitle files: {subtitle_files}")
+            if subtitle_files:
+                # Process the first found subtitle file
+                subtitle_file = subtitle_files[0]
+                logger.info(f"Processing subtitle file: {subtitle_file}")
+                with open(subtitle_file, 'r', encoding='utf-8') as f:
                     content = f.read()
+                # Add format-specific parsing
+                if subtitle_file.endswith('.json3'):
+                    import json
+                    subs = json.loads(content)
+                    # An event can carry several segments (e.g. word-level captions);
+                    # concatenate all of them instead of keeping only the first.
+                    pieces = (''.join(seg.get('utf8', '') for seg in e['segs']).strip()
+                              for e in (subs.get('events') or []) if e.get('segs'))
+                    text = ' '.join(p for p in pieces if p)
+                elif subtitle_file.endswith('.vtt'):
+                    # Skip the WEBVTT signature and the Kind:/Language: header lines
+                    # so they do not leak into the transcript.
+                    text = ' '.join(line.strip() for line in content.split('\n')
+                           if not line.startswith(('WEBVTT', 'Kind:', 'Language:'))
+                           and '-->' not in line
+                           and not line.strip().isdigit()
+                           and line.strip())
+                elif subtitle_file.endswith('.srt'):
+                    # Simple SRT parsing - skip timestamps and numbers
+                    text = ' '.join(line.strip() for line in content.split('\n')
+                           if not line.strip().isdigit()
+                           and '-->' not in line
+                           and line.strip())
+                else:
+                    text = f"Unsupported format: {subtitle_file}"
+                # "<id>.<lang>.<ext>" minus the id splits into ['', '<lang>', '<ext>'];
+                # a file without a language tag reports "unknown" rather than the id.
+                name_parts = subtitle_file[len(video_id):].split('.')
+                return {"transcript": text, "language": name_parts[-2] if len(name_parts) >= 3 else "unknown"}
+            return {"transcript": f"No subtitle files found for {video_id}", "language": "none"}
     except Exception as e:
+        logger.error(f"Error: {str(e)}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e)) from e