Spaces:

Chrunos
/

zams

Running

App Files Community

Chrunos committed on Mar 22

Commit

888ac00

verified ·

1 Parent(s): 3913791

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -28

app.py CHANGED Viewed

@@ -132,59 +132,73 @@ async def get_transcript(youtube_url: str):
     try:
         ydl_opts = {
             'skip_download': True,
-            'writesubtitles': True,
-            'writeautomaticsub': True,
-            'subtitleslangs': ['all'],  # 获取所有字幕
             'subtitlesformat': 'best',
             'outtmpl': '%(id)s.%(ext)s',
             'noplaylist': True,
             'cookiefile': "firefox-cookies.txt",
             'ignoreerrors': True,
         }
         env_to_cookies_from_env("firefox-cookies.txt")
         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            info = ydl.extract_info(youtube_url, download=True)
             video_id = info['id']
-            # 合并手动和自动字幕列表
-            all_subs = list(info.get('subtitles', {}).keys()) + list(info.get('automatic_captions', {}).keys())
-            # 过滤有效语言代码（排除翻译字幕如en-zh-Hant）
-            valid_langs = [lang for lang in all_subs if '-' not in lang.split('.')[-1]]
-            if not valid_langs:
-                return {"error": "无可用字幕"}
-            # 直接取第一个有效语言
-            target_lang = valid_langs[0]
-            # 构建字幕文件名模式
-            subtitle_file = f"{video_id}.{target_lang}.vtt"  # 优先vtt格式
-            if not os.path.exists(subtitle_file):
-                subtitle_file = f"{video_id}.{target_lang}.srt"
-            if os.path.exists(subtitle_file):
-                with open(subtitle_file, 'r', encoding='utf-8') as f:
                     content = f.read()
-                # 基础清理
                 clean_content = re.sub(
-                    r'\d{2}:\d{2}:\d{2}[,.]\d{3}.*?\n',
-                    '',
-                    content
                 ).strip()
                 return {
                     "transcript": clean_content,
-                    "detected_lang": target_lang
                 }
-            return {"error": "字幕文件未找到"}
     except Exception as e:
-        return {"error": str(e)}

     try:
         ydl_opts = {
             'skip_download': True,
+            'writesubtitles': True,       # 仅下载手动上传的字幕
+            'writeautomaticsub': False,   # 禁用自动生成字幕
+            'subtitleslangs': ['orig'],   # 关键参数：只获取原始语言
             'subtitlesformat': 'best',
             'outtmpl': '%(id)s.%(ext)s',
             'noplaylist': True,
             'cookiefile': "firefox-cookies.txt",
             'ignoreerrors': True,
+            # 精准控制参数
+            'postprocessors': [],
+            'compat_opts': [
+                'no-youtube-unavailable-videos',
+                'no-sub-translate'
+            ],
         }
         env_to_cookies_from_env("firefox-cookies.txt")
         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            # 精确控制下载行为
+            info = ydl.extract_info(youtube_url, download=False)  # 先获取元数据
             video_id = info['id']
+            # 直接构建原始语言字幕文件名
+            original_lang = info.get('original_lang') or 'en'
+            expected_files = [
+                f"{video_id}.{original_lang}.vtt",
+                f"{video_id}.{original_lang}.srt",
+                f"{video_id}.{original_lang}.json3"
+            ]
+            # 检查本地是否已有字幕文件
+            existing_subs = [f for f in expected_files if os.path.exists(f)]
+            if not existing_subs:
+                # 精准下载单个字幕文件
+                ydl.params.update({'writesubtitles': True})
+                ydl.download([youtube_url])
+            # 重新检查下载结果
+            existing_subs = [f for f in expected_files if os.path.exists(f)]
+            if existing_subs:
+                with open(existing_subs[0], 'r', encoding='utf-8') as f:
                     content = f.read()
+                # 基础清理（保持原格式）
                 clean_content = re.sub(
+                    r'^\d+\n\d{2}:\d{2}:\d{2}[\.,]\d{3} --> \d{2}:\d{2}:\d{2}[\.,]\d{3}\n',
+                    '',
+                    content,
+                    flags=re.MULTILINE
                 ).strip()
                 return {
                     "transcript": clean_content,
+                    "language": original_lang,
+                    "source": "original"
                 }
+            return {
+                "error": "无原始语言字幕",
+                "available": bool(info.get('subtitles'))
+            }
     except Exception as e:
+        return {"error": f"处理失败: {str(e)}"}