Spaces:

Chrunos
/

mmmm

Running

App Files Community

Chrunos committed on Mar 22

Commit

3d3b80f

verified ·

1 Parent(s): 942ffc3

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -66

app.py CHANGED Viewed

@@ -134,16 +134,15 @@ async def get_transcript(youtube_url: str):
             'skip_download': True,
             'writesubtitles': True,
             'writeautomaticsub': True,
-            # 扩展支持的语言列表（包含中英文）
-            'subtitleslangs': ['en', 'zh', 'zh-Hans', 'zh-Hant', 'zh-CN', 'zh-TW', 'a.*'],
-            # 自动选择最佳字幕格式
             'subtitlesformat': 'best',
-            # 包含原始语言代码在文件名中
-            'outtmpl': '%(id)s.%(ext)s',
             'noplaylist': True,
             'cookiefile': "firefox-cookies.txt",
-            # 添加中文字幕兼容性参数
-            'compat_opts': {'no-youtube-unavailable-videos'},
         }
         env_to_cookies_from_env("firefox-cookies.txt")
@@ -152,79 +151,56 @@ async def get_transcript(youtube_url: str):
             info = ydl.extract_info(youtube_url, download=True)
             video_id = info['id']
-            # 获取实际可用的字幕语言列表
-            available_subs = []
-            for sub_type in ['subtitles', 'automatic_captions']:
-                subs = info.get(sub_type, {})
-                available_subs.extend([
-                    f"{lang}.{track[0]['ext']}"
-                    for lang, track in subs.items()
-                ])
-            # 优先顺序：中文 > 英文 > 其他语言
-            priority_langs = ['zh', 'en']
             subtitle_files = sorted(
-                [f for f in os.listdir('.') if f.startswith(video_id)],
-                key=lambda x: (
-                    -max([x.find(lang) for lang in priority_langs]),
-                    len(x)
                 )
             )
-            logger.info(f"Detected subtitle files: {subtitle_files}")
             if subtitle_files:
-                subtitle_file = subtitle_files[0]
-                logger.info(f"Selected subtitle: {subtitle_file}")
-                with open(subtitle_file, 'r', encoding='utf-8') as f:
                     content = f.read()
-                # 统一处理不同字幕格式
-                if subtitle_file.endswith('.json3'):
-                    subs = json.loads(content)
-                    text = ''.join(
-                        e['segs'][0]['utf8']
-                        for e in subs['events']
-                        if e.get('segs')
-                    )
-                elif subtitle_file.endswith(('.vtt', '.srt')):
-                    text = re.sub(
-                        r'\d{2}:\d{2}:\d{2}[\.,]\d{3}.*?\n',
-                        '',
-                        content
-                    )
-                    text = re.sub(r'<.*?>|{.*?}', '', text)
-                else:
-                    text = content  # 原始内容
-                # 中文标点标准化
-                text = text.translate(str.maketrans({
-                    '，': ',',
-                    '。': '.',
-                    '；': ';',
-                    '！': '!',
-                    '？': '?',
-                    '“': '"',
-                    '”': '"',
-                    '‘': "'",
-                    '’': "'"
-                }))
                 return {
-                    "transcript": text,
-                    "detected_language": subtitle_file.split('.')[-2],
-                    "available_languages": available_subs
                 }
-            return {
-                "error": "No subtitles found",
-                "available_languages": available_subs
-            }
     except Exception as e:
-        logger.error(f"Error: {str(e)}", exc_info=True)
-        raise HTTPException(status_code=500, detail=str(e))

             'skip_download': True,
             'writesubtitles': True,
             'writeautomaticsub': True,
+            # 同时支持中英文及变体
+            'subtitleslangs': ['zh*', 'en*'],  # 包含所有中文和英语变体
             'subtitlesformat': 'best',
+            'outtmpl': '%(id)s.%(ext)s',
             'noplaylist': True,
             'cookiefile': "firefox-cookies.txt",
+            'retries': 3,
+            'sleep_interval': 10,  # 增加请求间隔
+            'ignoreerrors': True,
         }
         env_to_cookies_from_env("firefox-cookies.txt")
             info = ydl.extract_info(youtube_url, download=True)
             video_id = info['id']
+            # 语言优先级列表（中文 > 英文 > 其他）
+            priority_order = [
+                'zh', 'zh-Hans', 'zh-Hant', 'zh-CN', 'zh-TW',  # 中文
+                'en', 'en-US', 'en-GB',  # 英文
+            ]
+            # 获取所有字幕文件并按优先级排序
+            subtitle_files = []
+            for f in os.listdir('.'):
+                if f.startswith(video_id):
+                    lang_part = f.split('.')[-2]
+                    # 检查是否包含优先语言代码
+                    for lang in priority_order:
+                        if lang in lang_part:
+                            subtitle_files.append(f)
+                            break
+            # 按优先级排序文件
             subtitle_files = sorted(
+                subtitle_files,
+                key=lambda x: min(
+                    priority_order.index(lang)
+                    for lang in priority_order
+                    if lang in x
                 )
             )
             if subtitle_files:
+                selected_file = subtitle_files[0]
+                with open(selected_file, 'r', encoding='utf-8') as f:
                     content = f.read()
+                # 统一清理逻辑
+                clean_text = re.sub(
+                    r'\d{2}:\d{2}:\d{2}[\.\,]\d{3}.*?(\n|$)|'
+                    r'<.*?>|{.*?}|^WEBVTT$',
+                    '',
+                    content,
+                    flags=re.MULTILINE
+                ).strip()
                 return {
+                    "transcript": clean_text,
+                    "detected_lang": selected_file.split('.')[-2]
                 }
+            return {"error": "No subtitles found"}
     except Exception as e:
+        return {"error": str(e)}