Spaces:

Chrunos
/

mmmm

Running

App Files Community

Chrunos committed on Mar 22

Commit

942ffc3

verified ·

1 Parent(s): 26dfd2b

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -29

app.py CHANGED Viewed

@@ -134,54 +134,93 @@ async def get_transcript(youtube_url: str):
             'skip_download': True,
             'writesubtitles': True,
             'writeautomaticsub': True,
-            'subtitleslangs': ['en'],
             'subtitlesformat': 'best',
-            'outtmpl': '%(id)s.%(ext)s',
             'noplaylist': True,
-            'cookiefile': "firefox-cookies.txt"
         }
         env_to_cookies_from_env("firefox-cookies.txt")
-        # Show current directory structure before download
-        logger.info(f"Current directory files (before): {os.listdir('.')}")
         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
             info = ydl.extract_info(youtube_url, download=True)
             video_id = info['id']
-            logger.info(f"Video ID: {video_id}")
-            # Check actual downloaded files
-            logger.info(f"Current directory files (after extraction): {os.listdir('.')}")
-            # Search for subtitle files pattern
-            subtitle_files = [f for f in os.listdir('.')
-                             if f.startswith(video_id) and ('en' in f)]
-            logger.info(f"Potential subtitle files: {subtitle_files}")
             if subtitle_files:
-                # Process the first found subtitle file
                 subtitle_file = subtitle_files[0]
-                logger.info(f"Processing subtitle file: {subtitle_file}")
                 with open(subtitle_file, 'r', encoding='utf-8') as f:
                     content = f.read()
-                # Add format-specific parsing
                 if subtitle_file.endswith('.json3'):
-                    import json
                     subs = json.loads(content)
-                    text = ' '.join([e['segs'][0]['utf8'] for e in subs['events'] if e.get('segs')])
-                elif subtitle_file.endswith('.vtt'):
-                    text = ' '.join(line.strip() for line in content.split('\n')
-                           if not line.startswith('WEBVTT')
-                           and '-->' not in line
-                           and not line.strip().isdigit())
                 else:
-                    text = f"Unsupported format: {subtitle_file}"
-                return {"transcript": text}
-            return {"transcript": f"No subtitle files found for {video_id}"}
     except Exception as e:
         logger.error(f"Error: {str(e)}", exc_info=True)

             'skip_download': True,
             'writesubtitles': True,
             'writeautomaticsub': True,
+            # 扩展支持的语言列表（包含中英文）
+            'subtitleslangs': ['en', 'zh', 'zh-Hans', 'zh-Hant', 'zh-CN', 'zh-TW', 'a.*'],
+            # 自动选择最佳字幕格式
             'subtitlesformat': 'best',
+            # 包含原始语言代码在文件名中
+            'outtmpl': '%(id)s.%(ext)s',
             'noplaylist': True,
+            'cookiefile': "firefox-cookies.txt",
+            # 添加中文字幕兼容性参数
+            'compat_opts': {'no-youtube-unavailable-videos'},
         }
         env_to_cookies_from_env("firefox-cookies.txt")
         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
             info = ydl.extract_info(youtube_url, download=True)
             video_id = info['id']
+            # 获取实际可用的字幕语言列表
+            available_subs = []
+            for sub_type in ['subtitles', 'automatic_captions']:
+                subs = info.get(sub_type, {})
+                available_subs.extend([
+                    f"{lang}.{track[0]['ext']}"
+                    for lang, track in subs.items()
+                ])
+            # 优先顺序：中文 > 英文 > 其他语言
+            priority_langs = ['zh', 'en']
+            subtitle_files = sorted(
+                [f for f in os.listdir('.') if f.startswith(video_id)],
+                key=lambda x: (
+                    -max([x.find(lang) for lang in priority_langs]),
+                    len(x)
+                )
+            )
+            logger.info(f"Detected subtitle files: {subtitle_files}")
             if subtitle_files:
                 subtitle_file = subtitle_files[0]
+                logger.info(f"Selected subtitle: {subtitle_file}")
                 with open(subtitle_file, 'r', encoding='utf-8') as f:
                     content = f.read()
+                # 统一处理不同字幕格式
                 if subtitle_file.endswith('.json3'):
                     subs = json.loads(content)
+                    text = ''.join(
+                        e['segs'][0]['utf8']
+                        for e in subs['events']
+                        if e.get('segs')
+                    )
+                elif subtitle_file.endswith(('.vtt', '.srt')):
+                    text = re.sub(
+                        r'\d{2}:\d{2}:\d{2}[\.,]\d{3}.*?\n',
+                        '',
+                        content
+                    )
+                    text = re.sub(r'<.*?>|{.*?}', '', text)
                 else:
+                    text = content  # 原始内容
+                # 中文标点标准化
+                text = text.translate(str.maketrans({
+                    '，': ',',
+                    '。': '.',
+                    '；': ';',
+                    '！': '!',
+                    '？': '?',
+                    '“': '"',
+                    '”': '"',
+                    '‘': "'",
+                    '’': "'"
+                }))
+                return {
+                    "transcript": text,
+                    "detected_language": subtitle_file.split('.')[-2],
+                    "available_languages": available_subs
+                }
+            return {
+                "error": "No subtitles found",
+                "available_languages": available_subs
+            }
     except Exception as e:
         logger.error(f"Error: {str(e)}", exc_info=True)