Unified the code style
api.py
CHANGED
@@ -1,8 +1,11 @@
 # coding=utf-8
 
 from io import BytesIO
-from typing import Optional
+from typing import Optional, Dict, Any, List, Set, Union, Tuple
+import os
+import time
 
+# Third-party imports
 from fastapi import FastAPI, File, UploadFile, HTTPException, Depends
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
@@ -20,7 +23,7 @@ import gradio as gr
 load_dotenv()
 
 # Get the API token
-API_TOKEN = os.getenv("API_TOKEN")
+API_TOKEN: str = os.getenv("API_TOKEN")
 if not API_TOKEN:
     raise RuntimeError("API_TOKEN environment variable is not set")
 
@@ -52,7 +55,7 @@ model = AutoModel(
 )
 
 # Reuse the original formatting functions
-emo_dict = {
+emotion_dict: Dict[str, str] = {
     "<|HAPPY|>": "😊",
     "<|SAD|>": "😔",
     "<|ANGRY|>": "😡",
@@ -62,7 +65,7 @@ emo_dict = {
     "<|SURPRISED|>": "😮",
 }
 
-event_dict = {
+event_dict: Dict[str, str] = {
     "<|BGM|>": "🎼",
     "<|Speech|>": "",
     "<|Applause|>": "👏",
@@ -73,7 +76,7 @@ event_dict = {
     "<|Cough|>": "🤧",
 }
 
-emoji_dict = {
+emoji_dict: Dict[str, str] = {
     "<|nospeech|><|Event_UNK|>": "❓",
     "<|zh|>": "",
     "<|en|>": "",
@@ -105,7 +108,7 @@ emoji_dict = {
     "<|Event_UNK|>": "",
 }
 
-lang_dict = {
+lang_dict: Dict[str, str] = {
     "<|zh|>": "<|lang|>",
     "<|en|>": "<|lang|>",
     "<|yue|>": "<|lang|>",
@@ -114,82 +117,105 @@ lang_dict = {
     "<|nospeech|>": "<|lang|>",
 }
 
+emo_set: Set[str] = {"😊", "😔", "😡", "😰", "🤢", "😮"}
+event_set: Set[str] = {"🎼", "👏", "😀", "😭", "🤧", "😷"}
 
 
+def format_text_basic(text: str) -> str:
+    """Replace special tokens with corresponding emojis"""
+    for token in emoji_dict:
+        text = text.replace(token, emoji_dict[token])
+    return text
 
 
+def format_text_with_emotion(text: str) -> str:
+    """Format text with emotion and event markers"""
+    token_count: Dict[str, int] = {}
+    original_text = text
+    for token in emoji_dict:
+        token_count[token] = text.count(token)
+
+    # Determine dominant emotion
+    dominant_emotion = "<|NEUTRAL|>"
+    for emotion in emotion_dict:
+        if token_count[emotion] > token_count[dominant_emotion]:
+            dominant_emotion = emotion
+
+    # Add event markers
+    text = original_text
+    for event in event_dict:
+        if token_count[event] > 0:
+            text = event_dict[event] + text
+
+    # Replace all tokens with their emoji equivalents
+    for token in emoji_dict:
+        text = text.replace(token, emoji_dict[token])
+
+    # Add dominant emotion
+    text = text + emotion_dict[dominant_emotion]
 
+    # Clean up emoji spacing
     for emoji in emo_set.union(event_set):
+        text = text.replace(" " + emoji, emoji)
+        text = text.replace(emoji + " ", emoji)
+    return text.strip()
 
 
+def format_text_advanced(text: str) -> str:
+    """Advanced text formatting with multilingual and complex token handling"""
+    def get_emotion(text: str) -> Optional[str]:
+        return text[-1] if text[-1] in emo_set else None
 
+    def get_event(text: str) -> Optional[str]:
+        return text[0] if text[0] in event_set else None
 
+    # Handle special cases
+    text = text.replace("<|nospeech|><|Event_UNK|>", "❓")
     for lang in lang_dict:
+        text = text.replace(lang, "<|lang|>")
+
+    # Process text segments
+    text_segments: List[str] = [format_text_with_emotion(segment).strip() for segment in text.split("<|lang|>")]
+    formatted_text = " " + text_segments[0]
+    current_event = get_event(formatted_text)
+
+    # Merge segments
+    for i in range(1, len(text_segments)):
+        if not text_segments[i]:
             continue
+
+        if get_event(text_segments[i]) == current_event and get_event(text_segments[i]) is not None:
+            text_segments[i] = text_segments[i][1:]
+        current_event = get_event(text_segments[i])
+
+        if get_emotion(text_segments[i]) is not None and get_emotion(text_segments[i]) == get_emotion(formatted_text):
+            formatted_text = formatted_text[:-1]
+        formatted_text += text_segments[i].strip()
+
+    formatted_text = formatted_text.replace("The.", " ")
+    return formatted_text.strip()
 
 
 async def process_audio(audio_data: bytes, language: str = "auto") -> str:
+    """Process audio data and return transcription result"""
     try:
+        # Convert bytes to numpy array
         audio_buffer = BytesIO(audio_data)
         waveform, sample_rate = torchaudio.load(audio_buffer)
 
+        # Convert to mono channel
         if waveform.shape[0] > 1:
             waveform = waveform.mean(dim=0)
 
+        # Convert to numpy array and normalize
         input_wav = waveform.numpy().astype(np.float32)
 
+        # Resample to 16kHz if needed
        if sample_rate != 16000:
             resampler = torchaudio.transforms.Resample(sample_rate, 16000)
             input_wav = resampler(torch.from_numpy(input_wav)[None, :])[0, :].numpy()
 
+        # Model inference
         text = model.generate(
             input=input_wav,
             cache={},
@@ -199,18 +225,18 @@ async def process_audio(audio_data: bytes, language: str = "auto") -> str:
             merge_vad=True
         )
 
+        # Format result
         result = text[0]["text"]
+        result = format_text_advanced(result)
 
         return result
 
     except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Audio processing failed: {str(e)}")
 
 
+async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> HTTPAuthorizationCredentials:
+    """Verify Bearer Token authentication"""
     if credentials.credentials != API_TOKEN:
         raise HTTPException(
             status_code=401,
@@ -225,49 +251,53 @@ async def transcribe_audio(
     model: Optional[str] = "FunAudioLLM/SenseVoiceSmall",
     language: Optional[str] = "auto",
     token: HTTPAuthorizationCredentials = Depends(verify_token)
+) -> Dict[str, Union[str, int, float]]:
+    """Audio transcription endpoint
 
     Args:
+        file: Audio file (supports common audio formats)
+        model: Model name, currently only supports FunAudioLLM/SenseVoiceSmall
+        language: Language code, supports auto/zh/en/yue/ja/ko/nospeech
 
     Returns:
+        Dict[str, Union[str, int, float]]: {
+            "text": "Transcription result",
             "error_code": 0,
             "error_msg": "",
+            "process_time": 1.234  # Processing time in seconds
         }
     """
     start_time = time.time()
 
     try:
+        # Validate file format
         if not file.filename.lower().endswith((".mp3", ".wav", ".flac", ".ogg", ".m4a")):
             return {
                 "text": "",
                 "error_code": 400,
+                "error_msg": "Unsupported audio format",
                 "process_time": time.time() - start_time
             }
 
+        # Validate model
         if model != "FunAudioLLM/SenseVoiceSmall":
             return {
                 "text": "",
                 "error_code": 400,
+                "error_msg": "Unsupported model",
                 "process_time": time.time() - start_time
             }
 
+        # Validate language
         if language not in ["auto", "zh", "en", "yue", "ja", "ko", "nospeech"]:
             return {
                 "text": "",
                 "error_code": 400,
+                "error_msg": "Unsupported language",
                 "process_time": time.time() - start_time
             }
 
+        # Process audio
         content = await file.read()
         text = await process_audio(content, language)
 
@@ -287,33 +317,29 @@ async def transcribe_audio(
         }
 
 
+def transcribe_audio_gradio(audio: Optional[Tuple[int, np.ndarray]], language: str = "auto") -> str:
+    """Gradio interface for audio transcription"""
     try:
         if audio is None:
+            return "Please upload an audio file"
 
+        # Extract audio data
+        sample_rate, input_wav = audio
+
+        # Normalize audio
         input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
 
+        # Convert to mono
         if len(input_wav.shape) > 1:
             input_wav = input_wav.mean(-1)
 
+        # Resample to 16kHz if needed
+        if sample_rate != 16000:
+            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+            input_wav_tensor = torch.from_numpy(input_wav).to(torch.float32)
+            input_wav = resampler(input_wav_tensor[None, :])[0, :].numpy()
 
+        # Model inference
         text = model.generate(
             input=input_wav,
             cache={},
@@ -323,48 +349,53 @@ def transcribe_audio_gradio(audio, language="auto"):
             merge_vad=True
         )
 
+        # Format result
         result = text[0]["text"]
+        result = format_text_advanced(result)
 
         return result
     except Exception as e:
+        return f"Processing failed: {str(e)}"
 
+# Create Gradio interface with localized labels
 demo = gr.Interface(
     fn=transcribe_audio_gradio,
     inputs=[
+        gr.Audio(
+            sources=["upload", "microphone"],
+            type="numpy",
+            label="Upload audio or record from microphone"
+        ),
         gr.Dropdown(
             choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
             value="auto",
+            label="Select Language"
        )
     ],
+    outputs=gr.Textbox(label="Recognition Result"),
+    title="SenseVoice Speech Recognition",
+    description="Multi-language speech transcription service supporting Chinese, English, Cantonese, Japanese, and Korean",
     examples=[
         ["examples/zh.mp3", "zh"],
         ["examples/en.mp3", "en"],
     ]
 )
 
+# Mount Gradio app to FastAPI
 app = gr.mount_gradio_app(app, demo, path="/")
 
+# Custom Swagger UI redirect
 @app.get("/docs", include_in_schema=False)
 async def custom_swagger_ui_html():
     return HTMLResponse("""
     <!DOCTYPE html>
     <html>
     <head>
+        <title>SenseVoice API Documentation</title>
         <meta http-equiv="refresh" content="0;url=/docs/" />
     </head>
     <body>
+        <p>Redirecting to API documentation...</p>
     </body>
     </html>
     """)
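
For reference, here is a hedged client call against the transcription endpoint documented in the docstring above. The route path, host, and port are assumptions, since the @app.post decorator for transcribe_audio is outside the hunks shown; model and language are sent as query parameters here, though the server may expect form fields instead; and the token must equal the server's API_TOKEN. examples/zh.mp3 is the sample file already referenced by the Gradio examples.

import requests

# Assumed values: adjust the URL to whatever route the app actually registers
# and to wherever it is served (e.g. uvicorn api:app --host 0.0.0.0 --port 7860).
API_URL = "http://localhost:7860/v1/audio/transcriptions"
API_TOKEN = "your-api-token"  # must match the API_TOKEN environment variable on the server

with open("examples/zh.mp3", "rb") as f:
    resp = requests.post(
        API_URL,
        headers={"Authorization": f"Bearer {API_TOKEN}"},
        params={"model": "FunAudioLLM/SenseVoiceSmall", "language": "zh"},
        files={"file": ("zh.mp3", f, "audio/mpeg")},
    )

# Expected shape per the docstring:
# {"text": ..., "error_code": 0, "error_msg": "", "process_time": ...}
print(resp.json())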