Chenhao committed
Commit 660c142
Parent(s): 5899d37

Format the code with claude 3.5

Files changed:
- .gitignore +5 -0
- api.py +143 -111
- start.sh +3 -1
- test/01_rpc_test.py +67 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+
+.venv/
+.vscode/
+
+*.pyc
api.py CHANGED
@@ -33,7 +33,7 @@ security = HTTPBearer()
 
 app = FastAPI(
     title="SenseVoice API",
-    description="
+    description="Speech To Text API Service",
     version="1.0.0"
 )
 
@@ -55,78 +55,77 @@ model = AutoModel(
     device="cuda"
 )
 
-# Reuse the original formatting functions
 emotion_dict: Dict[str, str] = {
-    "<|HAPPY|>":
-    "<|SAD|>":
-    "<|ANGRY|>":
-    "<|NEUTRAL|>":
-    "<|FEARFUL|>":
-    "<|DISGUSTED|>":
-    "<|SURPRISED|>":
+    "<|HAPPY|>": "😊",
+    "<|SAD|>": "😔",
+    "<|ANGRY|>": "😡",
+    "<|NEUTRAL|>": "",
+    "<|FEARFUL|>": "😰",
+    "<|DISGUSTED|>": "🤢",
+    "<|SURPRISED|>": "😮",
 }
 
 event_dict: Dict[str, str] = {
-    "<|BGM|>":
-    "<|Speech|>":
-    "<|Applause|>":
-    "<|Laughter|>":
-    "<|Cry|>":
-    "<|Sneeze|>":
-    "<|Breath|>":
-    "<|Cough|>":
+    "<|BGM|>": "🎼",
+    "<|Speech|>": "",
+    "<|Applause|>": "👏",
+    "<|Laughter|>": "😀",
+    "<|Cry|>": "😭",
+    "<|Sneeze|>": "🤧",
+    "<|Breath|>": "",
+    "<|Cough|>": "🤧",
 }
 
 emoji_dict: Dict[str, str] = {
     "<|nospeech|><|Event_UNK|>": "❓",
-    "<|zh|>":
-    "<|en|>":
-    "<|yue|>":
-    "<|ja|>":
-    "<|ko|>":
-    "<|nospeech|>":
-    "<|HAPPY|>":
-    "<|SAD|>":
-    "<|ANGRY|>":
-    "<|NEUTRAL|>":
-    "<|BGM|>":
-    "<|Speech|>":
-    "<|Applause|>":
-    "<|Laughter|>":
-    "<|FEARFUL|>":
-    "<|DISGUSTED|>":
-    "<|SURPRISED|>":
-    "<|Cry|>":
-    "<|EMO_UNKNOWN|>":
-    "<|Sneeze|>":
-    "<|Breath|>":
-    "<|Cough|>":
-    "<|Sing|>":
+    "<|zh|>": "",
+    "<|en|>": "",
+    "<|yue|>": "",
+    "<|ja|>": "",
+    "<|ko|>": "",
+    "<|nospeech|>": "",
+    "<|HAPPY|>": "😊",
+    "<|SAD|>": "😔",
+    "<|ANGRY|>": "😡",
+    "<|NEUTRAL|>": "",
+    "<|BGM|>": "🎼",
+    "<|Speech|>": "",
+    "<|Applause|>": "👏",
+    "<|Laughter|>": "😀",
+    "<|FEARFUL|>": "😰",
+    "<|DISGUSTED|>": "🤢",
+    "<|SURPRISED|>": "😮",
+    "<|Cry|>": "😭",
+    "<|EMO_UNKNOWN|>": "",
+    "<|Sneeze|>": "🤧",
+    "<|Breath|>": "",
+    "<|Cough|>": "😷",
+    "<|Sing|>": "",
     "<|Speech_Noise|>": "",
-    "<|withitn|>":
-    "<|woitn|>":
-    "<|GBG|>":
-    "<|Event_UNK|>":
+    "<|withitn|>": "",
+    "<|woitn|>": "",
+    "<|GBG|>": "",
+    "<|Event_UNK|>": "",
 }
 
 lang_dict: Dict[str, str] = {
-    "<|zh|>":
-    "<|en|>":
-    "<|yue|>":
-    "<|ja|>":
-    "<|ko|>":
-    "<|nospeech|>":
+    "<|zh|>": "<|lang|>",
+    "<|en|>": "<|lang|>",
+    "<|yue|>": "<|lang|>",
+    "<|ja|>": "<|lang|>",
+    "<|ko|>": "<|lang|>",
+    "<|nospeech|>": "<|lang|>",
 }
 
 emo_set: Set[str] = {"😊", "😔", "😡", "😰", "🤢", "😮"}
 event_set: Set[str] = {"🎼", "👏", "😀", "😭", "🤧", "😷"}
 
 
-def format_text_basic(text: str) -> str:
-    """Replace special tokens with corresponding emojis"""
-    for token in emoji_dict:
-        text = text.replace(token, emoji_dict[token])
-    return text
+# def format_text_basic(text: str) -> str:
+#     """Replace special tokens with corresponding emojis"""
+#     for token in emoji_dict:
+#         text = text.replace(token, emoji_dict[token])
+#     return text
 
 
 def format_text_with_emotion(text: str) -> str:
@@ -198,53 +197,90 @@ def format_text_advanced(text: str) -> str:
 
 
 async def audio_stt(audio: torch.Tensor, sample_rate: int, language: str = "auto") -> str:
-    """
+    """Process audio tensor and perform speech-to-text conversion.
+
+    Args:
+        audio: Input audio tensor
+        sample_rate: Audio sample rate in Hz
+        language: Target language code (auto/zh/en/yue/ja/ko/nospeech)
+
+    Returns:
+        str: Transcribed and formatted text result
     """
-    … (24 removed lines, truncated in the source view)
+    try:
+        # Normalize the sample type
+        if audio.dtype != torch.float32:
+            if audio.dtype == torch.int16:
+                audio = audio.float() / torch.iinfo(torch.int16).max
+            elif audio.dtype == torch.int32:
+                audio = audio.float() / torch.iinfo(torch.int32).max
+            else:
+                audio = audio.float()
+
+        # Make sure the audio is in the correct range
+        if audio.abs().max() > 1.0:
+            audio = audio / audio.abs().max()
+
+        # Convert to mono channel
+        if len(audio.shape) > 1:
+            audio = audio.mean(dim=0)
+        audio = audio.squeeze()
+
+        # Resample
+        if sample_rate != 16000:
+            resampler = torchaudio.transforms.Resample(
+                orig_freq=sample_rate,
+                new_freq=16000
+            )
+            audio = resampler(audio.unsqueeze(0)).squeeze(0)
+
+        text = model.generate(
+            input=audio,
+            cache={},
+            language=language,
+            use_itn=True,
+            batch_size_s=500,
+            merge_vad=True
+        )
+
+        # Format the result
+        result = text[0]["text"]
+        return format_text_advanced(result)
+
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Audio processing failed in audio_stt: {str(e)}"
+        )
 
 async def process_audio(audio_data: bytes, language: str = "auto") -> str:
-    """Process audio data and return transcription result
+    """Process audio data and return transcription result.
+
+    Args:
+        audio_data: Raw audio data in bytes
+        language: Target language code
+
+    Returns:
+        str: Transcribed and formatted text
+
+    Raises:
+        HTTPException: If audio processing fails
+    """
     try:
-        # Convert bytes to numpy array
         audio_buffer = BytesIO(audio_data)
         waveform, sample_rate = torchaudio.load(
-            uri
-            normalize
-            channels_first
+            uri=audio_buffer,
+            normalize=True,
+            channels_first=True
        )
-
         result = await audio_stt(waveform, sample_rate, language)
-
         return result
 
     except Exception as e:
-        … (4 removed lines, truncated in the source view)
+        raise HTTPException(
+            status_code=500,
+            detail=f"Audio processing failed: {str(e)}"
+        )
 
 
 async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> HTTPAuthorizationCredentials:
@@ -260,56 +296,52 @@ async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(secur
 @app.post("/v1/audio/transcriptions")
 async def transcribe_audio(
     file: UploadFile = File(...),
-    model:
-    language:
+    model: str = "FunAudioLLM/SenseVoiceSmall",
+    language: str = "auto",
     token: HTTPAuthorizationCredentials = Depends(verify_token)
 ) -> Dict[str, Union[str, int, float]]:
-    """Audio transcription endpoint
+    """Audio transcription endpoint.
 
     Args:
-        file: Audio file (supports
-        model: Model name
-        language: Language code
+        file: Audio file (supports mp3, wav, flac, ogg, m4a)
+        model: Model name
+        language: Language code
+        token: Authentication token
 
     Returns:
-        Dict
-            "text": "Transcription result",
-            "error_code": 0,
-            "error_msg": "",
-            "process_time": 1.234 # Processing time in seconds
-        }
+        Dict containing transcription result and metadata
     """
     start_time = time.time()
 
     try:
-        #
+        # Check the file format
         if not file.filename.lower().endswith((".mp3", ".wav", ".flac", ".ogg", ".m4a")):
             return {
                 "text": "",
                 "error_code": 400,
-                "error_msg": "
+                "error_msg": "Unsupported audio format",
                 "process_time": time.time() - start_time
             }
 
-        #
+        # Check the model
         if model != "FunAudioLLM/SenseVoiceSmall":
             return {
                 "text": "",
                 "error_code": 400,
-                "error_msg": "
+                "error_msg": "Unsupported model",
                 "process_time": time.time() - start_time
             }
 
-        #
+        # Check the language
         if language not in ["auto", "zh", "en", "yue", "ja", "ko", "nospeech"]:
             return {
                 "text": "",
                 "error_code": 400,
-                "error_msg": "
+                "error_msg": "Unsupported language",
                 "process_time": time.time() - start_time
             }
 
-        #
+        # STT
         content = await file.read()
         text = await process_audio(content, language)
 
@@ -341,8 +373,8 @@ def transcribe_audio_gradio(audio: Optional[Tuple[int, np.ndarray]], language: s
     # Normalize audio
     input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
 
+    # Model Inference
     input_wav = torch.from_numpy(input_wav)
-
     result = asyncio.run(audio_stt(input_wav, sample_rate, language))
 
     return result
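The new docstring compresses the return shape to "Dict containing transcription result and metadata"; the removed docstring spelled it out, and the validation branches above still use the same keys. For reference, a sketch of the payload, with illustrative values:

# JSON payload returned by /v1/audio/transcriptions, per the removed
# docstring and the error branches in the diff above; values illustrative.
response = {
    "text": "Transcription result",  # empty string on error
    "error_code": 0,                 # 400 for validation failures
    "error_msg": "",                 # e.g. "Unsupported audio format"
    "process_time": 1.234,           # seconds, time.time() - start_time
}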
start.sh CHANGED
@@ -1,7 +1,9 @@
 #!/bin/bash
 
+export API_TOKEN=your-secret-token-here
+
 # Keep Alive
 python3 awake.py &
 
 # Start the FastAPI service
-python -m uvicorn api:app --host 0.0.0.0 --port
+python -m uvicorn api:app --host 0.0.0.0 --port 8000
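The API_TOKEN export pairs with the HTTPBearer scheme and the verify_token dependency visible in the api.py hunk headers; verify_token itself is not part of this diff. A minimal sketch of what such a check presumably looks like, assuming the token is read from that environment variable:

import os

from fastapi import Depends, HTTPException
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer

security = HTTPBearer()

async def verify_token(
    credentials: HTTPAuthorizationCredentials = Depends(security),
) -> HTTPAuthorizationCredentials:
    # Hypothetical check: reject requests whose bearer token does not
    # match the API_TOKEN environment variable exported in start.sh.
    if credentials.credentials != os.environ.get("API_TOKEN"):
        raise HTTPException(status_code=401, detail="Invalid or missing token")
    return credentials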
test/01_rpc_test.py ADDED
@@ -0,0 +1,67 @@
+
+import asyncio
+import httpx
+from pathlib import Path
+from typing import Optional
+
+async def transcribe_audio(
+    file_path: str,
+    api_token: str,
+    model: str = "FunAudioLLM/SenseVoiceSmall",
+    api_url: str = "http://127.0.0.1:8000/v1/audio/transcriptions"
+) -> Optional[dict]:
+    """Send a speech recognition request asynchronously.
+
+    Args:
+        file_path: Path to the audio file
+        api_token: API authentication token
+        model: Model name, defaults to FunAudioLLM/SenseVoiceSmall
+        api_url: API service URL
+
+    Returns:
+        dict: Dictionary containing the recognition result, or None on failure
+    """
+    try:
+        # Check whether the file exists
+        audio_file = Path(file_path)
+        if not audio_file.exists():
+            print(f"Error: file {file_path} does not exist")
+            return None
+
+        # Prepare the request headers and files
+        headers = {"Authorization": f"Bearer {api_token}"}
+        files = {
+            "file": (audio_file.name, audio_file.open("rb")),
+            "model": (None, model)
+        }
+
+        # Send the asynchronous request
+        async with httpx.AsyncClient() as client:
+            response = await client.post(
+                api_url,
+                headers=headers,
+                files=files,
+                timeout=60,
+            )
+            print(response.text)
+            response.raise_for_status()
+            return response.json()
+
+    except httpx.HTTPError as e:
+        print(f"HTTP request error: {str(e)}")
+        return None
+    except Exception as e:
+        print(f"An error occurred: {str(e)}")
+        return None
+
+async def main():
+    # Usage example
+    file_path = "../examples/zh.mp3"
+    api_token = "your-secret-token-here"
+
+    result = await transcribe_audio(file_path, api_token)
+    if result:
+        print(f"Recognition result: {result['text']}")
+
+if __name__ == "__main__":
+    asyncio.run(main())