import gradio as gr import time from pathlib import Path import torchaudio from stepaudio import StepAudio from funasr import AutoModel from funasr.utils.postprocess_utils import rich_transcription_postprocess CACHE_DIR = "/tmp/gradio/" system_promtp = {"role": "system", "content": "适配用户的语言,用简短口语化的文字回答"} class CustomAsr: def __init__(self, model_name="iic/SenseVoiceSmall", device="cuda"): self.model = AutoModel( model=model_name, vad_model="fsmn-vad", vad_kwargs={"max_single_segment_time": 30000}, device=device, ) def run(self, audio_path): res = self.model.generate( input=audio_path, cache={}, language="auto", # "zh", "en", "yue", "ja", "ko", "nospeech" use_itn=True, batch_size_s=60, merge_vad=True, # merge_length_s=15, ) text = rich_transcription_postprocess(res[0]["text"]) return text def add_message(chatbot, history, mic, text): if not mic and not text: return chatbot, history, "Input is empty" if text: chatbot.append({"role": "user", "content": text}) history.append({"role": "user", "content": text}) elif mic and Path(mic).exists(): chatbot.append({"role": "user", "content": {"path": mic}}) history.append({"role": "user", "content": {"type":"audio", "audio": mic}}) print(f"{history=}") return chatbot, history, None def reset_state(): """Reset the chat history.""" return [], [system_promtp] def save_tmp_audio(audio, sr): import tempfile with tempfile.NamedTemporaryFile( dir=CACHE_DIR, delete=False, suffix=".wav" ) as temp_audio: temp_audio_path = temp_audio.name torchaudio.save(temp_audio_path, audio, sr) return temp_audio.name def predict(chatbot, history, audio_model, asr_model): """Generate a response from the model.""" try: is_input_audio = False user_audio_path = None # 检测用户输入的是音频还是文本 if isinstance(history[-1]["content"], dict): is_input_audio = True user_audio_path = history[-1]["content"]["audio"] text, audio, sr = audio_model(history, "闫雨婷") print(f"predict {text=}") audio_path = save_tmp_audio(audio, sr) # 缓存用户语音的 asr 文本结果为了加速下一次推理 if is_input_audio: asr_text = asr_model.run(user_audio_path) chatbot.append({"role": "user", "content": asr_text}) history[-1]["content"] = asr_text print(f"{asr_text=}") chatbot.append({"role": "assistant", "content": {"path": audio_path}}) chatbot.append({"role": "assistant", "content": text}) history.append({"role": "assistant", "content": text}) except Exception as e: print(e) gr.Warning(f"Some error happend, retry submit") return chatbot, history def _launch_demo(args, audio_model, asr_model): with gr.Blocks(delete_cache=(86400, 86400)) as demo: gr.Markdown("""