from transformers import pipeline
import torch
import gradio as gr
import librosa
import numpy as np

# German ASR model (wav2vec2) served through the transformers pipeline.
p = pipeline("automatic-speech-recognition", model="aware-ai/wav2vec2-base-german")

# Silero VAD: used to decide whether an incoming chunk contains speech.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
(get_speech_timestamps, _, read_audio, *_) = utils


def is_speech(wav, sr):
    """Return True if the VAD finds at least one speech segment in the chunk."""
    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sr)
    return len(speech_timestamps) > 0


def transcribe(audio, state=None):
    # Avoid a mutable default argument: build fresh per-session state instead.
    if state is None:
        state = {"text": "", "temp_text": "", "audio": None}

    # Gradio streams each microphone chunk as a file; resample to the 16 kHz
    # rate that both wav2vec2 and Silero VAD expect.
    wav_data, sr = librosa.load(audio, sr=16000)

    if is_speech(wav_data, sr):
        # Speech continues: append the chunk to the buffer and re-transcribe
        # the whole utterance so far as a provisional transcript.
        if state["audio"] is None:
            state["audio"] = wav_data
        else:
            state["audio"] = np.concatenate((state["audio"], wav_data))
        state["temp_text"] = p(state["audio"])["text"] + "\n"
    else:
        # Silence: commit the provisional transcript and reset the audio buffer.
        state["text"] += state["temp_text"]
        state["temp_text"] = ""
        state["audio"] = None

    # Show committed text, with the still-changing provisional part in parentheses.
    return f'{state["text"]} ( {state["temp_text"]} )', state


# Gradio 3.x-style streaming interface: the microphone sends chunks
# continuously, and "state" carries the transcript buffer between calls.
gr.Interface(
    transcribe,
    [gr.Audio(source="microphone", type="filepath", streaming=True), "state"],
    [gr.Textbox(), "state"],
    live=True,
).launch(server_name="0.0.0.0")