from transformers import pipeline
import torch
import gradio as gr
import librosa
import numpy as np

# German ASR model (wav2vec2 XLSR-53 fine-tuned, with a language model).
p = pipeline(
    "automatic-speech-recognition",
    model="flozi00/wav2vec2-large-xlsr-53-german-with-lm",
)

# Silero VAD, used to decide whether an incoming chunk contains speech.
model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
(get_speech_timestamps, _, read_audio, *_) = utils


def is_speech(wav, sr):
    """Return True if Silero VAD finds at least one speech segment in `wav`."""
    # Silero expects a 1-D float tensor, so convert the NumPy array explicitly.
    speech_timestamps = get_speech_timestamps(
        torch.from_numpy(wav), model, sampling_rate=sr
    )
    return len(speech_timestamps) > 0


def transcribe(audio, state=None):
    # Avoid a mutable default argument: initialize per-session state here.
    if state is None:
        state = {"text": "", "temp_text": "", "audio": None}
    # Load at 16 kHz, the sampling rate both the VAD and the ASR model expect.
    wav_data, sr = librosa.load(audio, sr=16000)
    if is_speech(wav_data, sr):
        # While the speaker keeps talking, accumulate audio and re-transcribe
        # the whole buffer so the provisional text stays consistent.
        if state["audio"] is None:
            state["audio"] = wav_data
        else:
            state["audio"] = np.concatenate((state["audio"], wav_data))
        state["temp_text"] = p(state["audio"])["text"] + "\n"
    else:
        # Silence: commit the provisional text and reset the audio buffer.
        state["text"] += state["temp_text"]
        state["temp_text"] = ""
        state["audio"] = None
    # Show finalized text, with the still-provisional part in parentheses.
    return f'{state["text"]} ( {state["temp_text"]} )', state


gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath"),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    live=True,
).launch()
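
# NOTE: the `gr.inputs.*` / "state" shortcut wiring above targets the legacy
# Gradio 2.x/3.x API, which was removed in Gradio 4. On a current Gradio
# install the equivalent interface would look roughly like the sketch below
# (an untested sketch: verify component arguments against your installed
# version before swapping it in):
#
# gr.Interface(
#     fn=transcribe,
#     inputs=[
#         gr.Audio(sources=["microphone"], type="filepath", streaming=True),
#         gr.State(),
#     ],
#     outputs=[gr.Textbox(), gr.State()],
#     live=True,
# ).launch()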