"""Streaming German speech-to-text demo.

Microphone chunks arrive from a Gradio streaming audio input. Each chunk is
decoded with ffmpeg and checked for speech with Silero VAD. Chunks that
contain speech are buffered; the first chunk without speech triggers
transcription of the buffered audio with a wav2vec2 ASR pipeline.
"""
import subprocess
import time

import gradio as gr
import numpy as np
import torch
from transformers import pipeline

# Sample rate used both for ffmpeg decoding and for the VAD call.
SAMPLING_RATE = 16000

p = pipeline("automatic-speech-recognition", model="aware-ai/wav2vec2-base-german")

model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    force_reload=False,
    onnx=True,
)

(get_speech_timestamps, _, read_audio, *_) = utils


def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
    """
    Helper function to read an audio file through ffmpeg.

    Args:
        bpayload: Raw bytes of the encoded audio file, piped to ffmpeg's stdin.
        sampling_rate: Target sample rate passed to ffmpeg via ``-ar``.

    Returns:
        A 1-D float32 array built from ffmpeg's mono ``f32le`` output.

    Raises:
        ValueError: If the ffmpeg executable cannot be found, or if ffmpeg
            produced no output for the given payload.
    """
    ar = f"{sampling_rate}"
    ac = "1"
    format_for_conversion = "f32le"
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        "pipe:0",
        "-ac",
        ac,
        "-ar",
        ar,
        "-f",
        format_for_conversion,
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",
    ]

    try:
        with subprocess.Popen(
            ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
        ) as ffmpeg_process:
            output_stream = ffmpeg_process.communicate(bpayload)
    except FileNotFoundError as error:
        raise ValueError(
            "ffmpeg was not found but is required to load audio files from filename"
        ) from error

    out_bytes = output_stream[0]
    audio = np.frombuffer(out_bytes, np.float32)
    # ffmpeg runs with "-loglevel quiet", so empty output is the only
    # failure signal available here.
    if audio.shape[0] == 0:
        raise ValueError("Malformed soundfile")
    return audio


def is_speech(wav, sr):
    """Return True if the VAD reports at least one speech segment in *wav*."""
    speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sr)
    return len(speech_timestamps) > 0


def _new_state():
    """Build a fresh per-session state dict (never shared between calls)."""
    return {"text": "", "temp_text": "", "audio": ""}


def _has_buffered_audio(buffer):
    """Return True if *buffer* holds samples rather than the "" placeholder."""
    return isinstance(buffer, np.ndarray) and buffer.size > 0


def _render(state):
    """Format the transcript string shown in the output textbox."""
    return f'{state["text"]} ( {state["temp_text"]} )'


def transcribe(audio, state=None):
    """Process one streamed audio chunk and update the running transcript.

    Args:
        audio: Path to the audio file holding the current chunk. ``None``
            is tolerated and leaves the transcript unchanged.
        state: Session dict with keys ``"text"`` (finished transcript),
            ``"temp_text"`` and ``"audio"`` (buffered samples, or ``""``
            when nothing is buffered). ``None`` starts a fresh session.

    Returns:
        A ``(display_text, state)`` tuple.
    """
    if state is None:
        state = _new_state()

    # Guard against a missing chunk: open(None) would raise TypeError.
    if audio is None:
        return _render(state), state

    with open(audio, "rb") as f:
        payload = f.read()
    wav_data = ffmpeg_read(payload, sampling_rate=SAMPLING_RATE)

    if is_speech(wav_data, SAMPLING_RATE):
        # Speech is ongoing: keep accumulating samples for this utterance.
        if _has_buffered_audio(state["audio"]):
            state["audio"] = np.concatenate((state["audio"], wav_data))
        else:
            state["audio"] = wav_data
    elif _has_buffered_audio(state["audio"]):
        # First chunk without speech after an utterance: transcribe the
        # buffered audio, append it to the transcript and reset the buffer.
        state["text"] += p(state["audio"])["text"] + "\n"
        state["temp_text"] = ""
        state["audio"] = ""

    # NOTE(review): the original indentation of this sleep could not be
    # recovered; it is applied on every processed chunk here - confirm.
    time.sleep(0.5)

    return _render(state), state


gr.Interface(
    transcribe,
    [gr.Audio(source="microphone", type="filepath", streaming=True), "state"],
    [gr.Textbox(), "state"],
    live=True,
).launch(server_name="0.0.0.0")