german-asr / app.py
flozi00's picture
Update app.py
36e1b68
raw
history blame
2.63 kB
from transformers import pipeline
import torch
import gradio as gr
import subprocess
import numpy as np
import time
# Speech-to-text pipeline backed by the "aware-ai/wav2vec2-base-german" checkpoint.
p = pipeline(
    "automatic-speech-recognition",
    model="aware-ai/wav2vec2-base-german",
)

# Voice-activity-detection model plus its helper functions, fetched through
# torch.hub from the "snakers4/silero-vad" repository. `onnx=True` is forwarded
# to the hub entry point; `force_reload=False` lets a cached copy be reused.
model, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-vad",
    model="silero_vad",
    force_reload=False,
    onnx=True,
)
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
    """
    Helper function to read an audio file through ffmpeg.

    The payload is piped to an ``ffmpeg`` subprocess on stdin and decoded to
    raw little-endian 32-bit float samples, downmixed to a single channel and
    resampled to ``sampling_rate``.

    Args:
        bpayload: Raw bytes of the audio file to decode.
        sampling_rate: Target sample rate, passed to ffmpeg's ``-ar`` option.

    Returns:
        A 1-D ``float32`` array holding the decoded mono samples.

    Raises:
        ValueError: If the ``ffmpeg`` executable cannot be found, or if ffmpeg
            produced no output for the given payload.
    """
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        "pipe:0",  # read the input file from stdin
        "-ac",
        "1",  # downmix to one channel
        "-ar",
        f"{sampling_rate}",
        "-f",
        "f32le",  # raw float32 little-endian, matches np.float32 below
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",  # write the decoded samples to stdout
    ]

    try:
        with subprocess.Popen(
            ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
        ) as ffmpeg_process:
            out_bytes, _ = ffmpeg_process.communicate(bpayload)
    except FileNotFoundError as error:
        raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error

    audio = np.frombuffer(out_bytes, np.float32)
    # ffmpeg runs with "-loglevel quiet", so an empty stdout is the only
    # visible sign that decoding failed.
    if audio.shape[0] == 0:
        raise ValueError("Malformed soundfile")
    return audio
# `utils` is the helper tuple returned alongside the VAD model by torch.hub.load;
# only the first and third entries are bound to names, the rest are discarded.
get_speech_timestamps, _, read_audio, *_ = utils
def is_speech(wav, sr):
    """Return True when the VAD finds at least one speech segment in ``wav``.

    Args:
        wav: Audio samples handed to ``get_speech_timestamps``.
        sr: Sampling rate of ``wav``, forwarded as ``sampling_rate``.
    """
    segments = get_speech_timestamps(wav, model, sampling_rate=sr)
    segment_count = len(segments)
    return segment_count > 0
def transcribe(audio, state=None):
    """Streaming callback: buffer speech chunks and transcribe them on silence.

    Each call decodes one recorded chunk. While the VAD reports speech, the
    samples are appended to ``state["audio"]``. On the first chunk without
    speech, the buffered audio is run through the ASR pipeline, the result is
    appended to ``state["text"]`` and the buffer is cleared.

    Args:
        audio: Path to the audio file holding the current chunk.
        state: Session state dict with the keys ``"text"`` (transcript so far),
            ``"temp_text"`` and ``"audio"`` (buffered samples, or ``""`` when
            nothing is buffered). ``None`` starts a fresh state.

    Returns:
        A ``(display_text, state)`` tuple.

    Raises:
        ValueError: Propagated from ``ffmpeg_read`` when the chunk cannot be
            decoded.
    """
    # Build a new dict per call; a dict literal as the default argument would
    # be shared (and mutated) across every call that omits `state`.
    if state is None:
        state = {"text": "", "temp_text": "", "audio": ""}

    with open(audio, "rb") as f:
        payload = f.read()

    _sr = 16000
    wav_data = ffmpeg_read(payload, sampling_rate=_sr)

    # The empty string is the "nothing buffered" marker; otherwise the slot
    # holds a numpy array. Test the type rather than using `is ""`, which
    # relies on string interning, and rather than `== ""`, which is not a
    # plain bool when applied to an array.
    has_buffered_audio = not isinstance(state["audio"], str)

    if is_speech(wav_data, _sr):
        if has_buffered_audio:
            state["audio"] = np.concatenate((state["audio"], wav_data))
        else:
            state["audio"] = wav_data
    elif has_buffered_audio:
        # Silence after speech: flush the buffered utterance through the ASR.
        text = p(state["audio"])["text"] + "\n"
        state["text"] += text
        state["temp_text"] = ""
        state["audio"] = ""

    # NOTE(review): assumed to throttle every call, not only the flush branch -
    # confirm against the intended behaviour.
    time.sleep(0.5)
    return f'{state["text"]} ( {state["temp_text"]} )', state
# Live microphone demo: audio chunks are streamed to `transcribe` as file
# paths, and the "state" component carries the session dict between calls.
demo = gr.Interface(
    fn=transcribe,
    inputs=[gr.Audio(source="microphone", type="filepath", streaming=True), "state"],
    outputs=[gr.Textbox(), "state"],
    live=True,
)
# Bind to all interfaces so the server is reachable from outside the host.
demo.launch(server_name="0.0.0.0")