# NOTE: the lines below appear to be residue from the Hugging Face Spaces file viewer, not program text.
# Spaces:
# Runtime error
# Runtime error
# File size: 2,635 Bytes
# 25fcb65 2302e12 412c852 2302e12 922cd73 2302e12 793e132 2302e12 7770adb 2302e12 99ac344 2302e12 7226ea6 2302e12 |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
from transformers import pipeline
import torch
import gradio as gr
import subprocess
import numpy as np
import time
# Speech-recognition pipeline (German wav2vec2 checkpoint) used to turn
# buffered audio into text.
p = pipeline(
    "automatic-speech-recognition",
    model="aware-ai/wav2vec2-base-german",
)

# Silero voice-activity-detection model (ONNX variant) together with its
# helper utilities, loaded through torch.hub.
model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    force_reload=False,
    onnx=True,
)
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
    """
    Decode an in-memory audio file to mono float32 samples through ffmpeg.

    The payload is piped to ffmpeg on stdin and the raw little-endian float32
    stream (``f32le``) is read back from stdout.

    Args:
        bpayload: Bytes of the encoded audio file.
        sampling_rate: Target sampling rate, passed to ffmpeg's ``-ar`` option.

    Returns:
        One-dimensional ``np.float32`` array of decoded samples. The array is
        a read-only view over ffmpeg's output bytes (``np.frombuffer``).

    Raises:
        ValueError: If the ``ffmpeg`` executable cannot be found, or if ffmpeg
            produced no samples for the payload.
    """
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        "pipe:0",
        "-ac",
        "1",  # downmix to a single channel
        "-ar",
        f"{sampling_rate}",
        "-f",
        "f32le",
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",
    ]

    try:
        # ffmpeg's exit status is deliberately not checked: a failed decode
        # shows up as empty output and is reported below.
        completed = subprocess.run(
            ffmpeg_command,
            input=bpayload,
            stdout=subprocess.PIPE,
            check=False,
        )
    except FileNotFoundError as error:
        raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error

    audio = np.frombuffer(completed.stdout, np.float32)
    if audio.shape[0] == 0:
        raise ValueError("Malformed soundfile")
    return audio
# Bind the helpers taken from ``utils`` by position (first and third entries);
# every other entry is thrown away into ``_``.
get_speech_timestamps, _, read_audio, *_ = utils
def is_speech(wav, sr):
    """
    Tell whether the voice-activity detector finds any speech in ``wav``.

    Args:
        wav: Audio samples handed to ``get_speech_timestamps``.
        sr: Sampling rate of ``wav``, forwarded as ``sampling_rate``.

    Returns:
        ``True`` when at least one speech segment is reported, else ``False``.
    """
    segments = get_speech_timestamps(wav, model, sampling_rate=sr)
    # A non-empty result means at least one speech segment was detected.
    return bool(segments)
def transcribe(audio, state=None):
    """
    Streaming callback: buffer chunks that contain speech and transcribe the
    buffer once a chunk without speech arrives.

    Args:
        audio: Path of the audio file holding the latest chunk.
        state: Session dict with the keys ``"text"`` (transcript so far),
            ``"temp_text"`` and ``"audio"`` (buffered samples, or ``""`` when
            nothing is buffered). ``None`` starts a fresh session.

    Returns:
        A tuple ``(display_text, state)`` where ``display_text`` is the
        transcript followed by the pending text in parentheses.

    Raises:
        ValueError: Propagated from ``ffmpeg_read`` when the chunk cannot be
            decoded.
    """
    if state is None:
        # Build a fresh dict per call. A dict literal used as the default
        # argument would be one shared object, leaking text and audio between
        # unrelated calls.
        state = {"text": "", "temp_text": "", "audio": ""}

    with open(audio, "rb") as f:
        payload = f.read()

    _sr = 16000
    wav_data = ffmpeg_read(payload, sampling_rate=_sr)

    # "" is the sentinel for "no buffered audio". Check it explicitly:
    # ``is ""`` depends on string interning, and ``==`` against an ndarray
    # would compare element-wise.
    buffered = state["audio"]
    has_buffered_audio = not (isinstance(buffered, str) and buffered == "")

    if is_speech(wav_data, _sr):
        # Speech is ongoing: start or extend the buffer, transcribe later.
        if has_buffered_audio:
            state["audio"] = np.concatenate((buffered, wav_data))
        else:
            state["audio"] = wav_data
    elif has_buffered_audio:
        # First chunk without speech after an utterance: transcribe the
        # buffered audio, append it to the transcript and reset the buffer.
        state["temp_text"] = p(buffered)["text"] + "\n"
        state["text"] += state["temp_text"]
        state["temp_text"] = ""
        state["audio"] = ""

    # NOTE(review): presumably throttles the live-streaming callback — confirm.
    time.sleep(0.5)
    return f'{state["text"]} ( {state["temp_text"]} )', state
# Wire ``transcribe`` into a live Gradio interface: streaming microphone audio
# (delivered as a file path) plus session state in, a textbox plus the updated
# state out. The server binds to 0.0.0.0.
gr.Interface(
    transcribe,
    [gr.Audio(source="microphone", type="filepath", streaming=True), "state"],
    [gr.Textbox(), "state"],
    live=True,
).launch(server_name="0.0.0.0")