Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import pipeline
|
2 |
+
import torch
|
3 |
+
import gradio as gr
|
4 |
+
import librosa
|
5 |
+
import numpy as np
|
6 |
+
import os
|
7 |
+
|
8 |
+
|
9 |
+
# Load the German wav2vec2 ASR pipeline from the Hub; the model repo is gated,
# so the Space secret `auth` is forwarded as the access token.
# NOTE(review): os.environ['auth'] raises KeyError when the secret is unset —
# confirm the Space has it configured.
p = pipeline("automatic-speech-recognition", model="flozi00/wav2vec2-xls-r-1b-5gram-german", use_auth_token = os.environ['auth'])
# Silero voice-activity-detection model, fetched via torch.hub at import time
# (requires network access on startup).
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad')

# torch.hub returns the VAD helper functions as a tuple; keep only
# get_speech_timestamps and read_audio, discard the rest.
# NOTE(review): read_audio is bound here but never used in this file.
(get_speech_timestamps,
 _, read_audio,
 *_) = utils
|
16 |
+
|
17 |
+
def is_speech(wav, sr):
    """Return True when the Silero VAD finds at least one speech segment in *wav*.

    *wav* is the audio signal and *sr* its sampling rate; detection is
    delegated to the module-level Silero ``model``.
    """
    segments = get_speech_timestamps(wav, model, sampling_rate=sr)
    return bool(segments)
|
22 |
+
|
23 |
+
def transcribe(audio, state=None):
    """Incrementally transcribe streamed microphone audio.

    Parameters
    ----------
    audio : str
        Filepath of the latest recorded chunk (gradio streams chunks in
        ``live`` mode).
    state : dict or None
        Session state with keys ``text`` (committed transcript),
        ``temp_text`` (provisional transcript of the current utterance) and
        ``audio`` (accumulated samples of the current utterance).
        ``None`` — the default — starts a fresh session.

    Returns
    -------
    tuple[str, dict]
        Display string ``"<committed> ( <provisional> )"`` and the updated
        state.
    """
    # Fix: the original used a mutable default dict, which is created once and
    # shared across every call that omits `state` — replaced with the None
    # sentinel; the guard below makes the observable behavior identical.
    if state is None:
        state = {"text": "", "temp_text": "", "audio": None}
    # Resample the chunk to the 16 kHz rate both the VAD and the ASR expect.
    wav_data, _sr = librosa.load(audio, sr=16000)
    speech = is_speech(wav_data, _sr)
    if speech:
        # Still talking: grow the utterance buffer and re-transcribe it whole,
        # so the provisional text always covers the full utterance so far.
        if state["audio"] is None:
            state["audio"] = wav_data
        else:
            state["audio"] = np.concatenate((state["audio"], wav_data))

        text = p(state["audio"])["text"] + "\n"
        state["temp_text"] = text
    else:
        # Silence: commit the provisional text and reset the utterance buffer.
        state["text"] += state["temp_text"]
        state["temp_text"] = ""
        state["audio"] = None

    return f'{state["text"]} ( {state["temp_text"]} )', state
|
42 |
+
|
43 |
+
# Wire the streaming demo: microphone chunks plus session state in,
# transcript text plus updated state out; live=True re-runs on every chunk.
# NOTE(review): gr.inputs.Audio is the legacy (pre-3.x) gradio namespace —
# confirm the pinned gradio version still provides it.
demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath"),
        "state",
    ],
    outputs=["textbox", "state"],
    live=True,
)
demo.launch()
|