flozi00 commited on
Commit
412c852
1 Parent(s): c851cbc

Create app.py

Files changed (1)
app.py +53 -0
app.py ADDED
@@ -0,0 +1,53 @@
+ from transformers import pipeline
+ import torch
+ import gradio as gr
+ import librosa
+ import numpy as np
+ import os
+
+ # ASR pipeline; the model repo is gated, so the token is read from the "auth" secret.
+ p = pipeline("automatic-speech-recognition",
+              model="flozi00/wav2vec2-xls-r-1b-5gram-german",
+              use_auth_token=os.environ["auth"])
+
+ # Silero VAD model plus its helper utilities, loaded via torch.hub.
+ model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad",
+                               model="silero_vad")
+ (get_speech_timestamps, _, read_audio, *_) = utils
+
+
+ def is_speech(wav, sr):
+     # True if the VAD detects at least one speech segment in the chunk.
+     speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sr)
+     return len(speech_timestamps) > 0
+
+
+ def transcribe(audio, state=None):
+     # Default to None (not a mutable dict) so each session starts with fresh state.
+     if state is None:
+         state = {"text": "", "temp_text": "", "audio": None}
+     wav_data, _sr = librosa.load(audio, sr=16000)
+     if is_speech(wav_data, _sr):
+         # Still speaking: grow the audio buffer and re-transcribe all of it.
+         if state["audio"] is None:
+             state["audio"] = wav_data
+         else:
+             state["audio"] = np.concatenate((state["audio"], wav_data))
+         state["temp_text"] = p(state["audio"])["text"] + "\n"
+     else:
+         # Silence: commit the tentative transcript and reset the buffer.
+         state["text"] += state["temp_text"]
+         state["temp_text"] = ""
+         state["audio"] = None
+     return f'{state["text"]} ( {state["temp_text"]} )', state
+
+
+ gr.Interface(
+     fn=transcribe,
+     inputs=[
+         gr.inputs.Audio(source="microphone", type="filepath"),
+         "state",
+     ],
+     outputs=[
+         "textbox",
+         "state",
+     ],
+     live=True).launch()
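
A minimal sketch of the VAD gate in isolation, reusing the `model` and the `read_audio` / `get_speech_timestamps` helpers loaded above; the path "sample.wav" is a hypothetical local 16 kHz recording, used here only for illustration:

    # Hypothetical input file, for illustration only.
    wav = read_audio("sample.wav", sampling_rate=16000)
    # Each entry marks one detected speech segment, with start/end in samples.
    print(get_speech_timestamps(wav, model, sampling_rate=16000))
    # e.g. [{'start': 1568, 'end': 31712}] -> is_speech() would return True

A non-empty list means the chunk contains speech, so transcribe() keeps buffering; an empty list is treated as the end of the utterance.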