Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -8,9 +8,9 @@ import librosa
|
|
8 |
import time
|
9 |
|
10 |
|
11 |
-
lmID = "aware-ai/german-lowercase-
|
12 |
decoder = BeamSearchDecoderCTC.load_from_hf_hub(lmID)
|
13 |
-
p = pipeline("automatic-speech-recognition", model="aware-ai/robust-wav2vec2-
|
14 |
ttp = pipeline("text2text-generation", model="aware-ai/marian-german-grammar")
|
15 |
|
16 |
vadmodel, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
|
@@ -36,24 +36,28 @@ def translate(src, tgt, text):
|
|
36 |
return result
|
37 |
|
38 |
def transcribe(audio):
|
|
|
39 |
sampling_rate = 16000
|
40 |
start_time = time.time()
|
41 |
audio, sr = librosa.load(audio, sr=sampling_rate)
|
42 |
-
|
43 |
start_time = time.time()
|
44 |
speech_timestamps = get_speech_timestamps(audio, vadmodel, sampling_rate=sampling_rate)
|
45 |
-
|
46 |
start_time = time.time()
|
47 |
chunks = [audio[i["start"]:i["end"]] for i in speech_timestamps]
|
48 |
-
|
49 |
start_time = time.time()
|
50 |
transcribed = " ".join([text["text"] for text in p(chunks, chunk_length_s=20, stride_length_s=(0, 0))])
|
51 |
-
|
52 |
start_time = time.time()
|
53 |
punctuated = ttp(transcribed, max_length = 512)[0]["generated_text"]
|
54 |
-
|
|
|
|
|
|
|
55 |
|
56 |
-
return transcribed, punctuated
|
57 |
|
58 |
def get_asr_interface():
|
59 |
return gr.Interface(
|
@@ -62,6 +66,7 @@ def get_asr_interface():
|
|
62 |
gr.inputs.Audio(source="microphone", type="filepath")
|
63 |
],
|
64 |
outputs=[
|
|
|
65 |
"textbox",
|
66 |
"textbox"
|
67 |
])
|
|
|
import time

# KenLM 4-gram language model used by the CTC beam-search decoder
# (lowercase German, matching the acoustic model's lowercase vocabulary).
lmID = "aware-ai/german-lowercase-4gram-kenlm"
decoder = BeamSearchDecoderCTC.load_from_hf_hub(lmID)
# ASR pipeline: wav2vec2 acoustic model combined with the LM decoder above.
p = pipeline("automatic-speech-recognition", model="aware-ai/robust-wav2vec2-base-german-lowercase", decoder=decoder)
# text2text model that restores punctuation/casing/grammar on the raw transcript.
ttp = pipeline("text2text-generation", model="aware-ai/marian-german-grammar")
15 |
|
16 |
vadmodel, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
|
|
|
36 |
return result
|
37 |
|
38 |
def transcribe(audio):
    """Transcribe a German recording and restore punctuation.

    Parameters
    ----------
    audio : str
        Path to the audio file (gradio's microphone input passes a filepath).

    Returns
    -------
    tuple[str, str, str]
        ``(raw_transcript, punctuated_transcript, timing_log)``.
    """
    log = ""
    sampling_rate = 16000

    start_time = time.time()
    # Resample to the 16 kHz rate the wav2vec2 model expects; the sample
    # rate librosa returns is not needed afterwards.
    audio, _ = librosa.load(audio, sr=sampling_rate)
    # BUG FIX: the original built the log as "--- %s seconds ... ---" + str(dt),
    # leaving the literal "%s" in the output; interpolate the value instead.
    log += f"--- {time.time() - start_time} seconds audio loading ---"

    start_time = time.time()
    # silero VAD: locate speech regions so silence is not fed to the ASR model.
    speech_timestamps = get_speech_timestamps(audio, vadmodel, sampling_rate=sampling_rate)
    log += f"\n--- {time.time() - start_time} seconds audio timestamps---"

    start_time = time.time()
    chunks = [audio[ts["start"]:ts["end"]] for ts in speech_timestamps]
    log += f"\n--- {time.time() - start_time} seconds audio chunking---"

    start_time = time.time()
    # Guard the empty case (no speech detected) instead of handing the
    # pipeline an empty batch.
    if chunks:
        transcribed = " ".join(
            result["text"] for result in p(chunks, chunk_length_s=20, stride_length_s=(0, 0))
        )
    else:
        transcribed = ""
    log += f"\n--- {time.time() - start_time} seconds audio transcription ---"

    start_time = time.time()
    punctuated = ttp(transcribed, max_length=512)[0]["generated_text"]
    log += f"\n--- {time.time() - start_time} seconds audio formatting ---"

    start_time = time.time()
    # Benchmark only: run ASR over the full, un-chunked audio to compare
    # against the VAD-chunked path; the result is intentionally discarded.
    p(audio, chunk_length_s=20, stride_length_s=(0, 0))
    log += f"\n--- {time.time() - start_time} seconds full asr ---"

    return transcribed, punctuated, log
|
61 |
|
62 |
def get_asr_interface():
|
63 |
return gr.Interface(
|
|
|
66 |
gr.inputs.Audio(source="microphone", type="filepath")
|
67 |
],
|
68 |
outputs=[
|
69 |
+
"textbox",
|
70 |
"textbox",
|
71 |
"textbox"
|
72 |
])
|