push the code
- README.md +1 -1
- app.py +65 -0
- app_single.py +45 -0
- app_upload_model_input.py +48 -0
- engine.py +129 -0
- nemo_asr.py +22 -0
- packages.txt +2 -0
- requirements.txt +14 -0
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Kinyarwanda Asr
-emoji:
+emoji: π
 colorFrom: yellow
 colorTo: indigo
 sdk: gradio
app.py
ADDED
@@ -0,0 +1,65 @@
+import gradio as gr
+import librosa
+import soundfile as sf
+import torch
+import warnings
+import os
+import wave
+from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model
+
+from engine import SpeechToTextEngine
+from nemo_asr import transcribe
+
+warnings.filterwarnings("ignore")
+
+from speechbrain.pretrained import EncoderDecoderASR
+
+# Load the pretrained SpeechBrain wav2vec2 + CTC model for Kinyarwanda
+asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-wav2vec2-commonvoice-rw", savedir="pretrained_models/asr-wav2vec2-commonvoice-rw")
+# asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
+
+# define speech-to-text function
+def asr_transcript(audio):
+    if audio is None:
+        message = "Please provide audio by uploading a file or by recording audio using the microphone by pressing Record (and allow usage of the microphone)"
+        return message, message, message
+    data = {}
+    if audio:
+        text_asr = asr_model.transcribe_file(audio.name)
+        text_nemo_transducer = transcribe(audio.name, "stt_rw_conformer_transducer_large")
+        with open(audio.name, 'rb') as f:
+            audio_proper = f.read()
+        stt_engine = SpeechToTextEngine()
+        all_hot_words = []
+        if data:
+            all_hot_words = stt_engine.add_hot_words(data)
+        if not audio_proper:
+            raise ValueError('Audio not provided')
+        # Running the transcription
+        text_coqui = stt_engine.run(audio_proper)
+        return text_asr.lower(), text_coqui, text_nemo_transducer
+    else:
+        return "File not valid", "File not valid", "File not valid"
+
+gradio_ui = gr.Interface(
+    fn=asr_transcript,
+    title="Kinyarwanda Speech Recognition",
+    description="Record an audio clip from the browser using the microphone, and let AI do the hard work of transcribing.",
+    article="""
+    This demo showcases two pretrained STT models: the SpeechBrain model (wav2vec2 + CTC, ~1.2 GB) is about 30 times larger than the Coqui STT (DeepSpeech) model (~45 MB).
+    """,
+    inputs=[gr.inputs.Audio(source="microphone", type="file", optional=False, label="Record from microphone")],
+    outputs=[gr.outputs.Textbox(label="Recognized speech from SpeechBrain model"),
+             gr.outputs.Textbox(label="Recognized speech from Coqui STT model"),
+             gr.outputs.Textbox(label="Recognized speech from NVIDIA Conformer transducer large model")],
+    # examples = [["sample_1.wav"], ["sample_2.wav"]]
+)
+
+gradio_ui.launch(enable_queue=True)
app_single.py
ADDED
@@ -0,0 +1,45 @@
+import gradio as gr
+import librosa
+import soundfile as sf
+import torch
+import warnings
+import os
+from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model
+
+warnings.filterwarnings("ignore")
+
+from speechbrain.pretrained import EncoderDecoderASR
+
+asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-wav2vec2-commonvoice-rw", savedir="pretrained_models/asr-wav2vec2-commonvoice-rw")
+# asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
+
+
+# define speech-to-text function
+def asr_transcript(audio):
+    if audio is None:
+        return "Please provide audio by uploading a file or by recording audio using the microphone by pressing Record (and allow usage of the microphone)"
+    text = ""
+    if audio:
+        text = asr_model.transcribe_file(audio.name)
+        return text
+    else:
+        return "File not valid"
+
+gradio_ui = gr.Interface(
+    fn=asr_transcript,
+    title="Kinyarwanda Speech Recognition",
+    description="Upload an audio clip or record from the browser using the microphone, and let AI do the hard work of transcribing.",
+    article="""
+    This demo showcases the pretrained DeepSpeech model.
+    """,
+    inputs=[gr.inputs.Audio(source="microphone", type="file", optional=False, label="Record from microphone")],
+    outputs=[gr.outputs.Textbox(label="Recognized speech")],
+    examples=[["sample_1.wav"], ["sample_2.wav"]]
+)
+
+gradio_ui.launch(enable_queue=True)
app_upload_model_input.py
ADDED
@@ -0,0 +1,48 @@
+import gradio as gr
+import librosa
+import soundfile as sf
+import torch
+import warnings
+import os
+from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model
+
+warnings.filterwarnings("ignore")
+
+from speechbrain.pretrained import EncoderDecoderASR
+
+asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-wav2vec2-commonvoice-rw", savedir="pretrained_models/asr-wav2vec2-commonvoice-rw")
+# asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
+
+
+# define speech-to-text function
+def asr_transcript(audio, audio_microphone, model_params):
+    # Prefer the microphone recording when both an upload and a recording are given
+    audio = audio_microphone if audio_microphone else audio
+
+    if audio is None and audio_microphone is None:
+        return "Please provide audio by uploading a file or by recording audio using the microphone by pressing Record (and allow usage of the microphone)"
+    text = ""
+    if audio:
+        text = asr_model.transcribe_file(audio.name)
+        return text
+    else:
+        return "File not valid"
+
+gradio_ui = gr.Interface(
+    fn=asr_transcript,
+    title="Kinyarwanda Speech Recognition",
+    description="Upload an audio clip or record from the browser using the microphone, and let AI do the hard work of transcribing.",
+    article="""
+    This demo showcases the pretrained DeepSpeech model.
+    """,
+    inputs=[gr.inputs.Audio(label="Upload Audio File", type="file", optional=True),
+            gr.inputs.Audio(source="microphone", type="file", optional=True, label="Record from microphone"),
+            gr.inputs.Dropdown(choices=["deepspeech", "coqui (soon)"], type="value", default="deepspeech", label="Select speech recognition model", optional=False)],
+    outputs=[gr.outputs.Textbox(label="Recognized speech")],
+    examples=[["sample_1.wav", "sample_1.wav", "deepspeech"], ["sample_2.wav", "sample_2.wav", "deepspeech"]]
+)
+
+gradio_ui.launch(enable_queue=True)
engine.py
ADDED
@@ -0,0 +1,129 @@
+import queue
+import wave
+from io import BytesIO
+from pathlib import Path
+
+import wget
+import ffmpeg
+import numpy as np
+import webrtcvad
+from stt import Metadata
+from stt import Model, version
+
+
+def normalize_audio_input(audio):
+    """Resample arbitrary input audio to 16 kHz, 16-bit mono WAV using ffmpeg."""
+    output, err = ffmpeg.input('pipe:0').output('pipe:1', f='WAV', acodec='pcm_s16le', ac=1, ar='16k', loglevel='error',
+                                                hide_banner=None).run(input=audio, capture_stdout=True,
+                                                                      capture_stderr=True)
+    if err:
+        raise Exception(err)
+    return output
+
+
+class Frame(object):
+    """Represents a "frame" of audio data."""
+
+    def __init__(self, frame_bytes, timestamp, duration):
+        self.bytes = frame_bytes
+        self.timestamp = timestamp
+        self.duration = duration
+
+
+class SpeechToTextEngine:
+    """ Class to perform speech-to-text transcription and related functionality """
+
+    FORMAT = 8
+    SAMPLE_RATE = 16000
+    CHANNELS = 1
+    BLOCKS_PER_SECOND = 50
+
+    def __init__(self, scorer='kinyarwanda.scorer') -> None:
+        """ Initialize the Coqui STT (DeepSpeech) model """
+        # Download the acoustic model and the external scorer from the Hugging Face Hub
+        wget.download("https://huggingface.co/mbazaNLP/kinyarwanda-coqui-stt-model/resolve/main/kinyarwanda.scorer")
+        wget.download("https://huggingface.co/mbazaNLP/kinyarwanda-coqui-stt-model/resolve/main/kinyarwanda.tflite")
+
+        self.model = Model('kinyarwanda.tflite')
+        self.model.enableExternalScorer(
+            scorer_path=Path(__file__).parents[0].joinpath(scorer).absolute().as_posix())
+        self.vad = webrtcvad.Vad(mode=3)
+        self.sample_rate = self.SAMPLE_RATE
+        self.buffer_queue = queue.Queue()
+
+    def run(self, audio) -> str:
+        """ Receives the audio, normalizes it and sends it to the model to be transcribed. Returns the
+        transcribed audio as a string."""
+        normalized_audio = normalize_audio_input(audio)
+        audio_streams = BytesIO(normalized_audio)
+        with wave.Wave_read(audio_streams) as wav:
+            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
+        results = self.model.stt(audio_buffer=audio_streams)
+        return results
+
+    def run_with_metadata(self, audio) -> Metadata:
+        normalized_audio = normalize_audio_input(audio)
+        audio_streams = BytesIO(normalized_audio)
+        with wave.Wave_read(audio_streams) as wav:
+            audio_streams = np.frombuffer(wav.readframes(wav.getnframes()), np.int16)
+        results = self.model.sttWithMetadata(audio_buffer=audio_streams)
+        return results
+
+    def add_hot_words(self, data) -> list:
+        """ Receives data in the form of hot-words and boosts, adds them to the language model and returns the list
+        of added hot-words """
+        all_hot_words = []
+        try:
+            print('----------------------------------------------------')
+            for hot_word in data:
+                # Change all the characters of the hot-word to lower case
+                word = hot_word.lower()
+
+                # Get numeric value of the boost
+                boost = float(data.get(hot_word))
+
+                # Adding the hot-word and its boost to the language model
+                self.model.addHotWord(word, boost)
+
+                # Printing the activity on the prompt
+                print(f"`{word}` hot-word with boost `{boost}` was added.")
+                all_hot_words.append(word)
+            return all_hot_words
+        except RuntimeError:
+            return []
+
+    def erase_hot_word(self, hot_words) -> None:
+        try:
+            for hot_word in hot_words:
+                self.model.eraseHotWord(hot_word)
+                print(f"`{hot_word}` hot-word is erased.")
+            print('----------------------------------------------------')
+        except RuntimeError:
+            return
+
+    def clear_hot_words(self) -> str:
+        try:
+            self.model.clearHotWords()
+            return "All hot-words were erased."
+        except RuntimeError:
+            return "No more hot-words are left."
+
+    def deep_stream(self):
+        return self.model.createStream()
+
+    def frame_generator(self, audio, sample_rate=16000, frame_duration_ms=30):
+        """
+        Takes the desired frame duration in milliseconds, the PCM data, and
+        the sample rate. Yields Frames of the requested duration.
+        """
+        # audio = np.frombuffer(audio, np.int16)
+        n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
+        offset = 0
+        timestamp = 0.0
+        duration = (float(n) / sample_rate) / 2.0
+        while offset + n < len(audio):
+            yield Frame(audio[offset:offset + n], timestamp, duration)
+            timestamp += duration
+            offset += n
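
For local testing, here is a minimal usage sketch of the engine above (not part of the commit). It assumes the ffmpeg binary from packages.txt is installed, the two wget downloads in __init__ succeed, the script runs from the repository root so the scorer sits next to engine.py, and that a local recording named sample_1.wav exists (the file name is only an illustration):

# Hypothetical local test of SpeechToTextEngine; sample_1.wav is an assumed local recording.
from engine import SpeechToTextEngine

stt_engine = SpeechToTextEngine()      # downloads kinyarwanda.tflite and kinyarwanda.scorer
with open("sample_1.wav", "rb") as f:
    audio_bytes = f.read()             # raw file bytes; ffmpeg normalizes them to 16 kHz mono
print(stt_engine.run(audio_bytes))     # prints the Coqui STT transcript as a string
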
nemo_asr.py
ADDED
@@ -0,0 +1,22 @@
+import nemo
+import nemo.collections.asr as nemo_asr
+from pydub import AudioSegment
+
+
+def transcribe(file, modelName="stt_rw_conformer_transducer_large"):
+    # `file` is expected to expose .filename, .read() and .name (an uploaded-file-style object)
+    with open(file.filename, 'wb') as out_file:
+        content = file.read()
+        out_file.write(content)
+    print(out_file.name)
+    asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
+        model_name=modelName)
+    if not file.name.endswith("wav"):
+        sound = AudioSegment.from_mp3(out_file.name)
+        sound.export(out_file.name, format="wav")
+    files = [out_file.name]
+    # pac.convert_wav_to_16bit_mono(out_file.name, out_file.name)  # `pac` helper is not defined or imported in this commit
+    # print("file loaded is **************", file.file)
+    for fname, transcription in zip(files, asr_model.transcribe(paths2audio_files=files)):
+        print(f"Audio in {fname} was recognized as: {transcription}")
+    print(transcription[0])
+    return {"text": transcription[0], "filename": file.filename}
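
Note that transcribe reads .filename, .read() and .name from its argument, i.e. an uploaded-file-style object rather than a bare path string. A minimal calling sketch (not part of the commit), assuming the NeMo pretrained model can be downloaded and sample_1.wav is a local WAV recording (the wrapper class and file name are illustrative only):

# Hypothetical wrapper mimicking an uploaded file for nemo_asr.transcribe.
from nemo_asr import transcribe

class LocalFile:
    def __init__(self, path):
        with open(path, "rb") as fh:
            self._content = fh.read()  # read the audio up front
        self.filename = path           # transcribe() re-saves the bytes to this same path
        self.name = path

    def read(self):
        return self._content

result = transcribe(LocalFile("sample_1.wav"))
print(result["text"])                  # best transcript from the Conformer transducer model
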
packages.txt
ADDED
@@ -0,0 +1,2 @@
+libsndfile1
+ffmpeg
requirements.txt
ADDED
@@ -0,0 +1,14 @@
+librosa==0.9.1
+soundfile==0.10.3.post1
+torch==1.11.0
+transformers==4.18.0
+speechbrain
+stt
+webrtcvad
+numpy
+ffmpeg-python
+librosa==0.9.1
+soundfile==0.10.3.post1
+wget
+aiofiles
+git+https://github.com/NVIDIA/NeMo.git