rutsam committed on
Commit
3119dd6
·
1 Parent(s): 4f6bed6

push the code

Browse files
Files changed (8) hide show
  1. README.md +1 -1
  2. app.py +65 -0
  3. app_single.py +45 -0
  4. app_upload_model_input.py +48 -0
  5. engine.py +129 -0
  6. nemo_asr.py +22 -0
  7. packages.txt +2 -0
  8. requirements.txt +14 -0
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Kinyarwanda Asr
3
- emoji: 💩
4
  colorFrom: yellow
5
  colorTo: indigo
6
  sdk: gradio
 
1
  ---
2
  title: Kinyarwanda Asr
3
+ emoji: 🚀
4
  colorFrom: yellow
5
  colorTo: indigo
6
  sdk: gradio
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ import soundfile as sf
4
+ import torch
5
+ import warnings
6
+ import os
7
+ from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model
8
+
9
+ from engine import SpeechToTextEngine
10
+ import wave
11
+ import gradio as gr
12
+ import librosa
13
+ import soundfile as sf
14
+ import warnings
15
+
16
+ from nemo_asr import transcribe
17
+
18
+
19
+ warnings.filterwarnings("ignore")
20
+
21
+ from speechbrain.pretrained import EncoderDecoderASR
22
+
23
+ asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-wav2vec2-commonvoice-rw", savedir="pretrained_models/asr-wav2vec2-commonvoice-rw")
24
+ #asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
25
+
26
+ # define speech-to-text function
27
_NO_AUDIO_MESSAGE = "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)"

# Lazily created Coqui engine shared by all requests; constructing it
# downloads and loads model files, so it must not happen per call.
_STT_ENGINE = None


def _get_stt_engine():
    """Return the shared SpeechToTextEngine, creating it on first use."""
    global _STT_ENGINE
    if _STT_ENGINE is None:
        _STT_ENGINE = SpeechToTextEngine()
    return _STT_ENGINE


def asr_transcript(audio):
    """Transcribe one recording with the three models shown in the UI.

    Args:
        audio: File-like object handed over by the Gradio audio input (its
            ``name`` attribute is used as the path of the recording), or
            ``None`` when nothing was recorded.

    Returns:
        A 3-tuple of strings, one per output textbox, in the order
        (speechbrain, coqui STT, NeMo conformer transducer). When no usable
        audio is supplied the same explanatory message is returned for all
        three outputs.

    Raises:
        ValueError: If the recording exists but contains no bytes.
    """
    # The interface declares three outputs, so every branch returns three values.
    if audio is None:
        return _NO_AUDIO_MESSAGE, _NO_AUDIO_MESSAGE, _NO_AUDIO_MESSAGE
    if not audio:
        return "File not valid", "File not valid", "File not valid"

    text_asr = asr_model.transcribe_file(audio.name)

    nemo_result = transcribe(audio.name, "stt_rw_conformer_transducer_large")
    # transcribe() returns a dict with a "text" entry; a textbox needs the string.
    if isinstance(nemo_result, dict):
        text_nemo_transducer = nemo_result["text"]
    else:
        text_nemo_transducer = nemo_result

    with open(audio.name, 'rb') as f:
        audio_proper = f.read()
    if not audio_proper:
        raise ValueError('Audio not provided')

    # Running the transcription
    text_coqui = _get_stt_engine().run(audio_proper)

    return text_asr.lower(), text_coqui, text_nemo_transducer
50
+
51
# Gradio front end: one microphone input, one textbox per speech model.
gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Kinyarwanda Speech Recognition",
    description="Record an audio clip from browser using microphone, and let AI do the hard work of transcribing.",
    article="""
This demo showcases two pretrained STT models the first model from speechbrain(wave2vec+CTC models)(1,2gb) is 30 times larger compared to the coqui STT (deepspeech model)(45mb).
""",
    inputs=[gr.inputs.Audio(source="microphone", type="file", optional=False, label="Record from microphone")],
    # Order must match the tuple returned by asr_transcript.
    outputs=[
        gr.outputs.Textbox(label="Recognized speech from speechbrain model"),
        gr.outputs.Textbox(label="Recognized speech from coqui STT model"),
        gr.outputs.Textbox(label="Recognized speech from NVIDIA Conformer transduver large model"),
    ],
    # examples = [["sample_1.wav"],["sample_2.wav"]]
)

gradio_ui.launch(enable_queue=True)
app_single.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ import soundfile as sf
4
+ import torch
5
+ import warnings
6
+ import os
7
+ from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model
8
+
9
+
10
+ warnings.filterwarnings("ignore")
11
+
12
+ from speechbrain.pretrained import EncoderDecoderASR
13
+
14
+ asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-wav2vec2-commonvoice-rw", savedir="pretrained_models/asr-wav2vec2-commonvoice-rw")
15
+ #asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
16
+
17
+
18
+
19
+ # define speech-to-text function
20
def asr_transcript(audio):
    """Transcribe a recording with the pretrained speechbrain model.

    Args:
        audio: File-like object from the Gradio audio input (its ``name``
            attribute is used as the path of the recording), or ``None`` when
            nothing was recorded.

    Returns:
        A single string for the one output textbox: the transcription, or an
        explanatory message when no usable audio was supplied.
    """
    # The interface has exactly one output, so return one string, not a tuple.
    if audio is None:
        return "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)"
    if not audio:
        return "File not valid"
    return asr_model.transcribe_file(audio.name)
32
+
33
# Components are created up front so the Interface call below stays short.
_microphone_input = gr.inputs.Audio(
    source="microphone",
    type="file",
    optional=False,
    label="Record from microphone",
)
_transcript_output = gr.outputs.Textbox(label="Recognized speech")

# Single-model demo: microphone recording in, one transcription textbox out.
gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Kinyarwanda Speech Recognition",
    description="Upload an audio clip or record from browser using microphone, and let AI do the hard work of transcribing.",
    article="""
This demo showcases the pretrained model from deepspeech.
""",
    inputs=[_microphone_input],
    outputs=[_transcript_output],
    examples=[["sample_1.wav"], ["sample_2.wav"]],
)

gradio_ui.launch(enable_queue=True)
app_upload_model_input.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ import soundfile as sf
4
+ import torch
5
+ import warnings
6
+ import os
7
+ from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer, Wav2Vec2Model
8
+
9
+
10
+ warnings.filterwarnings("ignore")
11
+
12
+ from speechbrain.pretrained import EncoderDecoderASR
13
+
14
+ asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-wav2vec2-commonvoice-rw", savedir="pretrained_models/asr-wav2vec2-commonvoice-rw")
15
+ #asr_model.transcribe_file("speechbrain/asr-wav2vec2-commonvoice-rw/example.mp3")
16
+
17
+
18
+
19
+ # define speech-to-text function
20
def asr_transcript(audio, audio_microphone, model_params):
    """Transcribe an uploaded or recorded clip with the speechbrain model.

    Args:
        audio: Uploaded file object from Gradio, or ``None``.
        audio_microphone: Microphone recording from Gradio, or ``None``. When
            present it takes precedence over ``audio``.
        model_params: Value of the model dropdown. It is accepted because the
            interface passes it, but it is not used to pick a model.

    Returns:
        A single string for the one output textbox: the transcription, or an
        explanatory message when no usable audio was supplied.
    """
    selected = audio_microphone if audio_microphone else audio

    # The interface has exactly one output, so return one string, not a tuple.
    if selected is None and audio_microphone is None:
        return "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)"
    if not selected:
        return "File not valid"
    return asr_model.transcribe_file(selected.name)
35
+
36
# Components are created up front so the Interface call below stays short.
_upload_input = gr.inputs.Audio(label="Upload Audio File", type="file", optional=True)
_microphone_input = gr.inputs.Audio(
    source="microphone",
    type="file",
    optional=True,
    label="Record from microphone",
)
_model_choice = gr.inputs.Dropdown(
    choices=["deepspeech", "coqui (soon)"],
    type="value",
    default="deepspeech",
    label="Select speech recognition model ",
    optional=False,
)
_transcript_output = gr.outputs.Textbox(label="Recognized speech")

# Input order matches the parameters of asr_transcript:
# uploaded file, microphone recording, model selection.
gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Kinyarwanda Speech Recognition",
    description="Upload an audio clip or record from browser using microphone, and let AI do the hard work of transcribing.",
    article="""
This demo showcases the pretrained model from deepspeech.
""",
    inputs=[_upload_input, _microphone_input, _model_choice],
    outputs=[_transcript_output],
    examples=[
        ["sample_1.wav", "sample_1.wav", "deepspeech"],
        ["sample_2.wav", "sample_2.wav", "deepspeech"],
    ],
)

gradio_ui.launch(enable_queue=True)
engine.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import queue
2
+ import wave
3
+ from io import BytesIO
4
+ from pathlib import Path
5
+ import wget
6
+ import ffmpeg
7
+ import numpy as np
8
+ import webrtcvad
9
+ from stt import Metadata
10
+ from stt import Model, version
11
+
12
+
13
+
14
def normalize_audio_input(audio):
    """Re-encode raw audio bytes as mono, 16 kHz, 16-bit PCM WAV via ffmpeg.

    Args:
        audio: Encoded audio bytes, piped to ffmpeg on stdin.

    Returns:
        The WAV bytes ffmpeg wrote to stdout.

    Raises:
        Exception: If ffmpeg wrote anything to stderr; the stderr content is
            the exception argument.
    """
    pipeline = ffmpeg.input('pipe:0').output(
        'pipe:1',
        f='WAV',
        acodec='pcm_s16le',
        ac=1,
        ar='16k',
        loglevel='error',
        hide_banner=None,
    )
    wav_bytes, stderr_bytes = pipeline.run(
        input=audio,
        capture_stdout=True,
        capture_stderr=True,
    )
    # loglevel is 'error', so any stderr output is treated as a failure.
    if stderr_bytes:
        raise Exception(stderr_bytes)
    return wav_bytes
21
+
22
+
23
class Frame:
    """A slice of audio data together with its position and length in time.

    Attributes are stored exactly as passed in:
        bytes: the audio data of the frame.
        timestamp: start of the frame.
        duration: length of the frame.
    """

    def __init__(self, frame_bytes, timestamp, duration):
        self.bytes, self.timestamp, self.duration = frame_bytes, timestamp, duration
30
+
31
+
32
class SpeechToTextEngine:
    """Wrap an ``stt`` ``Model`` for transcription and hot-word management.

    Creating an instance makes sure the Kinyarwanda model and scorer files are
    present next to this module (downloading them only when missing), loads
    the model, and enables the external scorer.
    """

    FORMAT = 8  # NOTE(review): presumably a PyAudio sample-format code; not used in this class - confirm
    SAMPLE_RATE = 16000
    CHANNELS = 1
    BLOCKS_PER_SECOND = 50

    # Location and names of the files fetched on first use.
    MODEL_REPO_URL = "https://huggingface.co/mbazaNLP/kinyarwanda-coqui-stt-model/resolve/main/"
    MODEL_FILE = "kinyarwanda.tflite"
    DEFAULT_SCORER_FILE = "kinyarwanda.scorer"

    def __init__(self, scorer='kinyarwanda.scorer') -> None:
        """Download the model files if needed and load them.

        Args:
            scorer: File name of the external scorer, resolved relative to the
                directory containing this module.
        """
        base_dir = Path(__file__).parents[0].absolute()

        # Download once into the module directory. An unconditional download
        # on every instantiation is slow and leaves numbered duplicate files.
        for file_name in (self.DEFAULT_SCORER_FILE, self.MODEL_FILE):
            target = base_dir.joinpath(file_name)
            if not target.exists():
                wget.download(self.MODEL_REPO_URL + file_name, out=target.as_posix())

        # Model and scorer are both resolved from the same directory so the
        # engine does not depend on the current working directory.
        self.model = Model(base_dir.joinpath(self.MODEL_FILE).as_posix())
        self.model.enableExternalScorer(
            scorer_path=base_dir.joinpath(scorer).as_posix())
        self.vad = webrtcvad.Vad(mode=3)
        self.sample_rate = self.SAMPLE_RATE
        self.buffer_queue = queue.Queue()

    @staticmethod
    def _decode_samples(audio):
        """Normalize encoded audio bytes and return them as an int16 array."""
        normalized_audio = normalize_audio_input(audio)
        with wave.Wave_read(BytesIO(normalized_audio)) as wav:
            return np.frombuffer(wav.readframes(wav.getnframes()), np.int16)

    def run(self, audio) -> str:
        """Transcribe encoded audio bytes.

        The audio is normalized, decoded to int16 samples and passed to the
        model; the model's transcription is returned.
        """
        return self.model.stt(audio_buffer=self._decode_samples(audio))

    def run_with_metadata(self, audio) -> "Metadata":
        """Transcribe encoded audio bytes and return the model's metadata result."""
        return self.model.sttWithMetadata(audio_buffer=self._decode_samples(audio))

    def add_hot_words(self, data) -> list:
        """Register hot-words with their boosts on the model.

        Args:
            data: Mapping of hot-word to boost; each boost is converted with
                ``float``.

        Returns:
            The lower-cased hot-words that were added, or an empty list if the
            model raised ``RuntimeError``.
        """
        all_hot_words = []
        try:
            print('----------------------------------------------------')
            for hot_word, raw_boost in data.items():
                # Change all the characters of the hot-word to lower case
                word = hot_word.lower()

                # Get numeric value of the boost
                boost = float(raw_boost)

                # Register the lower-cased form: it is the form reported to
                # the caller, so later erasing by the returned names matches.
                self.model.addHotWord(word, boost)

                # Printing on the prompt the activity
                print(f"`{word}` hot-word with boost `{boost}` was added.")
                all_hot_words.append(word)
            return all_hot_words
        except RuntimeError:
            return []

    def erase_hot_word(self, hot_words) -> None:
        """Remove each of the given hot-words from the model.

        Stops silently at the first ``RuntimeError`` raised by the model.
        """
        try:
            for hot_word in hot_words:
                self.model.eraseHotWord(hot_word)
                print(f"`{hot_word}` hot-word is erased.")
            print('----------------------------------------------------')
        except RuntimeError:
            return

    def clear_hot_words(self) -> str:
        """Remove all hot-words and return a status message."""
        try:
            self.model.clearHotWords()
            return "All hot-words were erased."
        except RuntimeError:
            return "No more hot-words are left."

    def deep_stream(self):
        """Return a new stream created by the model."""
        return self.model.createStream()

    def frame_generator(self, audio, sample_rate=16000, frame_duration_ms=30):
        """
        Takes the desired frame duration in milliseconds, the PCM data, and
        the sample rate. Yields Frames of the requested duration.

        A trailing piece shorter than one full frame is not yielded.
        """

        # audio = np.frombuffer(audio, np.int16)
        # Frame length in items of `audio`; the factor 2 presumably accounts
        # for two bytes per 16-bit sample - confirm against callers.
        n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
        offset = 0
        timestamp = 0.0
        duration = (float(n) / sample_rate) / 2.0
        # `<=` so that a final frame that fits exactly is not dropped.
        while offset + n <= len(audio):
            yield Frame(audio[offset:offset + n], timestamp, duration)
            timestamp += duration
            offset += n
nemo_asr.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import aiofiles
2
+ import nemo
3
+ import nemo.collections.asr as nemo_asr
4
+
5
+
6
# Loaded models keyed by model name, so repeated calls reuse the same instance.
_ASR_MODEL_CACHE = {}


def _load_asr_model(model_name):
    """Return the pretrained NeMo model for model_name, loading it on first use."""
    if model_name not in _ASR_MODEL_CACHE:
        _ASR_MODEL_CACHE[model_name] = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
            model_name=model_name)
    return _ASR_MODEL_CACHE[model_name]


def _to_wav_16bit_mono(source_path, target_path):
    """Convert any ffmpeg-readable audio file to a 16-bit mono WAV file."""
    import subprocess

    subprocess.run(
        ["ffmpeg", "-y", "-loglevel", "error", "-i", source_path,
         "-ac", "1", "-acodec", "pcm_s16le", target_path],
        check=True,
    )


def transcribe(file, modelName="stt_rw_conformer_transducer_large"):
    """Transcribe an audio file with a pretrained NeMo model.

    Args:
        file: Either a path to an audio file, or a file-like object exposing
            ``read()`` and a ``filename`` (or ``name``) attribute.
        modelName: Name of the pretrained model passed to ``from_pretrained``.

    Returns:
        A dict with the recognized ``"text"`` and the ``"filename"`` of the
        input.

    Raises:
        subprocess.CalledProcessError: If ffmpeg fails to convert the input.
    """
    import os
    import tempfile

    # All intermediate files live in a throwaway directory so the caller's
    # file is never overwritten.
    with tempfile.TemporaryDirectory() as work_dir:
        if isinstance(file, (str, os.PathLike)):
            source_path = os.fspath(file)
            filename = source_path
        else:
            filename = getattr(file, "filename", None) or getattr(file, "name", "audio")
            source_path = os.path.join(work_dir, "upload_" + os.path.basename(str(filename)))
            with open(source_path, "wb") as out_file:
                out_file.write(file.read())

        wav_path = os.path.join(work_dir, "converted_16bit_mono.wav")
        _to_wav_16bit_mono(source_path, wav_path)

        files = [wav_path]
        asr_model = _load_asr_model(modelName)
        transcription = None
        # Same result handling as before: pair the single input with the first
        # item of the model's return value and take its first element.
        # NOTE(review): the exact return shape of transcribe() depends on the
        # NeMo version - confirm.
        for fname, transcription in zip(files, asr_model.transcribe(paths2audio_files=files)):
            print(f"Audio in {fname} was recognized as: {transcription}")

    return {"text": transcription[0], "filename": filename}
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ libsndfile1
2
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ librosa==0.9.1
2
+ soundfile==0.10.3.post1
3
+ torch==1.11.0
4
+ transformers==4.18.0
5
+ speechbrain
6
+ stt
7
+ webrtcvad
8
+ numpy
9
+ ffmpeg-python
10
+ librosa==0.9.1
11
+ soundfile==0.10.3.post1
12
+ wget
13
+ aiofiles
14
+ -e https://github.com/NVIDIA/NeMo.git