BounharAbdelaziz committed on
Commit 9f5e5ef · verified · 1 Parent(s): 29b71c2

v0.1: Nano and Small models

Files changed (3)
  1. app.py +8 -0
  2. requirements.txt +5 -0
  3. utils.py +223 -0
app.py ADDED
@@ -0,0 +1,8 @@
+from utils import create_interface
+
+if __name__ == "__main__":
+
+    # Create the Gradio interface
+    app = create_interface()
+    # Launch the app
+    app.launch()
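With the dependencies pinned below installed, the app can also be run locally with python app.py; Gradio's launch() serves the interface on http://localhost:7860 by default.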
requirements.txt ADDED
@@ -0,0 +1,5 @@
+gradio==5.9.1
+transformers==4.39.2
+numpy==1.26.4
+librosa==0.10.2.post1
+datasets==2.18.0
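All five dependencies are pinned to exact versions, so pip install -r requirements.txt in a fresh environment reproduces this setup before launching the app.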
utils.py ADDED
@@ -0,0 +1,223 @@
+import base64
+import os
+import gradio as gr
+from transformers import pipeline
+import numpy as np
+import librosa
+from datetime import datetime
+from datasets import (
+    load_dataset,
+    concatenate_datasets,
+    Dataset,
+    DatasetDict,
+    Features,
+    Value,
+    Audio,
+)
+
+
+# Hugging Face evaluation dataset
+HF_DATASET_NAME = "atlasia/Moroccan-STT-Eval-Dataset"
+
+# Model paths
+MODEL_PATHS = {
+    "NANO": "BounharAbdelaziz/Morocco-Darija-STT-tiny",
+    "SMALL": "BounharAbdelaziz/Morocco-Darija-STT-small",
+    "LARGE": "BounharAbdelaziz/Morocco-Darija-STT-large-v1.2",
+}
+
+# ---------------------------------------------------------------------------- #
+# ---------------------------------------------------------------------------- #
+
+def encode_image_to_base64(image_path):
+    with open(image_path, "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read()).decode()
+    return encoded_string
+
+# ---------------------------------------------------------------------------- #
+# ---------------------------------------------------------------------------- #
+
+def create_html_image(image_path):
+    img_base64 = encode_image_to_base64(image_path)
+    html_string = f"""
+    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
+        <div style="max-width: 800px; margin: auto;">
+            <img src="data:image/jpeg;base64,{img_base64}"
+                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
+                 alt="Displayed Image">
+        </div>
+    </div>
+    """
+    return html_string
+
+# ---------------------------------------------------------------------------- #
+# ---------------------------------------------------------------------------- #
+
+def load_or_create_dataset():
+    try:
+        dataset = load_dataset(HF_DATASET_NAME)
+        return dataset
+    except Exception as e:
+        print(f"[INFO] Dataset not found or error loading: {e}. Creating a new one.")
+        features = Features({
+            "timestamp": Value("string"),
+            "audio": Audio(sampling_rate=16000),
+            "model_used": Value("string"),
+            "transcription": Value("string")
+        })
+        dataset = Dataset.from_dict({
+            "timestamp": [],
+            "audio": [],
+            "model_used": [],
+            "transcription": []
+        }, features=features)
+        dataset = DatasetDict({
+            "train": dataset,
+        })
+        return dataset
+
+# ---------------------------------------------------------------------------- #
+# ---------------------------------------------------------------------------- #
+
+def save_to_hf_dataset(audio_signal, model_choice, transcription):
+    print("[INFO] Loading dataset...")
+    try:
+        dataset = load_dataset(HF_DATASET_NAME)
+        print("[INFO] Dataset loaded successfully.")
+    except Exception as e:
+        print(f"[INFO] Dataset not found or error loading: {e}. Creating a new one.")
+        dataset = DatasetDict({
+            "train": Dataset.from_dict(
+                {
+                    "audio": [],
+                    "transcription": [],
+                    "model_used": [],
+                    "timestamp": [],
+                },
+                features=Features({
+                    "audio": Audio(sampling_rate=16000),
+                    "transcription": Value("string"),
+                    "model_used": Value("string"),
+                    "timestamp": Value("string"),
+                })
+            )
+        })
+
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    new_entry = {
+        "audio": [{"array": audio_signal, "sampling_rate": 16000}],
+        "transcription": [transcription],
+        "model_used": [model_choice],
+        "timestamp": [timestamp],
+    }
+
+    new_dataset = Dataset.from_dict(
+        new_entry,
+        features=Features({
+            "audio": Audio(sampling_rate=16000),
+            "transcription": Value("string"),
+            "model_used": Value("string"),
+            "timestamp": Value("string"),
+        })
+    )
+
+    print("[INFO] Adding the new entry to the dataset...")
+    train_dataset = dataset["train"]
+    updated_train_dataset = concatenate_datasets([train_dataset, new_dataset])
+    dataset["train"] = updated_train_dataset
+
+    print("[INFO] Pushing the updated dataset...")
+    dataset.push_to_hub(HF_DATASET_NAME)
+
+    print("[INFO] Dataset updated and pushed successfully.")
+
+# ---------------------------------------------------------------------------- #
+# ---------------------------------------------------------------------------- #
+
+def load_model(model_name):
+    # Note: the pipeline is rebuilt on every call, i.e. once per transcription request.
+    model_id = MODEL_PATHS[model_name.upper()]
+    return pipeline("automatic-speech-recognition", model=model_id)
+
+# ---------------------------------------------------------------------------- #
+# ---------------------------------------------------------------------------- #
+
+def process_audio(audio, model_choice, save_data):
+    pipe = load_model(model_choice)
+    # Gradio's type="numpy" audio arrives as a (sample_rate, data) tuple.
+    audio_signal = audio[1]
+    sample_rate = audio[0]
+    audio_signal = audio_signal.astype(np.float32)
+
+    # Normalize 16-bit PCM to [-1.0, 1.0].
+    if np.abs(audio_signal).max() > 1.0:
+        audio_signal = audio_signal / 32768.0
+
+    if sample_rate != 16000:
+        print(f"[INFO] Resampling audio from {sample_rate}Hz to 16000Hz")
+        audio_signal = librosa.resample(
+            y=audio_signal,
+            orig_sr=sample_rate,
+            target_sr=16000
+        )
+
+    result = pipe(audio_signal)
+    transcription = result["text"]
+
+    if save_data:
+        print("[INFO] Saving data to eval dataset...")
+        save_to_hf_dataset(audio_signal, model_choice, transcription)
+
+    return transcription
+
+# ---------------------------------------------------------------------------- #
+# ---------------------------------------------------------------------------- #
+
+def create_interface():
+    with gr.Blocks(css="footer{display:none !important}") as app:
+        base_path = os.path.dirname(__file__)
+        local_image_path = os.path.join(base_path, 'logo_image.png')
+        gr.HTML(create_html_image(local_image_path))
+
+        gr.Markdown("# 🇲🇦 🚀 Moroccan Fast Speech-to-Text Transcription 😍")
+
+        gr.Markdown("⚠️ **Nota bene**: Make sure to click on **Stop** before hitting the **Transcribe** button")
+        gr.Markdown("📌 The **Large** model should be available soon. Stay tuned!")
+
+        with gr.Row():
+            model_choice = gr.Dropdown(
+                choices=["Nano", "Small", "Large"],
+                value="Small",
+                label="Select one of the models"
+            )
+
+        with gr.Row():
+            audio_input = gr.Audio(
+                sources=["microphone"],
+                type="numpy",
+                label="Record Audio",
+            )
+
+        with gr.Row():
+            save_data = gr.Checkbox(
+                label="Contribute to the evaluation benchmark",
+                value=True
+            )
+
+        submit_btn = gr.Button("Transcribe 🔥")
+        output_text = gr.Textbox(label="Transcription")
+
+        gr.Markdown("""
+        ### 📄📌 Notice to our dearest users 🤗
+        - By transcribing your audio, you're actively contributing to the development of a benchmark evaluation dataset for Moroccan speech-to-text models.
+        - Your transcriptions will be logged into a dedicated Hugging Face dataset, playing a crucial role in advancing research and innovation in speech recognition for Moroccan dialects and languages.
+        - Together, we're building tools that better understand and serve the unique linguistic landscape of Morocco.
+        - We count on your **thoughtfulness and responsibility** when using the app. Thank you for your contribution! 🌟
+        """)
+
+        submit_btn.click(
+            fn=process_audio,
+            inputs=[audio_input, model_choice, save_data],
+            outputs=output_text
+        )
+
+        gr.Markdown("<br/>")
+
+    return app
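For reference, here is a minimal smoke test of the processing path (hypothetical usage, not part of this commit): it feeds one second of 48 kHz silence through process_audio with saving disabled, which exercises the float conversion, resampling, and transcription steps, assuming the Small checkpoint can be downloaded from the Hub.

import numpy as np
from utils import process_audio

# Gradio's type="numpy" audio is a (sample_rate, data) tuple.
fake_audio = (48000, np.zeros(48000, dtype=np.int16))

# "Small" resolves to MODEL_PATHS["SMALL"]; save_data=False leaves the
# evaluation dataset untouched.
text = process_audio(fake_audio, "Small", save_data=False)
print(text)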