BounharAbdelaziz committed
Commit b319757 · verified · 1 Parent(s): 486b5f4

v0.1: Added access token

Files changed (1):
  1. utils.py +225 -222
utils.py CHANGED
@@ -1,223 +1,226 @@
 import base64
 import os
 import gradio as gr
 from transformers import pipeline
 import numpy as np
 import librosa
 from datetime import datetime
 from datasets import (
     load_dataset,
     concatenate_datasets,
     Dataset,
     DatasetDict,
     Features,
     Value,
     Audio,
 )


 # Hugging Face evaluation dataset
 HF_DATASET_NAME = "atlasia/Moroccan-STT-Eval-Dataset"

 # Models paths
 MODEL_PATHS = {
     "NANO": "BounharAbdelaziz/Morocco-Darija-STT-tiny",
     "SMALL": "BounharAbdelaziz/Morocco-Darija-STT-small",
     "LARGE": "BounharAbdelaziz/Morocco-Darija-STT-large-v1.2",
 }

+# Access token to models
+STT_MODEL_TOKEN = os.environ.get("STT_MODEL_TOKEN")
+
 # ---------------------------------------------------------------------------- #
 # ---------------------------------------------------------------------------- #

 def encode_image_to_base64(image_path):
     with open(image_path, "rb") as image_file:
         encoded_string = base64.b64encode(image_file.read()).decode()
     return encoded_string

 # ---------------------------------------------------------------------------- #
 # ---------------------------------------------------------------------------- #

 def create_html_image(image_path):
     img_base64 = encode_image_to_base64(image_path)
     html_string = f"""
     <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
         <div style="max-width: 800px; margin: auto;">
             <img src="data:image/jpeg;base64,{img_base64}"
                  style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                  alt="Displayed Image">
         </div>
     </div>
     """
     return html_string

 # ---------------------------------------------------------------------------- #
 # ---------------------------------------------------------------------------- #

 def load_or_create_dataset():
     try:
         dataset = load_dataset(HF_DATASET_NAME)
         return dataset
     except Exception as e:
         print(f"[INFO] Dataset not found or error loading: {e}. Creating a new one.")
         features = Features({
             "timestamp": Value("string"),
             "audio": Audio(sampling_rate=16000),
             "model_used": Value("string"),
             "transcription": Value("string")
         })
         dataset = Dataset.from_dict({
             "timestamp": [],
             "audio": [],
             "model_used": [],
             "transcription": []
         }, features=features)
         dataset = DatasetDict({
             "train": dataset,
         })
         return dataset

 # ---------------------------------------------------------------------------- #
 # ---------------------------------------------------------------------------- #

 def save_to_hf_dataset(audio_signal, model_choice, transcription):
     print("[INFO] Loading dataset...")
     try:
         dataset = load_dataset(HF_DATASET_NAME)
         print("[INFO] Dataset loaded successfully.")
     except Exception as e:
         print(f"[INFO] Dataset not found or error loading. Creating a new one.")
         dataset = DatasetDict({
             "train": Dataset.from_dict(
                 {
                     "audio": [],
                     "transcription": [],
                     "model_used": [],
                     "timestamp": [],
                 },
                 features=Features({
                     "audio": Audio(sampling_rate=16000),
                     "transcription": Value("string"),
                     "model_used": Value("string"),
                     "timestamp": Value("string"),
                 })
             )
         })

     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     new_entry = {
         "audio": [{"array": audio_signal, "sampling_rate": 16000}],
         "transcription": [transcription],
         "model_used": [model_choice],
         "timestamp": [timestamp],
     }

     new_dataset = Dataset.from_dict(
         new_entry,
         features=Features({
             "audio": Audio(sampling_rate=16000),
             "transcription": Value("string"),
             "model_used": Value("string"),
             "timestamp": Value("string"),
         })
     )

     print("[INFO] Adding the new entry to the dataset...")
     train_dataset = dataset["train"]
     updated_train_dataset = concatenate_datasets([train_dataset, new_dataset])
     dataset["train"] = updated_train_dataset

     print("[INFO] Pushing the updated dataset...")
     dataset.push_to_hub(HF_DATASET_NAME)

     print("[INFO] Dataset updated and pushed successfully.")

 # ---------------------------------------------------------------------------- #
 # ---------------------------------------------------------------------------- #

 def load_model(model_name):
     model_id = MODEL_PATHS[model_name.upper()]
-    return pipeline("automatic-speech-recognition", model=model_id)
+    return pipeline("automatic-speech-recognition", model=model_id, token=STT_MODEL_TOKEN)

 # ---------------------------------------------------------------------------- #
 # ---------------------------------------------------------------------------- #

 def process_audio(audio, model_choice, save_data):
     pipe = load_model(model_choice)
     audio_signal = audio[1]
     sample_rate = audio[0]
     audio_signal = audio_signal.astype(np.float32)

     if np.abs(audio_signal).max() > 1.0:
         audio_signal = audio_signal / 32768.0

     if sample_rate != 16000:
         print(f"[INFO] Resampling audio from {sample_rate}Hz to 16000Hz")
         audio_signal = librosa.resample(
             y=audio_signal,
             orig_sr=sample_rate,
             target_sr=16000
         )

     result = pipe(audio_signal)
     transcription = result["text"]

     if save_data:
         print(f"[INFO] Saving data to eval dataset...")
         save_to_hf_dataset(audio_signal, model_choice, transcription)

     return transcription

 # ---------------------------------------------------------------------------- #
 # ---------------------------------------------------------------------------- #

 def create_interface():
     with gr.Blocks(css="footer{display:none !important}") as app:
         base_path = os.path.dirname(__file__)
         local_image_path = os.path.join(base_path, 'logo_image.png')
         gr.HTML(create_html_image(local_image_path))

         gr.Markdown("# 🇲🇦 🚀 Moroccan Fast Speech-to-Text Transcription 😍")

         gr.Markdown("⚠️ **Nota bene**: Make sure to click on **Stop** before hitting the **Transcribe** button")
         gr.Markdown("📌 The **Large** model should be available soon. Stay tuned!")

         with gr.Row():
             model_choice = gr.Dropdown(
                 choices=["Nano", "Small", "Large"],
                 value="Small",
                 label="Select one of the models"
             )

         with gr.Row():
             audio_input = gr.Audio(
                 sources=["microphone"],
                 type="numpy",
                 label="Record Audio",
             )

         with gr.Row():
             save_data = gr.Checkbox(
                 label="Contribute to the evaluation benchmark",
                 value=True
             )

         submit_btn = gr.Button("Transcribe 🔥")
         output_text = gr.Textbox(label="Transcription")

         gr.Markdown("""
         ### 📄📌 Notice to our dearest users 🤗
         - By transcribing your audio, you're actively contributing to the development of a benchmark evaluation dataset for Moroccan speech-to-text models.
         - Your transcriptions will be logged into a dedicated Hugging Face dataset, playing a crucial role in advancing research and innovation in speech recognition for Moroccan dialects and languages.
         - Together, we're building tools that better understand and serve the unique linguistic landscape of Morocco.
         - We count on your **thoughtfulness and responsibility** when using the app. Thank you for your contribution! 🌟
         """)

-
         submit_btn.click(
             fn=process_audio,
             inputs=[audio_input, model_choice, save_data],
             outputs=output_text
         )

         gr.Markdown("<br/>")
+
     return app
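
For context, the change reads a new `STT_MODEL_TOKEN` environment variable at import time and forwards it to `pipeline(...)`, so the Space can download the checkpoints even when they are gated or private, without hard-coding a token in the repository. Below is a minimal sketch (not part of the commit) of how this could be exercised outside the app, assuming the variable is exported locally or set as a Space secret; the environment variable name and model IDs come from the diff, while the silent test clip and the standalone run are illustrative only.

```python
import os

import numpy as np
from transformers import pipeline

# Assumed to hold a Hugging Face access token with read scope (e.g. a Space secret).
token = os.environ.get("STT_MODEL_TOKEN")

asr = pipeline(
    "automatic-speech-recognition",
    model="BounharAbdelaziz/Morocco-Darija-STT-small",  # one of the MODEL_PATHS entries
    token=token,  # older transformers releases used `use_auth_token` for this kwarg
)

# One second of silence at 16 kHz, just to confirm the pipeline loads and runs end to end.
print(asr(np.zeros(16000, dtype=np.float32))["text"])
```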