sudoping01 committed on
Commit
6f63a5e
·
1 Parent(s): d63d0eb

space init commit

app.py ADDED
@@ -0,0 +1,246 @@
+ import os
+ import spaces
+ import torch
+ import torchaudio
+ import gradio as gr
+ import logging
+ from whosper import WhosperTranscriber
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
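+ # Select the best available device (CUDA > MPS > CPU). This is logged for
+ # information only; the transcriber below is not passed the value explicitly.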
+ if torch.cuda.is_available():
+     device = "cuda"
+     logger.info("Using CUDA for inference.")
+ elif torch.backends.mps.is_available():
+     device = "mps"
+     logger.info("Using MPS for inference.")
+ else:
+     device = "cpu"
+     logger.info("Using CPU for inference.")
+
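+ # Initialize the transcriber once at startup so every request reuses the
+ # loaded model.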
+ model_id = "sudoping01/maliba-asr-v1"
+ transcriber = WhosperTranscriber(model_id=model_id)
+ logger.info(f"Transcriber initialized with model: {model_id}")
+
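+ # Note: the app never calls this helper; transcribe_audio() passes the file
+ # path straight to Whosper. It is kept as a standalone preprocessing utility.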
+ def resample_audio(audio_path, target_sample_rate=16000):
+     """
+     Converts an audio file to the target sample rate (16000 Hz by default).
+
+     Args:
+         audio_path (str): Path to the audio file.
+         target_sample_rate (int): The desired sample rate.
+     Returns:
+         A tuple of (resampled waveform tensor, target sample rate).
+     """
+     try:
+         waveform, original_sample_rate = torchaudio.load(audio_path)
+
+         if original_sample_rate != target_sample_rate:
+             resampler = torchaudio.transforms.Resample(
+                 orig_freq=original_sample_rate,
+                 new_freq=target_sample_rate
+             )
+             waveform = resampler(waveform)
+
+         return waveform, target_sample_rate
+     except Exception as e:
+         logger.error(f"Error resampling audio: {e}")
+         raise
+
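+ # On Hugging Face Spaces, @spaces.GPU() requests a ZeroGPU worker for the
+ # duration of each call; outside Spaces the decorator has no effect.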
+ @spaces.GPU()
+ def transcribe_audio(audio_file):
+     """
+     Transcribes the provided audio file into Bambara text using Whosper.
+
+     Args:
+         audio_file: Path to the audio file to transcribe.
+     Returns:
+         The transcribed Bambara text as a string.
+     """
+     if audio_file is None:
+         return "Please provide an audio file for transcription."
+
+     try:
+         logger.info(f"Transcribing audio file: {audio_file}")
+         result = transcriber.transcribe_audio(audio_file)
+         logger.info("Transcription successful.")
+         return result
+     except Exception as e:
+         logger.error(f"Transcription failed: {e}")
+         return f"Error during transcription: {e}"
+
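+ # Collect up to five demo clips from ./examples to surface in the UI.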
+ def get_example_files(directory="./examples"):
+     """
+     Returns a list of audio files from the examples directory.
+
+     Args:
+         directory (str): The directory to search for audio files.
+     Returns:
+         list: Absolute paths to at most five audio files.
+     """
+     if not os.path.exists(directory):
+         logger.warning(f"Examples directory {directory} not found.")
+         return []
+
+     audio_extensions = ['.wav', '.mp3', '.m4a', '.flac', '.ogg']
+     audio_files = []
+
+     try:
+         for file in os.listdir(directory):
+             if any(file.lower().endswith(ext) for ext in audio_extensions):
+                 audio_files.append(os.path.abspath(os.path.join(directory, file)))
+
+         logger.info(f"Found {len(audio_files)} example audio files.")
+         return audio_files[:5]
+     except Exception as e:
+         logger.error(f"Error reading examples directory: {e}")
+         return []
+
+ def build_interface():
+     """
+     Builds the Gradio interface for Bambara speech recognition.
+     """
+     example_files = get_example_files()
+
+     with gr.Blocks(title="Bambara Speech Recognition") as demo:
+         gr.Markdown(
+             """
+             # 🎤 Bambara Automatic Speech Recognition
+
+             **Powered by MALIBA-AI**
+
+             Convert Bambara speech to text using our state-of-the-art ASR model. You can either:
+             - 🎙️ **Record** your voice directly
+             - 📁 **Upload** an audio file
+             - 🎵 **Try** our example audio files
+
+             ## Supported Audio Formats
+             WAV, MP3, M4A, FLAC, OGG
+             """
+         )
+
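+         # Two-column layout: input controls on the left, transcript on the right.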
+         with gr.Row():
+             with gr.Column():
+                 audio_input = gr.Audio(
+                     label="🎤 Record or Upload Audio",
+                     type="filepath",
+                     sources=["microphone", "upload"]
+                 )
+
+                 transcribe_btn = gr.Button(
+                     "🔄 Transcribe Audio",
+                     variant="primary",
+                     size="lg"
+                 )
+
+                 clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
+             with gr.Column():
+                 output_text = gr.Textbox(
+                     label="📝 Transcribed Text (Bambara)",
+                     lines=8,
+                     placeholder="Your transcribed Bambara text will appear here...",
+                     interactive=False
+                 )
+
+         # Examples section
+         if example_files:
+             gr.Markdown("## 🎵 Try These Examples")
+             gr.Examples(
+                 examples=[[f] for f in example_files],
+                 inputs=[audio_input],
+                 outputs=output_text,
+                 fn=transcribe_audio,
+                 cache_examples=False,
+                 label="Example Audio Files"
+             )
+
+         # Information section
+         gr.Markdown(
+             """
+             ---
+
+             ## ℹ️ About This Model
+
+             - **Model:** [sudoping01/maliba-asr-v1](https://huggingface.co/sudoping01/maliba-asr-v1)
+             - **Developer:** MALIBA-AI
+             - **Language:** Bambara (bm)
+             - **Task:** Automatic Speech Recognition (ASR)
+             - **Sample Rate:** 16 kHz (automatically resampled)
+
+             ## 🚀 How to Use
+
+             1. **Record Audio:** Click the microphone button and speak in Bambara
+             2. **Upload File:** Click the upload button to select an audio file
+             3. **Transcribe:** Click the "Transcribe Audio" button
+             4. **View Results:** See your transcribed text in Bambara
+
+             ## 📊 Performance Notes
+
+             - Best results with clear speech and minimal background noise
+             - Supports various audio formats and durations
+             - Optimized for Bambara language patterns and phonetics
+             """
+         )
+
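+         # Event wiring. audio_input.change auto-transcribes as soon as a clip
+         # is recorded or uploaded, so the Transcribe button acts as a manual
+         # re-run.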
+         transcribe_btn.click(
+             fn=transcribe_audio,
+             inputs=[audio_input],
+             outputs=output_text,
+             show_progress="full"
+         )
+
+         clear_btn.click(
+             fn=lambda: (None, ""),
+             outputs=[audio_input, output_text]
+         )
+
+         audio_input.change(
+             fn=transcribe_audio,
+             inputs=[audio_input],
+             outputs=output_text,
+             show_progress="full"
+         )
+
+     return demo
+
+ def main():
+     """
+     Builds and launches the Gradio interface.
+     """
+     logger.info("Starting Bambara ASR Gradio interface.")
+
+     interface = build_interface()
+     # launch() blocks until the server is shut down.
+     interface.launch(
+         share=False,
+         server_name="0.0.0.0",
+         server_port=7860
+     )
+
+     logger.info("Gradio interface stopped.")
+
+ if __name__ == "__main__":
+     main()
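+
+ # Minimal usage sketch (comments only, not executed): the same Whosper API
+ # used above can be driven from a plain script.
+ #
+ #   from whosper import WhosperTranscriber
+ #   transcriber = WhosperTranscriber(model_id="sudoping01/maliba-asr-v1")
+ #   print(transcriber.transcribe_audio("examples/test1.wav"))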
examples/test1.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ce23883eac43409832416424b84458eadc02a1efb964b36db35e3c54f3cdccee
+ size 1295454
examples/test_00.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd7e68bddcc83a5a315fa712c58f6dc07095142ba926d33a04f1bb13f4552d60
+ size 154737
examples/test_01.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db1fe475d58ed4c1738df485310e04a12d4176a7ef65224d1ffeaa93e70c7958
+ size 1289004
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio>=4.0.0
+ torch>=2.0.0
+ torchaudio>=2.0.0
+ transformers>=4.30.0
+ spaces>=0.10.0
+ git+https://github.com/sudoping01/whosper.git