huckiyang commited on
Commit
8a490ef
·
1 Parent(s): 63bf391

[demo] 4B fixing

Browse files
README.md CHANGED
@@ -1,14 +1,14 @@
1
  ---
2
- title: Multilingual Voice 4B Demo
3
- emoji: 🌍
4
- colorFrom: purple
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.20.1
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
- short_description: voice understanding demo
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Multilingual Scaling
3
+ emoji: 🌐
4
+ colorFrom: indigo
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.20.1
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ short_description: Voice Understanding Demo
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import sys
4
+ import subprocess
5
+ import gradio as gr
6
+ import numpy as np
7
+ import soundfile as sf
8
+ import librosa
9
+ import torch
10
+ import torch.cuda
11
+ import gc
12
+
13
# Check that required packages are installed; install any that are missing.
# NOTE(review): installing at import time is a Hugging Face Spaces workaround;
# prefer pinning these in requirements.txt so builds are reproducible.
def _pip_install(package):
    """Install *package* with pip inside the current interpreter."""
    print(f"Installing {package}. This may take a few minutes...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", package])


try:
    from espnet2.bin.s2t_inference import Speech2Text
    import torchaudio
    # espnet_model_zoo is needed by Speech2Text.from_pretrained; check it
    # separately because the two imports above can succeed without it.
    try:
        import espnet_model_zoo
        print("All packages already installed.")
    except ModuleNotFoundError:
        _pip_install("espnet_model_zoo")
        import espnet_model_zoo
        print("espnet_model_zoo installed successfully.")
except ModuleNotFoundError as e:
    # The module name is quoted in the exception message, e.g.
    # "No module named 'espnet2'".
    missing_module = str(e).split("'")[1]
    print(f"Installing missing module: {missing_module}")

    if missing_module == "espnet2":
        _pip_install("espnet")
    elif missing_module == "torchaudio":
        _pip_install("torchaudio")

    # Retry the imports now that the missing package should be present.
    try:
        from espnet2.bin.s2t_inference import Speech2Text
        import torchaudio
        try:
            import espnet_model_zoo
        except ModuleNotFoundError:
            _pip_install("espnet_model_zoo")
            import espnet_model_zoo
        print("All required packages installed successfully.")
    except ModuleNotFoundError as e:
        print(f"Failed to install {str(e).split('No module named ')[1]}. Please install manually.")
        raise
52
+
53
# Initialize the model with language option
def load_model():
    """Load the OWLS 4B multilingual ASR model from Hugging Face.

    Returns:
        Speech2Text: an ESPnet inference object configured for ASR
        (`task_sym="<asr>"`) with greedy decoding (beam_size=1), placed on
        GPU when available, otherwise CPU.
    """
    # Free any leftover Python objects before the large model allocation.
    gc.collect()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # BUG FIX: only touch CUDA state when a GPU is actually present —
    # calling these on a CPU-only host raises a RuntimeError.
    if device == "cuda":
        torch.cuda.empty_cache()
        # Cap this process at 95% of GPU memory to leave a little headroom.
        torch.cuda.set_per_process_memory_fraction(0.95)

    model = Speech2Text.from_pretrained(
        "espnet/owls_4B_180K",
        task_sym="<asr>",
        beam_size=1,
        device=device,
    )
    return model


# Load the model once at startup; all requests share this instance.
print("Loading multilingual model...")
model = load_model()
print("Model loaded successfully!")
82
+
83
def transcribe_audio(audio_file, language):
    """Process the audio file and return the transcription.

    Args:
        audio_file: path to an audio file, or a ``(sample_rate, ndarray)``
            tuple as produced by a Gradio microphone component, or None.
        language: ESPnet language symbol such as ``"<eng>"``, or None to
            keep the model's current language setting.

    Returns:
        str: the transcribed text, or a prompt message when no audio was
        provided.
    """
    if audio_file is None:
        return "Please upload an audio file or record audio."

    # Microphone input arrives as (sample_rate, samples); persist it to a
    # temporary wav file so the same loading path below handles both cases.
    if isinstance(audio_file, tuple):
        sr, audio_data = audio_file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_path = temp_audio.name
            sf.write(temp_path, audio_data, sr)
        audio_file = temp_path

    # The model expects 16 kHz input; librosa resamples while loading.
    speech, _ = librosa.load(audio_file, sr=16000)

    # Reset decoder search state between requests.
    # NOTE(review): mutating the shared global model is not thread-safe if
    # Gradio serves concurrent requests — confirm request queueing is on.
    model.beam_search.hyps = None
    model.beam_search.pre_beam_score_key = None

    # FIX: identity comparison for None per PEP 8 (was `language != None`).
    if language is not None:
        model.lang_sym = language

    # Run ASR; the first hypothesis tuple starts with the decoded text.
    text, *_ = model(speech)[0]

    # Clean up the temporary file created for microphone input, if any.
    if isinstance(audio_file, str) and audio_file.startswith(tempfile.gettempdir()):
        os.unlink(audio_file)

    return text
115
+
116
# Per-language convenience wrappers around transcribe_audio(); each one
# pins the ESPnet language symbol used by its tab in the UI.

def transcribe_english(audio_file):
    """Transcribe *audio_file* as English speech."""
    return transcribe_audio(audio_file, "<eng>")


def transcribe_chinese(audio_file):
    """Transcribe *audio_file* as Mandarin Chinese speech."""
    return transcribe_audio(audio_file, "<zho>")


def transcribe_japanese(audio_file):
    """Transcribe *audio_file* as Japanese speech."""
    return transcribe_audio(audio_file, "<jpn>")


def transcribe_korean(audio_file):
    """Transcribe *audio_file* as Korean speech."""
    return transcribe_audio(audio_file, "<kor>")


def transcribe_thai(audio_file):
    """Transcribe *audio_file* as Thai speech."""
    return transcribe_audio(audio_file, "<tha>")


def transcribe_italian(audio_file):
    """Transcribe *audio_file* as Italian speech."""
    return transcribe_audio(audio_file, "<ita>")


def transcribe_german(audio_file):
    """Transcribe *audio_file* as German speech."""
    return transcribe_audio(audio_file, "<deu>")
143
+
144
# Map UI language labels to ESPnet language symbols. Keys must match the
# choices offered in the microphone tab's Radio component exactly.
# BUG FIX: this map previously keyed Mandarin as "Chinese" while the Radio
# offered "Mandarin", so selecting Mandarin silently fell back to English.
LANG_SYMBOLS = {
    "English": "<eng>",
    "Mandarin": "<zho>",
    "Japanese": "<jpn>",
    "Korean": "<kor>",
    "Thai": "<tha>",
    "Italian": "<ita>",
    "German": "<deu>",
}


def _build_upload_tab(tab_name, transcribe_fn, example_file):
    """Create one upload-and-transcribe tab and wire its button.

    Args:
        tab_name: label shown on the tab.
        transcribe_fn: callable mapping an audio filepath to text.
        example_file: sample clip to offer via gr.Examples when it exists.
    """
    with gr.TabItem(tab_name):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio")
                button = gr.Button("Transcribe Speech")
            with gr.Column():
                output = gr.Textbox(label="Speech Transcription")

        # Offer the bundled sample clip only when it ships with the Space.
        if example_file and os.path.exists(example_file):
            gr.Examples(
                examples=[[example_file]],
                inputs=audio_input,
            )

        button.click(fn=transcribe_fn, inputs=audio_input, outputs=output)


# Create the Gradio interface with tabs
demo = gr.Blocks(title="NVIDIA Research Multilingual Demo")

with demo:
    gr.Markdown("# NVIDIA Research Multilingual Demo")
    # BUG FIX: the loaded checkpoint is the 4B model (espnet/owls_4B_180K),
    # not 9B as the description previously claimed.
    gr.Markdown("Upload or record audio to transcribe up to 150 human languages using the NVIDIA Research (NVR) 4B model. Audio will be automatically resampled to 16kHz.")

    with gr.Tabs():
        with gr.TabItem("Microphone Recording"):
            language_mic = gr.Radio(
                list(LANG_SYMBOLS),
                label="Select Language",
                value="English",
            )

            with gr.Row():
                with gr.Column():
                    mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record Audio")
                    mic_button = gr.Button("Transcribe Recording")
                with gr.Column():
                    mic_output = gr.Textbox(label="Transcription")

            def transcribe_mic(audio, lang):
                """Transcribe a microphone recording in the selected language."""
                return transcribe_audio(audio, LANG_SYMBOLS.get(lang, "<eng>"))

            mic_button.click(fn=transcribe_mic, inputs=[mic_input, language_mic], outputs=mic_output)

        _build_upload_tab("English", transcribe_english, "wav_en_sample_48k.wav")
        _build_upload_tab("Mandarin", transcribe_chinese, "wav_zh_tw_sample_16k.wav")
        _build_upload_tab("Japanese", transcribe_japanese, "wav_jp_sample_48k.wav")
        _build_upload_tab("Korean", transcribe_korean, "wav_kr_sample_48k.wav")
        _build_upload_tab("Thai", transcribe_thai, "wav_thai_sample.wav")
        _build_upload_tab("Italian", transcribe_italian, "wav_it_sample.wav")
        _build_upload_tab("German", transcribe_german, "wav_de_sample.wav")

# Launch the app with Hugging Face Spaces compatible settings
if __name__ == "__main__":
    demo.launch(share=False)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ espnet_model_zoo
3
+ espnet
4
+ librosa
5
+ soundfile
6
+ numpy
7
+ torch
8
+ torchaudio
wav_de_sample.wav ADDED
Binary file (114 kB). View file
 
wav_en_sample_48k.wav ADDED
Binary file (629 kB). View file
 
wav_it_sample.wav ADDED
Binary file (129 kB). View file
 
wav_jp_sample_48k.wav ADDED
Binary file (391 kB). View file
 
wav_kr_sample_48k.wav ADDED
Binary file (422 kB). View file
 
wav_thai_sample.wav ADDED
Binary file (227 kB). View file
 
wav_zh_tw_sample_16k.wav ADDED
Binary file (129 kB). View file