sudoping01 committed on
Commit 254ef61 · verified · 1 Parent(s): 379fa64

Update app.py

Files changed (1):
  1. app.py +186 -198
app.py CHANGED
@@ -1,75 +1,59 @@
- import os
-
- os.environ["TORCHDYNAMO_DISABLE"] = "1"
- os.environ["TORCH_COMPILE_DISABLE"] = "1"
- os.environ["PYTORCH_DISABLE_CUDNN_BENCHMARK"] = "1"
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
- import torch
  import gradio as gr
  import numpy as np
  import spaces
- import logging
  from huggingface_hub import login
- import threading
-

- torch._dynamo.config.disable = True
- torch._dynamo.config.suppress_errors = True
-
-
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)

  hf_token = os.getenv("HF_TOKEN")
  if hf_token:
      login(token=hf_token)


- tts_model = None
- speakers_dict = None
- model_initialized = False
- model_initialized_lock = threading.Lock()
-
- @spaces.GPU()
- def initialize_model():
-     """Initialize the TTS model and speakers - called once with GPU context"""
-     global tts_model, speakers_dict, model_initialized, model_initialized_lock
-
-     with model_initialized_lock :
-         if not model_initialized:
-             logger.info("Initializing Bambara TTS model...")
-
-             try:
-                 from maliba_ai.tts.inference import BambaraTTSInference
-                 from maliba_ai.config.speakers import Adame, Moussa, Bourama, Modibo, Seydou
-
-                 tts_model = BambaraTTSInference()
-
-                 speakers_dict = {
-                     "Adame": Adame,
-                     "Moussa": Moussa,
-                     "Bourama": Bourama,
-                     "Modibo": Modibo,
-                     "Seydou": Seydou
-                 }
-
-                 model_initialized = True
-                 logger.info("Model initialized successfully!")
-
-             except Exception as e:
-                 logger.error(f"Failed to initialize model: {e}")
-                 raise e

-     return tts_model, speakers_dict

  def validate_inputs(text, temperature, top_k, top_p, max_tokens):
      if not text or not text.strip():
          return False, "Please enter some Bambara text."

-     if not (0.001 <= temperature <= 2.0):
-         return False, "Temperature must be between 0.001 and 2.0"

      if not (1 <= top_k <= 100):
          return False, "Top-K must be between 1 and 100"
@@ -77,20 +61,24 @@ def validate_inputs(text, temperature, top_k, top_p, max_tokens):
      if not (0.1 <= top_p <= 1.0):
          return False, "Top-P must be between 0.1 and 1.0"

      return True, ""

  @spaces.GPU()
  def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p, max_tokens):
-     global tts_model, speakers_dict, model_initialized, model_initialized_lock

      if not text.strip():
          return None, "Please enter some Bambara text."

      try:
-         with model_initialized_lock :
-             if not model_initialized :
-                 tts, speakers = initialize_model()
-
-         speaker = speakers[speaker_name]

          if use_advanced:
              is_valid, error_msg = validate_inputs(text, temperature, top_k, top_p, max_tokens)
@@ -106,6 +94,7 @@ def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p,
                  max_new_audio_tokens=int(max_tokens)
              )
          else:
              waveform = tts.generate_speech(
                  text=text.strip(),
                  speaker_id=speaker
@@ -118,12 +107,9 @@ def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p,
          return (sample_rate, waveform), f"✅ Audio generated successfully"

      except Exception as e:
-         logger.error(f"Speech generation failed: {e}")
          return None, f"❌ Error: {str(e)}"


- SPEAKER_NAMES = ["Adame", "Moussa", "Bourama", "Modibo", "Seydou"]
-
  examples = [
      ["Aw ni ce", "Adame"],
      ["Mali bɛna diya kɔsɛbɛ, ka a da a kan baara bɛ ka kɛ.", "Moussa"],
@@ -137,157 +123,159 @@ examples = [

  ]

- def build_interface():
-     """Build the Gradio interface for Bambara TTS"""
-
-     with gr.Blocks(title="Bambara TTS - EXPERIMENTAL") as demo:
-         gr.Markdown("""
-         # 🎤 Bambara Text-to-Speech ⚠️ EXPERIMENTAL
-
-         **Powered by MALIBA-AI**
-
-         Convert Bambara text to speech. This model is currently experimental.
-
-         **Bambara** is spoken by millions of people in Mali and West Africa.
-         .
-         """)
-

-         with gr.Row():
-             with gr.Column(scale=2):
-                 text_input = gr.Textbox(
-                     label="📝 Bambara Text",
-                     placeholder="Type your Bambara text here...",
-                     lines=3,
-                     max_lines=10,
-                     value="I ni ce"
-                 )

-                 speaker_dropdown = gr.Dropdown(
-                     choices=SPEAKER_NAMES,
-                     value="Adame",
-                     label="🗣️ Speaker Voice"
                  )

-                 generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

-             with gr.Column(scale=1):
-                 use_advanced = gr.Checkbox(
-                     label="⚙️ Use Advanced Settings",
-                     value=False,
-                     info="Enable to customize generation parameters"
                  )

-                 with gr.Group(visible=False) as advanced_group:
-                     gr.Markdown("**Advanced Parameters:**")
-
-                     temperature = gr.Slider(
-                         minimum=0.1,
-                         maximum=2.0,
-                         value=0.8,
-                         step=0.1,
-                         label="Temperature",
-                         info="Higher = more varied"
-                     )
-
-                     top_k = gr.Slider(
-                         minimum=1,
-                         maximum=100,
-                         value=50,
-                         step=5,
-                         label="Top-K"
-                     )
-
-                     top_p = gr.Slider(
-                         minimum=0.1,
-                         maximum=1.0,
-                         value=0.9,
-                         step=0.05,
-                         label="Top-P"
-                     )
-
-                     max_tokens = gr.Slider(
-                         minimum=256,
-                         maximum=4096,
-                         value=2048,
-                         step=256,
-                         label="Max Length"
-                     )
-
-         gr.Markdown("### 🔊 Generated Audio")
-
-         audio_output = gr.Audio(
-             label="Generated Speech",
-             type="numpy",
-             interactive=False
-         )
-
-         status_output = gr.Textbox(
-             label="Status",
-             interactive=False,
-             show_label=False,
-             container=False
-         )
-
-         with gr.Accordion("Try These Examples", open=True):
-             def load_example(text, speaker):
-                 return text, speaker, False, 0.8, 50, 0.9, 2048
-
-             gr.Markdown("**Click any example below:**")
-
-             for i, (text, speaker) in enumerate(examples):
-                 btn = gr.Button(f" {text[:30]}{'...' if len(text) > 30 else ''}", size="sm")
-                 btn.click(
-                     fn=lambda t=text, s=speaker: load_example(t, s),
-                     outputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens]
                  )

-         with gr.Accordion(" About", open=False):
-             gr.Markdown("""
-             **⚠️ This is an experimental Bambara TTS model.**
-             - **Languages**: Bambara (bm)
-             - **Speakers**: 5 different voice options
-             - **Sample Rate**: 16kHz
-             """)

-         def toggle_advanced(use_adv):
-             return gr.Group(visible=use_adv)

-         use_advanced.change(
-             fn=toggle_advanced,
-             inputs=[use_advanced],
-             outputs=[advanced_group]
-         )

-         generate_btn.click(
-             fn=generate_speech,
-             inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
-             outputs=[audio_output, status_output],
-             show_progress=True
-         )

-         text_input.submit(
-             fn=generate_speech,
-             inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
-             outputs=[audio_output, status_output],
-             show_progress=True
-         )

-     return demo
-
- def main():
-     """Main function to launch the Gradio interface"""
-     logger.info("Starting Bambara TTS Gradio interface.")

-     interface = build_interface()
-     interface.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False
      )

-     logger.info("Gradio interface launched successfully.")

  if __name__ == "__main__":
-     main()
app.py (new version):

  import gradio as gr
  import numpy as np
+ import os
  import spaces
+ import sys
  from huggingface_hub import login


  hf_token = os.getenv("HF_TOKEN")
  if hf_token:
      login(token=hf_token)


+ try:
+     from maliba_ai.tts.inference import BambaraTTSInference
+     from maliba_ai.config.speakers import Adame, Moussa, Bourama, Modibo, Seydou

+
+     print("Loading Bambara TTS model...")
+     tts = BambaraTTSInference()
+     print("Model loaded successfully!")

+     MODEL_LOADED = True
+ except Exception as e:
+     print(f"Error loading model: {e}")
+     MODEL_LOADED = False
+     tts = None
+
+
+ if MODEL_LOADED:
+     SPEAKERS = {
+         "Adame": Adame,
+         "Moussa": Moussa,
+         "Bourama": Bourama,
+         "Modibo": Modibo,
+         "Seydou": Seydou
+     }
+ else:
+     SPEAKERS = {
+         "Adame": "Adame",
+         "Moussa": "Moussa",
+         "Bourama": "Bourama",
+         "Modibo": "Modibo",
+         "Seydou": "Seydou"
+     }

  def validate_inputs(text, temperature, top_k, top_p, max_tokens):
48
+ """Validate user inputs"""
49
  if not text or not text.strip():
50
  return False, "Please enter some Bambara text."
51
 
52
+ if len(text.strip()) > 1000:
53
+ return False, "Text is too long. Please use shorter text (max 1000 characters)."
54
+
55
+ if not (0.1 <= temperature <= 2.0):
56
+ return False, "Temperature must be between 0.1 and 2.0"
57
 
58
  if not (1 <= top_k <= 100):
59
  return False, "Top-K must be between 1 and 100"
 
61
  if not (0.1 <= top_p <= 1.0):
62
  return False, "Top-P must be between 0.1 and 1.0"
63
 
64
+ if not (256 <= max_tokens <= 4096):
65
+ return False, "Max tokens must be between 256 and 4096"
66
+
67
  return True, ""
68
 
69
  @spaces.GPU()
70
  def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p, max_tokens):
71
+ """Generate speech from Bambara text"""
72
+
73
+ if not MODEL_LOADED:
74
+ return None, "❌ Model not loaded. Please check the logs for errors."
75
+
76
  if not text.strip():
77
  return None, "Please enter some Bambara text."
78
 
79
  try:
80
+
81
+ speaker = SPEAKERS[speaker_name]
 
 
 
82
 
83
  if use_advanced:
84
  is_valid, error_msg = validate_inputs(text, temperature, top_k, top_p, max_tokens)
 
94
  max_new_audio_tokens=int(max_tokens)
95
  )
96
  else:
97
+
98
  waveform = tts.generate_speech(
99
  text=text.strip(),
100
  speaker_id=speaker
 
107
  return (sample_rate, waveform), f"✅ Audio generated successfully"
108
 
109
  except Exception as e:
 
110
  return None, f"❌ Error: {str(e)}"
111
 
112
 
 
 
113
  examples = [
114
  ["Aw ni ce", "Adame"],
115
  ["Mali bɛna diya kɔsɛbɛ, ka a da a kan baara bɛ ka kɛ.", "Moussa"],
 
123
 
124
  ]
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ with gr.Blocks(title="Bambara TTS - EXPERIMENTAL", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+     # 🎤 Bambara Text-to-Speech ⚠️ EXPERIMENTAL
+
+     Convert Bambara text to speech using AI. This model is currently experimental.
+
+     **Bambara** is spoken by millions of people in Mali and West Africa.
+     """)
+
+     with gr.Row():
+         with gr.Column(scale=2):
+             # Input section
+             text_input = gr.Textbox(
+                 label="📝 Bambara Text",
+                 placeholder="Type your Bambara text here...",
+                 lines=3,
+                 max_lines=6,
+                 value="Aw ni ce"
+             )
+
+             # Speaker selection
+             speaker_dropdown = gr.Dropdown(
+                 choices=list(SPEAKERS.keys()),
+                 value="Adame",
+                 label="🗣️ Speaker Voice"
+             )
+
+             # Generation button
+             generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
+
+         with gr.Column(scale=1):
+             # Advanced settings toggle
+             use_advanced = gr.Checkbox(
+                 label="⚙️ Use Advanced Settings",
+                 value=False,
+                 info="Enable to customize generation parameters"
+             )
+
+             # Advanced settings (hidden by default)
+             with gr.Group(visible=False) as advanced_group:
+                 gr.Markdown("**Advanced Parameters:**")

+                 temperature = gr.Slider(
+                     minimum=0.1,
+                     maximum=2.0,
+                     value=0.8,
+                     step=0.1,
+                     label="Temperature",
+                     info="Higher = more varied"
                  )

+                 top_k = gr.Slider(
+                     minimum=1,
+                     maximum=100,
+                     value=50,
+                     step=5,
+                     label="Top-K"
+                 )

+                 top_p = gr.Slider(
+                     minimum=0.1,
+                     maximum=1.0,
+                     value=0.9,
+                     step=0.05,
+                     label="Top-P"
                  )

+                 max_tokens = gr.Slider(
+                     minimum=256,
+                     maximum=4096,
+                     value=2048,
+                     step=256,
+                     label="Max Length"
                  )
+
+     # Output section
+     gr.Markdown("### 🔊 Generated Audio")
+
+     audio_output = gr.Audio(
+         label="Generated Speech",
+         type="numpy",
+         interactive=False
+     )

+     status_output = gr.Textbox(
+         label="Status",
+         interactive=False,
+         show_label=False,
+         container=False
+     )
+
+     # Examples section
+     with gr.Accordion("📚 Try These Examples", open=True):
+         def load_example(text, speaker):
+             return text, speaker, False, 0.8, 50, 0.9, 2048

+         gr.Markdown("**Click any example below:**")
+         example_buttons = []

+         for i, (text, speaker) in enumerate(examples):
+             btn = gr.Button(f"🎯 {text[:30]}{'...' if len(text) > 30 else ''}", size="sm")
+             btn.click(
+                 fn=lambda t=text, s=speaker: load_example(t, s),
+                 outputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens]
+             )
+
+     # Information section
+     with gr.Accordion("ℹ️ About", open=False):
+         gr.Markdown("""
+         **⚠️ This is an experimental Bambara TTS model.**
+
+         **Common Bambara Phrases:**
+         - **Aw ni ce** - Hello (formal)
+         - **I ni ce** - Hello (informal)
+         - **I ka kene wa?** - How are you?
+         - **Aw ni tile** - Good afternoon

+         **Available Speakers:** Adame, Moussa, Bourama, Modibo, Seydou

+         **Tips:**
+         - Start with default settings
+         - Use shorter texts for better results
+         - Try different speakers for variety
+         """)

+     # Toggle advanced settings visibility
+     def toggle_advanced(use_adv):
+         return gr.Group(visible=use_adv)

+     use_advanced.change(
+         fn=toggle_advanced,
+         inputs=[use_advanced],
+         outputs=[advanced_group]
+     )
+
+     # Wire up the interface
+     generate_btn.click(
+         fn=generate_speech,
+         inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
+         outputs=[audio_output, status_output]
      )

+     # Auto-generate on Enter key in text input
+     text_input.submit(
+         fn=generate_speech,
+         inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
+         outputs=[audio_output, status_output]
+     )

  if __name__ == "__main__":
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False
+     )
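
A quick way to exercise the refactored module outside the Space is to import it and call generate_speech directly. The sketch below is not part of the commit: it assumes the new app.py is on the import path, that the maliba_ai package is installed, and that the spaces.GPU() decorator degrades to a no-op outside a ZeroGPU Space; the file name smoke_test.py is hypothetical.

# smoke_test.py -- hypothetical local check, not part of this commit.
# Importing app runs the module-level model load introduced by this change;
# demo.launch() stays guarded behind __name__ == "__main__" and does not run.
import app

if app.MODEL_LOADED:
    # Positional arguments mirror the Gradio inputs wired to generate_speech,
    # using the interface's default slider values.
    audio, status = app.generate_speech(
        "Aw ni ce",          # text
        "Adame",             # speaker_name
        False,               # use_advanced
        0.8, 50, 0.9, 2048,  # temperature, top_k, top_p, max_tokens
    )
    print(status)
    if audio is not None:
        sample_rate, waveform = audio
        print(f"{len(waveform)} samples at {sample_rate} Hz")
else:
    print("Model failed to load; see the messages printed during import.")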