sudoping01 committed
Commit fa84412 · verified · 1 Parent(s): 92275ac

Update app.py

Files changed (1)
  app.py  +190 -157
app.py CHANGED
@@ -2,52 +2,63 @@ import gradio as gr
  import numpy as np
  import os
  import spaces
+ import logging
  from huggingface_hub import login

- # DO NOT import maliba_ai here - it will cause CUDA errors
- # from maliba_ai.tts.inference import BambaraTTSInference
- # from maliba_ai.config.speakers import Adame, Moussa, Bourama, Modibo, Seydou
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)

  hf_token = os.getenv("HF_TOKEN")
  if hf_token:
      login(token=hf_token)

- # Global variable to store the TTS instance
- tts_instance = None
- SPEAKERS = None
+ # Global variables for model and speakers
+ tts_model = None
+ speakers_dict = None
+ model_initialized = False

- def initialize_tts():
-     """Initialize TTS model and speakers - only called inside GPU context"""
-     global tts_instance, SPEAKERS
-
-     if tts_instance is None:
-         print("Loading Bambara TTS model...")
-
-         # Import here to avoid CUDA initialization during app startup
-         from maliba_ai.tts.inference import BambaraTTSInference
-         from maliba_ai.config.speakers import Adame, Moussa, Bourama, Modibo, Seydou
-
-         tts_instance = BambaraTTSInference()
-
-         SPEAKERS = {
-             "Adame": Adame,
-             "Moussa": Moussa,
-             "Bourama": Bourama,
-             "Modibo": Modibo,
-             "Seydou": Seydou
-         }
-
-         print("Model loaded successfully!")
-
-     return tts_instance, SPEAKERS
+ @spaces.GPU()
+ def initialize_model():
+     """Initialize the TTS model and speakers - called once with GPU context"""
+     global tts_model, speakers_dict, model_initialized
+
+     if not model_initialized:
+         logger.info("Initializing Bambara TTS model...")
+
+         try:
+             # Import inside GPU context to avoid CUDA initialization errors
+             from maliba_ai.tts.inference import BambaraTTSInference
+             from maliba_ai.config.speakers import Adame, Moussa, Bourama, Modibo, Seydou
+
+             # Initialize model
+             tts_model = BambaraTTSInference()
+
+             # Initialize speakers
+             speakers_dict = {
+                 "Adame": Adame,
+                 "Moussa": Moussa,
+                 "Bourama": Bourama,
+                 "Modibo": Modibo,
+                 "Seydou": Seydou
+             }
+
+             model_initialized = True
+             logger.info("Model initialized successfully!")
+
+         except Exception as e:
+             logger.error(f"Failed to initialize model: {e}")
+             raise e
+
+     return tts_model, speakers_dict

  def validate_inputs(text, temperature, top_k, top_p, max_tokens):
      """Validate user inputs"""
      if not text or not text.strip():
          return False, "Please enter some Bambara text."

-     if not (0.001 <= temperature <= 1):
-         return False, "Temperature must be between 0.001 and 1"
+     if not (0.001 <= temperature <= 2.0):
+         return False, "Temperature must be between 0.001 and 2.0"

      if not (1 <= top_k <= 100):
          return False, "Top-K must be between 1 and 100"
@@ -59,13 +70,14 @@ def validate_inputs(text, temperature, top_k, top_p, max_tokens):

  @spaces.GPU()
  def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p, max_tokens):
+     """Generate speech using the pre-loaded model"""

      if not text.strip():
          return None, "Please enter some Bambara text."

      try:
-         # Initialize TTS inside GPU context
-         tts, speakers = initialize_tts()
+         # Get the initialized model and speakers
+         tts, speakers = initialize_model()

          speaker = speakers[speaker_name]

@@ -95,12 +107,10 @@ def generate_speech(text, speaker_name, use_advanced, temperature, top_k, top_p, max_tokens):
          return (sample_rate, waveform), f"✅ Audio generated successfully"

      except Exception as e:
-         import traceback
-         error_msg = f"❌ Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
-         print(error_msg) # Log to console for debugging
+         logger.error(f"Speech generation failed: {e}")
          return None, f"❌ Error: {str(e)}"

- # Define speaker names for UI (without importing the actual speaker objects)
+ # Define speaker names for UI
  SPEAKER_NAMES = ["Adame", "Moussa", "Bourama", "Modibo", "Seydou"]

  examples = [
@@ -115,143 +125,166 @@ examples = [
      ["Aw ni ce. Seidu bɛ aw fo wa aw ka yafa a ma, ka da a kan tuma dɔw la kow ka can.", "Modibo"],
  ]

- # Create Gradio interface
- with gr.Blocks(title="Bambara TTS - EXPERIMENTAL", theme=gr.themes.Soft()) as demo:
-     gr.Markdown("""
-     # 🎤 Bambara Text-to-Speech ⚠️ EXPERIMENTAL
-
-     Convert Bambara text to speech using AI. This model is currently experimental.
-
-     **Bambara** is spoken by millions of people in Mali and West Africa.
-
-     ⚡ **Note**: The model will load when you first generate speech (may take a moment).
-     """)
-
-     with gr.Row():
-         with gr.Column(scale=2):
-             # Input section
-             text_input = gr.Textbox(
-                 label="📝 Bambara Text",
-                 placeholder="Type your Bambara text here...",
-                 lines=3,
-                 max_lines=6,
-                 value="Aw ni ce"
-             )
-
-             speaker_dropdown = gr.Dropdown(
-                 choices=SPEAKER_NAMES,
-                 value="Adame",
-                 label="🗣️ Speaker Voice"
-             )
-
-             generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
-
-         with gr.Column(scale=1):
-             use_advanced = gr.Checkbox(
-                 label="⚙️ Use Advanced Settings",
-                 value=False,
-                 info="Enable to customize generation parameters"
-             )
-
-             with gr.Group(visible=False) as advanced_group:
-                 gr.Markdown("**Advanced Parameters:**")
-
-                 temperature = gr.Slider(
-                     minimum=0.1,
-                     maximum=2.0,
-                     value=0.8,
-                     step=0.1,
-                     label="Temperature",
-                     info="Higher = more varied"
-                 )
-
-                 top_k = gr.Slider(
-                     minimum=1,
-                     maximum=100,
-                     value=50,
-                     step=5,
-                     label="Top-K"
-                 )
-
-                 top_p = gr.Slider(
-                     minimum=0.1,
-                     maximum=1.0,
-                     value=0.9,
-                     step=0.05,
-                     label="Top-P"
-                 )
-
-                 max_tokens = gr.Slider(
-                     minimum=256,
-                     maximum=4096,
-                     value=2048,
-                     step=256,
-                     label="Max Length"
-                 )
-
-     gr.Markdown("### 🔊 Generated Audio")
-
-     audio_output = gr.Audio(
-         label="Generated Speech",
-         type="numpy",
-         interactive=False
-     )
-
-     status_output = gr.Textbox(
-         label="Status",
-         interactive=False,
-         show_label=False,
-         container=False
-     )
-
-     with gr.Accordion("📚 Try These Examples", open=True):
-         def load_example(text, speaker):
-             return text, speaker, False, 0.8, 50, 0.9, 2048
-
-         gr.Markdown("**Click any example below:**")
-         example_buttons = []
-
-         for i, (text, speaker) in enumerate(examples):
-             btn = gr.Button(f"🎯 {text[:30]}{'...' if len(text) > 30 else ''}", size="sm")
-             btn.click(
-                 fn=lambda t=text, s=speaker: load_example(t, s),
-                 outputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens]
-             )
-
-     # Information section
-     with gr.Accordion("ℹ️ About", open=False):
-         gr.Markdown("""
-         **⚠️ This is an experimental Bambara TTS model.**
-
-         - The model loads automatically when you first generate speech
-         - First generation may take longer due to model initialization
-         - GPU acceleration is used for optimal performance
-         """)
-
-     def toggle_advanced(use_adv):
-         return gr.Group(visible=use_adv)
-
-     use_advanced.change(
-         fn=toggle_advanced,
-         inputs=[use_advanced],
-         outputs=[advanced_group]
-     )
-
-     generate_btn.click(
-         fn=generate_speech,
-         inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
-         outputs=[audio_output, status_output]
-     )
-
-     text_input.submit(
-         fn=generate_speech,
-         inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
-         outputs=[audio_output, status_output]
-     )
+ def build_interface():
+     """Build the Gradio interface for Bambara TTS"""
+
+     with gr.Blocks(title="Bambara TTS - EXPERIMENTAL", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("""
+         # 🎤 Bambara Text-to-Speech ⚠️ EXPERIMENTAL
+
+         Convert Bambara text to speech using AI. This model is currently experimental.
+
+         **Bambara** is spoken by millions of people in Mali and West Africa.
+
+         ⚡ **Note**: Model loads automatically on first use and stays loaded for optimal performance.
+         """)
+
+         with gr.Row():
+             with gr.Column(scale=2):
+                 # Input section
+                 text_input = gr.Textbox(
+                     label="📝 Bambara Text",
+                     placeholder="Type your Bambara text here...",
+                     lines=3,
+                     max_lines=6,
+                     value="Aw ni ce"
+                 )
+
+                 speaker_dropdown = gr.Dropdown(
+                     choices=SPEAKER_NAMES,
+                     value="Adame",
+                     label="🗣️ Speaker Voice"
+                 )
+
+                 generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
+
+             with gr.Column(scale=1):
+                 use_advanced = gr.Checkbox(
+                     label="⚙️ Use Advanced Settings",
+                     value=False,
+                     info="Enable to customize generation parameters"
+                 )
+
+                 with gr.Group(visible=False) as advanced_group:
+                     gr.Markdown("**Advanced Parameters:**")
+
+                     temperature = gr.Slider(
+                         minimum=0.1,
+                         maximum=2.0,
+                         value=0.8,
+                         step=0.1,
+                         label="Temperature",
+                         info="Higher = more varied"
+                     )
+
+                     top_k = gr.Slider(
+                         minimum=1,
+                         maximum=100,
+                         value=50,
+                         step=5,
+                         label="Top-K"
+                     )
+
+                     top_p = gr.Slider(
+                         minimum=0.1,
+                         maximum=1.0,
+                         value=0.9,
+                         step=0.05,
+                         label="Top-P"
+                     )
+
+                     max_tokens = gr.Slider(
+                         minimum=256,
+                         maximum=4096,
+                         value=2048,
+                         step=256,
+                         label="Max Length"
+                     )
+
+         gr.Markdown("### 🔊 Generated Audio")
+
+         audio_output = gr.Audio(
+             label="Generated Speech",
+             type="numpy",
+             interactive=False
+         )
+
+         status_output = gr.Textbox(
+             label="Status",
+             interactive=False,
+             show_label=False,
+             container=False
+         )
+
+         with gr.Accordion("📚 Try These Examples", open=True):
+             def load_example(text, speaker):
+                 return text, speaker, False, 0.8, 50, 0.9, 2048
+
+             gr.Markdown("**Click any example below:**")
+
+             for i, (text, speaker) in enumerate(examples):
+                 btn = gr.Button(f"🎯 {text[:30]}{'...' if len(text) > 30 else ''}", size="sm")
+                 btn.click(
+                     fn=lambda t=text, s=speaker: load_example(t, s),
+                     outputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens]
+                 )
+
+         # Information section
+         with gr.Accordion("ℹ️ About", open=False):
+             gr.Markdown("""
+             **⚠️ This is an experimental Bambara TTS model.**
+
+             - **Model**: Based on SparkTTS architecture with BiCodec
+             - **Languages**: Bambara (bm)
+             - **Speakers**: 5 different voice options
+             - **Sample Rate**: 16kHz
+             - **Architecture**: Neural codec with semantic and global tokens
+
+             ## 🚀 How to Use
+
+             1. **Enter Text**: Type your Bambara text in the input box
+             2. **Choose Speaker**: Select from 5 available voice options
+             3. **Advanced Settings**: Optionally adjust generation parameters
+             4. **Generate**: Click the generate button to create speech
+             """)
+
+         def toggle_advanced(use_adv):
+             return gr.Group(visible=use_adv)
+
+         use_advanced.change(
+             fn=toggle_advanced,
+             inputs=[use_advanced],
+             outputs=[advanced_group]
+         )
+
+         generate_btn.click(
+             fn=generate_speech,
+             inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
+             outputs=[audio_output, status_output],
+             show_progress=True
+         )
+
+         text_input.submit(
+             fn=generate_speech,
+             inputs=[text_input, speaker_dropdown, use_advanced, temperature, top_k, top_p, max_tokens],
+             outputs=[audio_output, status_output],
+             show_progress=True
+         )
+
+     return demo
+
+ def main():
+     """Main function to launch the Gradio interface"""
+     logger.info("Starting Bambara TTS Gradio interface.")
+
+     interface = build_interface()
+     interface.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False
+     )
+
+     logger.info("Gradio interface launched successfully.")

  if __name__ == "__main__":
-     demo.launch(
-         server_name="0.0.0.0",
-         server_port=7860,
-         share=False
-     )
+     main()
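
For reference, the pattern this commit adopts: on ZeroGPU Spaces, anything that initializes CUDA must run inside a @spaces.GPU() function rather than at import time, so the model is imported and constructed lazily and cached in module-level globals. A minimal sketch of that pattern, assuming a hypothetical my_tts_package.load_model() standing in for BambaraTTSInference (names below are placeholders, not part of this repo):

import spaces

_model = None  # module-level cache so the heavy load happens only once

@spaces.GPU()
def synthesize(text):
    global _model
    if _model is None:
        # Import and construct inside the GPU context, never at app startup,
        # so the main process never touches CUDA.
        from my_tts_package import load_model  # hypothetical import
        _model = load_model()
    return _model.generate(text)

As in the updated app.py, the first request pays the load cost and later requests reuse the cached instance.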