yl4579 committed
Commit 4972f24 · verified · Parent: 08848b6

Update app.py

Files changed (1): app.py (+109 -51)
app.py CHANGED
@@ -8,14 +8,65 @@ from pathlib import Path
 from huggingface_hub import hf_hub_download
 import os
 import spaces
+from transformers import pipeline
 
-# Import the inference module (assuming it's named 'infer.py' based on the notebook)
+# Import the inference module
 from infer import DMOInference
 
-# Global model instance
+# Global variables
 model = None
+asr_pipe = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
+# Initialize ASR pipeline
+def initialize_asr_pipeline(device=device, dtype=None):
+    """Initialize the ASR pipeline on startup."""
+    global asr_pipe
+
+    if dtype is None:
+        dtype = (
+            torch.float16
+            if "cuda" in device
+            and torch.cuda.is_available()
+            and torch.cuda.get_device_properties(device).major >= 7
+            and not torch.cuda.get_device_name().endswith("[ZLUDA]")
+            else torch.float32
+        )
+
+    print("Initializing ASR pipeline...")
+    try:
+        asr_pipe = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-large-v3-turbo",
+            torch_dtype=dtype,
+            device="cpu"  # Keep ASR on CPU to save GPU memory
+        )
+        print("ASR pipeline initialized successfully")
+    except Exception as e:
+        print(f"Error initializing ASR pipeline: {e}")
+        asr_pipe = None
+
+# Transcribe function
+def transcribe(ref_audio, language=None):
+    """Transcribe audio using the pre-loaded ASR pipeline."""
+    global asr_pipe
+
+    if asr_pipe is None:
+        return ""  # Return empty string if ASR is not available
+
+    try:
+        result = asr_pipe(
+            ref_audio,
+            chunk_length_s=30,
+            batch_size=128,
+            generate_kwargs={"task": "transcribe", "language": language} if language else {"task": "transcribe"},
+            return_timestamps=False,
+        )
+        return result["text"].strip()
+    except Exception as e:
+        print(f"Transcription error: {e}")
+        return ""
+
 def download_models():
     """Download models from HuggingFace Hub."""
     try:
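A caveat in the block above: the `dtype` branch probes the CUDA device (compute capability ≥ 7, not ZLUDA), yet the pipeline is then pinned to `device="cpu"`, so `float16` can end up selected for CPU inference. For orientation, a minimal standalone sketch of the same `transformers` ASR call, assuming a placeholder `ref.wav` (the Space passes Gradio's uploaded/recorded filepath instead):

```python
# Standalone sketch of the ASR pipeline added above; "ref.wav" is a
# placeholder path, not a file shipped with this Space.
import torch
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    torch_dtype=torch.float32,  # safe default for CPU inference
    device="cpu",               # mirrors the commit: ASR stays off the GPU
)
result = asr("ref.wav", chunk_length_s=30, return_timestamps=False)
print(result["text"].strip())
```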
@@ -68,20 +119,21 @@ def initialize_model():
     except Exception as e:
         return False, f"Error initializing model: {str(e)}"
 
-# Initialize model on startup
+# Initialize models on startup
+print("Initializing models...")
 model_loaded, status_message = initialize_model()
+initialize_asr_pipeline()  # Initialize ASR pipeline
 
-@spaces.GPU  # ZeroGPU allocates a slice only while this runs
+@spaces.GPU(duration=120)  # Request GPU for up to 120 seconds
 def generate_speech(
     prompt_audio,
     prompt_text,
     target_text,
     mode,
-    # Advanced settings
+    temperature,
     custom_teacher_steps,
     custom_teacher_stopping_time,
     custom_student_start_step,
-    temperature,
     verbose
 ):
     """Generate speech with different configurations."""
@@ -96,6 +148,12 @@ def generate_speech(
         return None, "Please enter text to generate!", "", ""
 
     try:
+        # Auto-transcribe when no reference text is provided
+        was_transcribed = not prompt_text or not prompt_text.strip()
+        if was_transcribed:
+            print("Auto-transcribing reference audio...")
+            prompt_text = transcribe(prompt_audio)
+            print(f"Transcribed: {prompt_text}")
         start_time = time.time()
 
         # Configure parameters based on mode
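The guard is rewritten above because the committed condition, `if not prompt_text and prompt_text != "":`, is true only when `prompt_text is None`; an empty Gradio Textbox returns `""`, so auto-transcription would never fire. A quick check of the intended predicate:

```python
# The predicate the auto-transcribe guard needs: treat None, "", and
# whitespace-only reference text as "please transcribe".
def needs_transcription(prompt_text):
    return not prompt_text or not prompt_text.strip()

assert needs_transcription(None)
assert needs_transcription("")
assert needs_transcription("   ")
assert not needs_transcription("Some call me nature.")
```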
@@ -151,7 +209,7 @@ def generate_speech(
         # Format metrics
         metrics = f"RTF: {rtf:.2f}x ({1/rtf:.2f}x speed) | Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio"
 
-        return output_path, "Success!", metrics, f"Mode: {mode}"
+        return output_path, "Success!", metrics, f"Mode: {mode} | Transcribed: {prompt_text[:50]}..." if was_transcribed else f"Mode: {mode}"
 
     except Exception as e:
         return None, f"Error: {str(e)}", "", ""
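Two notes on this return line. First, the conditional expression binds tighter than the tuple comma, so only the final element is conditional. Second, the committed test (`... if not prompt_text else ...`) was inverted: after auto-transcription fills `prompt_text` in, it is non-empty, so the transcript would never have been shown; that is what the `was_transcribed` flag above fixes. The binding rule in miniature:

```python
# In `return a, b, X if cond else Y`, only the final tuple element is
# conditional; a and b are always returned as-is.
def status(flag):
    return "ok", "metrics", "mode | transcribed" if flag else "mode"

assert status(True) == ("ok", "metrics", "mode | transcribed")
assert status(False) == ("ok", "metrics", "mode")
```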
@@ -163,7 +221,7 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as demo:
 
     Generate natural speech in any voice with just a short reference audio!
 
-    **Model Status:** {status_message} | **Device:** {device.upper()}
+    **Model Status:** {status_message} | **Device:** {device.upper()} | **ASR:** {"✅ Ready" if asr_pipe else "❌ Not available"}
     """)
 
     with gr.Row():
@@ -176,7 +234,7 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as demo:
             )
 
             prompt_text = gr.Textbox(
-                label="📝 Reference Text (optional - will auto-transcribe if empty)",
+                label="📝 Reference Text (leave empty for auto-transcription)",
                 placeholder="The text spoken in the reference audio...",
                 lines=2
             )
@@ -202,7 +260,17 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as demo:
 
             # Advanced settings (collapsible)
             with gr.Accordion("⚙️ Advanced Settings", open=False):
-                with gr.Row():
+                temperature = gr.Slider(
+                    minimum=0.0,
+                    maximum=2.0,
+                    value=0.0,
+                    step=0.1,
+                    label="Duration Temperature",
+                    info="0 = deterministic, >0 = more variation in speech rhythm"
+                )
+
+                with gr.Group(visible=False) as custom_settings:
+                    gr.Markdown("### Custom Mode Settings")
                     custom_teacher_steps = gr.Slider(
                         minimum=0,
                         maximum=32,
@@ -230,15 +298,6 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as demo:
                         info="Which student step to start from"
                     )
 
-                temperature = gr.Slider(
-                    minimum=0.0,
-                    maximum=2.0,
-                    value=0.0,
-                    step=0.1,
-                    label="Duration Temperature",
-                    info="0 = deterministic, >0 = more variation in speech rhythm"
-                )
-
                 verbose = gr.Checkbox(
                     value=False,
                     label="Verbose Output",
@@ -274,10 +333,11 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     ### 💡 Quick Tips:
 
+    - **Auto-transcription**: Leave reference text empty to auto-transcribe
     - **Student Only**: Fastest (4 steps), good quality
     - **Teacher-Guided**: Best balance (8 steps), recommended
     - **High Diversity**: More natural prosody (16 steps)
-    - **Temperature**: Add randomness to speech rhythm
+    - **Custom Mode**: Fine-tune all parameters
 
     ### 📊 Expected RTF (Real-Time Factor):
     - Student Only: ~0.05x (20x faster than real-time)
@@ -286,35 +346,30 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as demo:
     """)
 
     # Examples section
-    gr.Markdown("### 🎯 Examples")
+    gr.Markdown("### 🎯 Example Configurations")
 
-    examples = [
-        [
-            None,  # Will be replaced with actual audio path
-            "Some call me nature, others call me mother nature.",
-            "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring.",
-            "Teacher-Guided (8 steps)",
-            16, 0.07, 1, 0.0, False
-        ],
-        [
-            None,  # Will be replaced with actual audio path
-            "对,这就是我,万人敬仰的太乙真人。",
-            '突然,身边一阵笑声。我看着他们,意气风发地挺直了胸膛,甩了甩那稍显肉感的双臂,轻笑道:"我身上的肉,是为了掩饰我爆棚的魅力,否则,岂不吓坏了你们呢?"',
-            "Teacher-Guided (8 steps)",
-            16, 0.07, 1, 0.0, False
-        ],
-        [
-            None,
-            "对,这就是我,万人敬仰的太乙真人。",
-            '突然,身边一阵笑声。我看着他们,意气风发地挺直了胸膛,甩了甩那稍显肉感的双臂,轻笑道:"我身上的肉,是为了掩饰我爆棚的魅力,否则,岂不吓坏了你们呢?"',
-            "High Diversity (16 steps)",
-            24, 0.3, 2, 0.8, False
-        ]
-    ]
-
-    # Note about example audio files
     gr.Markdown("""
-    *Note: Example audio files should be uploaded to the Space. The examples above show the text configurations used in the original notebook.*
+    <details>
+    <summary>English Example</summary>
+
+    **Reference text:** "Some call me nature, others call me mother nature."
+
+    **Target text:** "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
+    </details>
+
+    <details>
+    <summary>Chinese Example</summary>
+
+    **Reference text:** "对,这就是我,万人敬仰的太乙真人。"
+
+    **Target text:** "突然,身边一阵笑声。我看着他们,意气风发地挺直了胸膛,甩了甩那稍显肉感的双臂,轻笑道:'我身上的肉,是为了掩饰我爆棚的魅力,否则,岂不吓坏了你们呢?'"
+    </details>
+
+    <details>
+    <summary>High Diversity Chinese Example</summary>
+
+    Same as above but with **Temperature: 0.8** for more natural variation in speech rhythm.
+    </details>
     """)
 
     # Event handler
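The example text moves out of an `examples` list (never wired to a `gr.Examples` component in this diff) into collapsible markdown; `gr.Markdown` passes raw HTML through, which is what makes the `<details>`/`<summary>` blocks fold. The same idea in isolation:

```python
# Minimal sketch: collapsible sections inside gr.Markdown via raw HTML.
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("""
    <details>
    <summary>Click to expand</summary>

    Hidden until expanded.
    </details>
    """)

if __name__ == "__main__":
    demo.launch()
```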
@@ -325,10 +380,10 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as demo:
             prompt_text,
             target_text,
             mode,
+            temperature,
             custom_teacher_steps,
             custom_teacher_stopping_time,
             custom_student_start_step,
-            temperature,
             verbose
         ],
         outputs=[output_audio, status, metrics, info]
@@ -336,17 +391,20 @@ with gr.Blocks(title="DMOSpeech 2 - Zero-Shot TTS", theme=gr.themes.Soft()) as demo:
 
     # Update visibility of custom settings based on mode
     def update_custom_visibility(mode):
-        return gr.update(visible=(mode == "Custom"))
+        is_custom = (mode == "Custom")
+        return gr.update(visible=is_custom)
 
     mode.change(
-        lambda x: [gr.update(interactive=(x == "Custom"))] * 3,
+        update_custom_visibility,
         inputs=[mode],
-        outputs=[custom_teacher_steps, custom_teacher_stopping_time, custom_student_start_step]
+        outputs=[custom_settings]
     )
 
 # Launch the app
 if __name__ == "__main__":
     if not model_loaded:
         print(f"Warning: Model failed to load - {status_message}")
+    if not asr_pipe:
+        print("Warning: ASR pipeline not available - auto-transcription disabled")
 
     demo.launch()
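The rewiring above changes behavior as well as style: the old lambda greyed out three sliders with `interactive=...`, while the new callback shows or hides the whole `custom_settings` group. The complete toggle pattern, reduced to a sketch with illustrative component names:

```python
# Sketch of the show/hide wiring used for the Custom-mode settings above.
import gradio as gr

with gr.Blocks() as demo:
    mode = gr.Radio(["Teacher-Guided (8 steps)", "Custom"], value="Custom", label="Mode")
    with gr.Group(visible=False) as custom_settings:
        gr.Slider(0, 32, value=16, label="Teacher Steps")

    mode.change(
        lambda m: gr.update(visible=(m == "Custom")),
        inputs=[mode],
        outputs=[custom_settings],
    )

if __name__ == "__main__":
    demo.launch()
```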
 