tee342 committed
Commit 987f28e · verified · 1 Parent(s): 7009896

Update app.py

Files changed (1)
  1. app.py +68 -23
app.py CHANGED
@@ -24,6 +24,17 @@ from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
 from TTS.api import TTS
 import pickle
 
+# Try to install OpenVoice from GitHub if not found
+try:
+    from openvoice.api import TTS as OpenVoiceTTS, ToneColorConverter
+    from openvoice.se_extractor import get_se
+except ImportError:
+    print("Installing OpenVoice from GitHub...")
+    import subprocess
+    subprocess.run(["pip", "install", "git+https://github.com/myshell-ai/OpenVoice.git"])
+    from openvoice.api import TTS as OpenVoiceTTS, ToneColorConverter
+    from openvoice.se_extractor import get_se
+
 # Suppress warnings
 warnings.filterwarnings("ignore")
 
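Note: the hunk above installs OpenVoice at import time by shelling out to pip when the import fails. As a sketch only (not part of this commit), an equivalent guard that pins the install to the interpreter running app.py and retries the import once could look like the following; ensure_package and its arguments are illustrative names:

import importlib
import subprocess
import sys

def ensure_package(module_name, pip_spec):
    # Import module_name, installing pip_spec into the current interpreter if it is missing.
    try:
        return importlib.import_module(module_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_spec])
        return importlib.import_module(module_name)

# Hypothetical usage mirroring the diff:
# openvoice_api = ensure_package("openvoice.api", "git+https://github.com/myshell-ai/OpenVoice.git")

Using sys.executable avoids installing the package into a different environment than the one serving the app.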
@@ -266,7 +277,7 @@ def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, ex
     except Exception as e:
         return None, f"❌ Batch processing failed: {str(e)}"
 
-# === Whisper Transcription Tab ===
+# === Transcribe & Edit Tab ===
 whisper_model = WhisperModel("base")
 
 def transcribe_audio(audio_path):
@@ -274,7 +285,7 @@ def transcribe_audio(audio_path):
     text = " ".join([seg.text for seg in segments])
     return text
 
-# === TTS Voice Generator ===
+# === TTS Tab ===
 tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
 
 def generate_tts(text):
@@ -326,6 +337,31 @@ def mix_tracks(track1, track2, volume_offset=0):
     mixed.export(out_path, format="wav")
     return out_path
 
+# === Voice Cloning / Dubbing Tab ===
+def clone_voice(source_audio, target_audio, text):
+    try:
+        source_se, _ = get_se(source_audio)
+        target_se, _ = get_se(target_audio)
+
+        # Generate base TTS
+        out_path = os.path.join(tempfile.gettempdir(), "cloned_output.wav")
+        tts.tts_to_file(text=text, file_path=out_path)
+
+        # Apply voice conversion
+        tone_converter.convert(
+            audio_src_path=out_path,
+            src_se=source_se,
+            tgt_se=target_se,
+            output_path=out_path
+        )
+
+        return out_path
+    except Exception as e:
+        return f"⚠️ Cloning failed: {str(e)}"
+
+tone_converter = ToneColorConverter().to("cuda" if torch.cuda.is_available() else "cpu")
+openvoice_tts = OpenVoiceTTS(lang='en')
+
 # === Speaker Diarization ("Who Spoke When?") ===
 try:
     from pyannote.audio import Pipeline as DiarizationPipeline
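Note: clone_voice() extracts an embedding from each clip with get_se, synthesizes the text with the existing Coqui tts model, then remaps the voice with tone_converter.convert; defining tone_converter below the function works because the global is only looked up when the function is called. A minimal smoke test, assuming two local sample clips (the paths are placeholders):

# Sketch only: exercise the new clone_voice() helper locally.
result = clone_voice("samples/source.wav", "samples/target.wav", "Hello from the cloned voice.")
if result.startswith("⚠️"):
    print(result)                               # error string returned by the except branch
else:
    print("Cloned audio written to:", result)   # path to cloned_output.wav in the temp dir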
@@ -334,31 +370,25 @@ try:
     hf_token = os.getenv("HF_TOKEN")
     if hf_token:
         login(token=hf_token)
-    else:
-        print("⚠️ HF_TOKEN not set – speaker diarization disabled")
-
     diarize_pipeline = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token or True)
-except ImportError:
+except Exception as e:
     diarize_pipeline = None
-    print("⚠️ PyAnnote not installed – speaker diarization disabled")
+    print(f"⚠️ Failed to load diarization: {e}")
 
 def diarize_and_transcribe(audio_path):
     if diarize_pipeline is None:
         return "⚠️ Diarization pipeline not loaded – check HF token or install pyannote.audio"
 
-    # Run diarization
     audio = AudioSegment.from_file(audio_path)
     temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
     audio.export(temp_wav, format="wav")
 
     try:
-        from pyannote.audio import Pipeline as DiarizationPipeline
         diarization = diarize_pipeline(temp_wav)
 
-        # Run transcription
         result = whisper.transcribe(temp_wav)
-
         segments = []
+
         for turn, _, speaker in diarization.itertracks(yield_label=True):
             text = " ".join([seg["text"] for seg in result["segments"] if seg["start"] >= turn.start and seg["end"] <= turn.end])
             segments.append({
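Note: a Whisper segment is attributed to a speaker turn only when it lies entirely inside that turn (seg["start"] >= turn.start and seg["end"] <= turn.end), so segments that straddle a speaker change are dropped from the transcript. Purely as a sketch (not part of this commit), an overlap-based assignment could look like this, reusing the result and diarization objects from the function above:

# Sketch only: assign each transcript segment to the turn it overlaps the most.
def overlap(seg, turn):
    return max(0.0, min(seg["end"], turn.end) - max(seg["start"], turn.start))

def assign_segments(result, diarization):
    turns = list(diarization.itertracks(yield_label=True))  # (turn, track, speaker) tuples
    assigned = []
    for seg in result["segments"]:
        best = max(turns, key=lambda t: overlap(seg, t[0]), default=None)
        if best is not None and overlap(seg, best[0]) > 0:
            assigned.append({"speaker": best[2], "start": seg["start"], "end": seg["end"], "text": seg["text"]})
    return assigned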
@@ -462,6 +492,31 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             description="Convert voice to text and edit it before exporting again."
         )
 
+    # --- Voice Cloning (Dubbing) ===
+    with gr.Tab("🎭 Voice Cloning (Dubbing)"):
+        gr.Interface(
+            fn=clone_voice,
+            inputs=[
+                gr.File(label="Source Voice Clip"),
+                gr.File(label="Target Voice Clip"),
+                gr.Textbox(label="Text to Clone", lines=5)
+            ],
+            outputs=gr.Audio(label="Cloned Output", type="filepath"),
+            title="Replace One Voice With Another",
+            description="Clone voice from source to target speaker using AI"
+        )
+
+    # --- Speaker Diarization (Who Spoke When?) ===
+    if diarize_pipeline:
+        with gr.Tab("🧍‍♂️ Who Spoke When?"):
+            gr.Interface(
+                fn=diarize_and_transcribe,
+                inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
+                outputs=gr.JSON(label="Diarized Transcript"),
+                title="Split By Speaker + Transcribe",
+                description="Detect speakers and transcribe their speech automatically."
+            )
+
     # --- TTS Voice Generator ===
     with gr.Tab("💬 TTS Voice Generator"):
         gr.Interface(
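Note: gating the tab on if diarize_pipeline: means the "Who Spoke When?" tab silently disappears when the pipeline failed to load. A sketch of an alternative (not part of this commit) is to keep the tab and surface the reason instead, inside the same gr.Blocks context:

# Sketch only: keep the tab visible but explain why it is disabled.
with gr.Tab("🧍‍♂️ Who Spoke When?"):
    if diarize_pipeline:
        gr.Interface(
            fn=diarize_and_transcribe,
            inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
            outputs=gr.JSON(label="Diarized Transcript"),
        )
    else:
        gr.Markdown("⚠️ Speaker diarization is unavailable: set HF_TOKEN and install pyannote.audio.")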
@@ -472,16 +527,6 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             description="Type anything and turn it into natural-sounding speech."
         )
 
-    # --- Speaker Diarization (Who Spoke When?) ===
-    with gr.Tab("🧍‍♂️ Who Spoke When?"):
-        gr.Interface(
-            fn=diarize_and_transcribe,
-            inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
-            outputs=gr.JSON(label="Diarized Transcript"),
-            title="Split By Speaker + Transcribe",
-            description="Detect speakers and transcribe their speech automatically."
-        )
-
     # --- Auto-Save / Resume Sessions ===
     session_state = gr.State()
 
@@ -517,7 +562,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
         outputs=[session_data, loaded_audio, loaded_preset, loaded_effects]
     )
 
-    # --- Trim Silence Automatically (VAD) ===
+    # --- VAD – Detect & Remove Silence ===
     with gr.Tab("✂️ Trim Silence Automatically"):
        gr.Interface(
            fn=detect_silence,
@@ -567,7 +612,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             ],
             outputs=gr.File(label="Mixed Output"),
             title="Overlay Two Tracks",
-            description="Mix or subtract two audio files."
+            description="Mix, blend, or subtract two audio files."
         )
 
 demo.launch()
 