Update app.py
app.py
CHANGED
@@ -24,6 +24,17 @@ from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
 from TTS.api import TTS
 import pickle
 
+# Try to install OpenVoice from GitHub if not found
+try:
+    from openvoice.api import TTS as OpenVoiceTTS, ToneColorConverter
+    from openvoice.se_extractor import get_se
+except ImportError:
+    print("Installing OpenVoice from GitHub...")
+    import subprocess
+    subprocess.run(["pip", "install", "git+https://github.com/myshell-ai/OpenVoice.git"])
+    from openvoice.api import TTS as OpenVoiceTTS, ToneColorConverter
+    from openvoice.se_extractor import get_se
+
 # Suppress warnings
 warnings.filterwarnings("ignore")
 
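Note: installing at import time with bare `pip` can target a different interpreter than the one running the Space, and `subprocess.run` without `check=True` ignores a failed install until the retried import crashes. A minimal sketch of the same fallback pinned to the current interpreter (the helper name `ensure_openvoice` is illustrative, not part of the commit):

    import importlib
    import subprocess
    import sys

    def ensure_openvoice():
        # Import if present; otherwise install into the interpreter that is
        # actually running this app, then retry. check=True raises on failure.
        try:
            return importlib.import_module("openvoice.api")
        except ImportError:
            subprocess.run(
                [sys.executable, "-m", "pip", "install",
                 "git+https://github.com/myshell-ai/OpenVoice.git"],
                check=True,
            )
            return importlib.import_module("openvoice.api")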
@@ -266,7 +277,7 @@ def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, ex
     except Exception as e:
         return None, f"❌ Batch processing failed: {str(e)}"
 
-# ===
+# === Transcribe & Edit Tab ===
 whisper_model = WhisperModel("base")
 
 def transcribe_audio(audio_path):
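Note: `transcribe_audio`'s body continues outside this hunk (the `text = ...` line appears in the next one). For reference, faster-whisper's `WhisperModel.transcribe` returns a lazy generator of segments plus an info object, so a self-contained version of the same pattern looks roughly like this:

    from faster_whisper import WhisperModel

    model = WhisperModel("base")

    def transcribe(audio_path):
        # The generator is consumed here; decoding happens lazily.
        segments, info = model.transcribe(audio_path)
        return " ".join(seg.text.strip() for seg in segments)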
@@ -274,7 +285,7 @@ def transcribe_audio(audio_path):
     text = " ".join([seg.text for seg in segments])
     return text
 
-# === TTS
+# === TTS Tab ===
 tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
 
 def generate_tts(text):
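Note: `generate_tts`'s body also falls outside the diff. A plausible sketch consistent with the Coqui TTS call used later in `clone_voice` (the output filename is hypothetical):

    def generate_tts(text):
        # Synthesize to a temp file and return the path for Gradio to play.
        out_path = os.path.join(tempfile.gettempdir(), "tts_output.wav")
        tts.tts_to_file(text=text, file_path=out_path)
        return out_path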
@@ -326,6 +337,31 @@ def mix_tracks(track1, track2, volume_offset=0):
     mixed.export(out_path, format="wav")
     return out_path
 
+# === Voice Cloning / Dubbing Tab ===
+def clone_voice(source_audio, target_audio, text):
+    try:
+        source_se, _ = get_se(source_audio)
+        target_se, _ = get_se(target_audio)
+
+        # Generate base TTS
+        out_path = os.path.join(tempfile.gettempdir(), "cloned_output.wav")
+        tts.tts_to_file(text=text, file_path=out_path)
+
+        # Apply voice conversion
+        tone_converter.convert(
+            audio_src_path=out_path,
+            src_se=source_se,
+            tgt_se=target_se,
+            output_path=out_path
+        )
+
+        return out_path
+    except Exception as e:
+        return f"⚠️ Cloning failed: {str(e)}"
+
+tone_converter = ToneColorConverter().to("cuda" if torch.cuda.is_available() else "cpu")
+openvoice_tts = OpenVoiceTTS(lang='en')
+
 # === Speaker Diarization ("Who Spoke When?") ===
 try:
     from pyannote.audio import Pipeline as DiarizationPipeline
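Note: on failure `clone_voice` returns an error string, but the UI below wires its output to `gr.Audio(type="filepath")`, which will try to open that string as a file. One alternative is a wrapper that surfaces the message as a Gradio error toast instead (`clone_voice_strict` is a hypothetical name):

    import gradio as gr

    def clone_voice_strict(source_audio, target_audio, text):
        result = clone_voice(source_audio, target_audio, text)
        if isinstance(result, str) and result.startswith("⚠️"):
            # Abort the event and show the message in the UI rather than
            # handing gr.Audio a non-path string.
            raise gr.Error(result)
        return result

Defining `tone_converter` after `clone_voice` is fine at runtime, since the function only runs once the UI calls it, but `ToneColorConverter()` with no arguments assumes the library supplies default config and checkpoint paths.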
@@ -334,31 +370,25 @@ try:
     hf_token = os.getenv("HF_TOKEN")
     if hf_token:
         login(token=hf_token)
-    else:
-        print("⚠️ HF_TOKEN not set — speaker diarization disabled")
-
     diarize_pipeline = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token or True)
-except
+except Exception as e:
     diarize_pipeline = None
-    print("⚠️
+    print(f"⚠️ Failed to load diarization: {e}")
 
 def diarize_and_transcribe(audio_path):
     if diarize_pipeline is None:
         return "⚠️ Diarization pipeline not loaded — check HF token or install pyannote.audio"
 
-    # Run diarization
     audio = AudioSegment.from_file(audio_path)
     temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
     audio.export(temp_wav, format="wav")
 
     try:
-        from pyannote.audio import Pipeline as DiarizationPipeline
         diarization = diarize_pipeline(temp_wav)
 
-        # Run transcription
         result = whisper.transcribe(temp_wav)
-
         segments = []
+
         for turn, _, speaker in diarization.itertracks(yield_label=True):
             text = " ".join([seg["text"] for seg in result["segments"] if seg["start"] >= turn.start and seg["end"] <= turn.end])
             segments.append({
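Note: the strict containment test (`seg["start"] >= turn.start and seg["end"] <= turn.end`) silently drops any Whisper segment that straddles a speaker turn, and this function calls `whisper.transcribe` (the openai-whisper dict API) while the Transcribe tab uses faster-whisper's `WhisperModel`. A sketch of a more forgiving assignment, giving each segment to the speaker with the largest time overlap (the helper name is illustrative):

    def assign_segments_to_speakers(diarization, whisper_segments):
        # Attach each transcript segment to the speaker whose turn overlaps
        # it the most, so text crossing a turn boundary is not lost.
        labeled = []
        for seg in whisper_segments:
            best_speaker, best_overlap = None, 0.0
            for turn, _, speaker in diarization.itertracks(yield_label=True):
                overlap = min(seg["end"], turn.end) - max(seg["start"], turn.start)
                if overlap > best_overlap:
                    best_speaker, best_overlap = speaker, overlap
            labeled.append({"speaker": best_speaker, "text": seg["text"]})
        return labeled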
@@ -462,6 +492,31 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             description="Convert voice to text and edit it before exporting again."
         )
 
+    # --- Voice Cloning (Dubbing) ===
+    with gr.Tab("🎭 Voice Cloning (Dubbing)"):
+        gr.Interface(
+            fn=clone_voice,
+            inputs=[
+                gr.File(label="Source Voice Clip"),
+                gr.File(label="Target Voice Clip"),
+                gr.Textbox(label="Text to Clone", lines=5)
+            ],
+            outputs=gr.Audio(label="Cloned Output", type="filepath"),
+            title="Replace One Voice With Another",
+            description="Clone voice from source to target speaker using AI"
+        )
+
+    # --- Speaker Diarization (Who Spoke When?) ===
+    if diarize_pipeline:
+        with gr.Tab("🧏‍♂️ Who Spoke When?"):
+            gr.Interface(
+                fn=diarize_and_transcribe,
+                inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
+                outputs=gr.JSON(label="Diarized Transcript"),
+                title="Split By Speaker + Transcribe",
+                description="Detect speakers and transcribe their speech automatically."
+            )
+
     # --- TTS Voice Generator ===
     with gr.Tab("💬 TTS Voice Generator"):
         gr.Interface(
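Note: gating the tab on `if diarize_pipeline:` happens once at build time, so if the pipeline failed to load the feature simply disappears. A sketch of an alternative that keeps the tab and explains why it is disabled:

    with gr.Tab("🧏‍♂️ Who Spoke When?"):
        if diarize_pipeline is None:
            # Tell the user what is missing instead of hiding the tab.
            gr.Markdown("⚠️ Diarization unavailable: set HF_TOKEN and install pyannote.audio.")
        else:
            gr.Interface(
                fn=diarize_and_transcribe,
                inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
                outputs=gr.JSON(label="Diarized Transcript"),
            )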
@@ -472,16 +527,6 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             description="Type anything and turn it into natural-sounding speech."
         )
 
-    # --- Speaker Diarization (Who Spoke When?) ===
-    with gr.Tab("🧏‍♂️ Who Spoke When?"):
-        gr.Interface(
-            fn=diarize_and_transcribe,
-            inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
-            outputs=gr.JSON(label="Diarized Transcript"),
-            title="Split By Speaker + Transcribe",
-            description="Detect speakers and transcribe their speech automatically."
-        )
-
     # --- Auto-Save / Resume Sessions ===
     session_state = gr.State()
 
@@ -517,7 +562,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             outputs=[session_data, loaded_audio, loaded_preset, loaded_effects]
         )
 
-    # ---
+    # --- VAD — Detect & Remove Silence ===
     with gr.Tab("✂️ Trim Silence Automatically"):
        gr.Interface(
             fn=detect_silence,
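Note: `detect_silence`'s implementation is outside this diff. For reference, one common pydub approach to the same "Trim Silence Automatically" behavior (names and thresholds are illustrative):

    from pydub import AudioSegment
    from pydub.silence import detect_nonsilent

    def trim_silence(path, min_silence_len=500, silence_thresh=-40):
        audio = AudioSegment.from_file(path)
        # Keep only spans louder than silence_thresh dBFS that last at
        # least min_silence_len milliseconds.
        spans = detect_nonsilent(audio, min_silence_len=min_silence_len,
                                 silence_thresh=silence_thresh)
        return sum((audio[start:end] for start, end in spans),
                   AudioSegment.empty())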
@@ -567,7 +612,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             ],
             outputs=gr.File(label="Mixed Output"),
             title="Overlay Two Tracks",
-            description="Mix or subtract two audio files."
+            description="Mix, blend, or subtract two audio files."
         )
 
 demo.launch()
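Note: the updated description promises mixing, blending, and subtracting; with pydub, "subtract" is usually phase inversion plus overlay, which only cancels material when the two files are sample-aligned. A sketch (`mix_or_subtract` is a hypothetical name):

    from pydub import AudioSegment

    def mix_or_subtract(path_a, path_b, subtract=False):
        a = AudioSegment.from_file(path_a)
        b = AudioSegment.from_file(path_b)
        if subtract:
            # Inverting b's phase makes overlay cancel whatever the two
            # tracks share, sample for sample.
            b = b.invert_phase()
        return a.overlay(b)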