clone

Build error

File size: 3,248 Bytes

import gradio as gr
import torch
from TTS.api import TTS
import os
import spaces
import tempfile

# Set the Coqui terms-of-service flag before any model is loaded
# (presumably this suppresses the interactive licence prompt — confirm
# against the TTS library).
os.environ.update(COQUI_TOS_AGREED="1")

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Initialize TTS model
def load_tts_model():
    """Create the XTTS v2 model and move it to the selected ``device``.

    Returns:
        Whatever ``TTS(...).to(device)`` returns; the rest of the file uses it
        as the synthesiser object (``tts.tts_to_file``).
    """
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    model = TTS(model_name)
    return model.to(device)

# Single shared model instance, created once at import time.
tts = load_tts_model()

# Celebrity voices (example list, you may want to expand or modify this).
# Maps the display name shown in the TTS tab's voice dropdown to the path of
# the reference recording that is passed to the model as `speaker_wav`.
# NOTE(review): the paths below look like placeholders ("path/to/...") —
# confirm they are replaced with real sample files before deploying.
celebrity_voices = {
    "Morgan Freeman": "path/to/morgan_freeman_sample.wav",
    "Scarlett Johansson": "path/to/scarlett_johansson_sample.wav",
    "David Attenborough": "path/to/david_attenborough_sample.wav",
}

@spaces.GPU
def tts_generate(text, voice, language):
    """Synthesise ``text`` with one of the preset voices in ``celebrity_voices``.

    Decorated with ``spaces.GPU`` like ``clone_voice``, because it runs the
    same model inference and therefore needs the same GPU allocation.

    Args:
        text: The text to speak.
        voice: A key of ``celebrity_voices`` (the dropdown selection). May be
            ``None`` when the user has not picked a voice.
        language: Language code forwarded to ``tts.tts_to_file``.

    Returns:
        Path of a newly created temporary ``.wav`` file with the audio.

    Raises:
        gr.Error: If the text is empty, no known voice is selected, or the
            reference sample for the voice does not exist on disk.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to speak.")

    # The dropdown has no default value, so `voice` can be None; report that
    # to the user instead of failing with a bare KeyError.
    speaker_wav = celebrity_voices.get(voice)
    if speaker_wav is None:
        raise gr.Error("Please select a celebrity voice.")
    if not os.path.isfile(speaker_wav):
        raise gr.Error(f"The reference sample for {voice} is not available.")

    # Only the unique name is needed; the handle is closed right away and the
    # file is kept (delete=False) so the model can write to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio_path = temp_audio.name

    try:
        tts.tts_to_file(
            text=text,
            speaker_wav=speaker_wav,
            language=language,
            file_path=temp_audio_path,
        )
    except Exception:
        # Do not leave an orphaned temp file behind when synthesis fails.
        if os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)
        raise

    return temp_audio_path
    
@spaces.GPU(enable_queue=True)
def clone_voice(text, audio_file, language):
    """Synthesise ``text`` in the voice of a user-supplied reference recording.

    Args:
        text: The text to speak.
        audio_file: Path of the reference audio (the Audio component is
            configured with ``type="filepath"``). ``None`` when the user has
            not provided a recording.
        language: Language code forwarded to ``tts.tts_to_file``.

    Returns:
        Path of a newly created temporary ``.wav`` file with the audio.

    Raises:
        gr.Error: If the text is empty or no reference audio was provided.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to speak.")
    # Without an upload the component yields None; fail with a clear message
    # rather than passing None on to the model.
    if not audio_file:
        raise gr.Error("Please upload or record a voice reference audio file.")

    # Only the unique name is needed; the handle is closed right away and the
    # file is kept (delete=False) so the model can write to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio_path = temp_audio.name

    try:
        tts.tts_to_file(
            text=text,
            speaker_wav=audio_file,
            language=language,
            file_path=temp_audio_path,
        )
    except Exception:
        # Do not leave an orphaned temp file behind when synthesis fails.
        if os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)
        raise

    return temp_audio_path

# Placeholder function for Talking Image tab
def talking_image_placeholder():
    """Return the notice shown while the Talking Image feature is missing."""
    notice = "Talking Image functionality not implemented yet."
    return notice

# Define Gradio interface: three tabs (preset-voice TTS, a "coming soon"
# notice, and voice cloning). Components are created in display order.
with gr.Blocks() as demo:
    gr.Markdown("# Advanced Voice Synthesis")

    # Language codes offered by both language dropdowns.
    _language_codes = ["en", "es", "fr", "de", "it"]

    with gr.Tabs():
        # --- Tab: speak text in one of the preset voices ---
        with gr.TabItem("TTS"):
            with gr.Row():
                tts_text = gr.Textbox(label="Text to speak")
                tts_voice = gr.Dropdown(
                    choices=list(celebrity_voices),
                    label="Celebrity Voice",
                )
                tts_language = gr.Dropdown(
                    choices=list(_language_codes),
                    label="Language",
                    value="en",
                )
            tts_generate_btn = gr.Button(value="Generate")
            tts_output = gr.Audio(label="Generated Audio")

            # Button press -> tts_generate(text, voice, language) -> audio
            tts_generate_btn.click(
                fn=tts_generate,
                inputs=[tts_text, tts_voice, tts_language],
                outputs=tts_output,
            )

        # --- Tab: static notice only, no behaviour wired up ---
        with gr.TabItem("Talking Image"):
            gr.Markdown("Talking Image functionality coming soon!")

        # --- Tab: speak text in the voice of an uploaded recording ---
        with gr.TabItem("Clone Voice"):
            with gr.Row():
                clone_text = gr.Textbox(label="Text to speak")
                # type="filepath" hands the handler a path rather than raw
                # audio data.
                clone_audio = gr.Audio(
                    label="Voice reference audio file",
                    type="filepath",
                )
                clone_language = gr.Dropdown(
                    choices=list(_language_codes),
                    label="Language",
                    value="en",
                )
            clone_generate_btn = gr.Button(value="Generate")
            clone_output = gr.Audio(label="Generated Audio")

            # Button press -> clone_voice(text, audio_file, language) -> audio
            clone_generate_btn.click(
                fn=clone_voice,
                inputs=[clone_text, clone_audio, clone_language],
                outputs=clone_output,
            )

# Launch the interface; the cleanup below runs once the server call returns
# or raises.
try:
    demo.launch()
finally:
    # Clean up temporary audio files. The handlers create them with
    # tempfile.NamedTemporaryFile(suffix=".wav"), which places them in
    # tempfile.gettempdir() with the default "tmp" prefix — not in the
    # current working directory — so that is the directory to scan.
    # NOTE(review): this removes every tmp*.wav in the shared temp directory,
    # including files created by other processes — confirm that is acceptable.
    temp_dir = tempfile.gettempdir()
    for file in os.listdir(temp_dir):
        if file.startswith('tmp') and file.endswith('.wav'):
            try:
                os.remove(os.path.join(temp_dir, file))
            except OSError:
                # Best-effort cleanup: the file may already be gone or may
                # still be in use.
                pass