File size: 3,248 Bytes
88f7073
3cdb410
 
 
11023cf
e37c63c
 
3cdb410
 
e37c63c
 
 
 
 
 
 
 
471fe68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e37c63c
471fe68
1a0b3dd
 
471fe68
e37c63c
 
 
 
 
 
 
 
 
 
 
 
471fe68
 
 
 
88f7073
471fe68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88f7073
 
471fe68
88f7073
 
e37c63c
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import gradio as gr
import torch
from TTS.api import TTS
import os
import spaces
import tempfile

# Must be set before the model is loaded below.
# NOTE(review): presumably this pre-accepts the Coqui model terms so loading
# does not stop at an interactive prompt — confirm against the Coqui TTS docs.
os.environ["COQUI_TOS_AGREED"] = "1"

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Initialize TTS model
def load_tts_model():
    """Create the XTTS v2 model and move it to the module-level ``device``.

    Returns:
        The object returned by ``TTS(...).to(device)``.
    """
    model = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
    return model.to(device)


# Built once at import time; both request handlers share this instance.
tts = load_tts_model()

# Celebrity voices (example list, you may want to expand or modify this).
# Maps the display name offered in the "Celebrity Voice" dropdown to the
# reference audio file that is passed to the TTS model as `speaker_wav`.
# NOTE(review): the "path/to/..." values look like placeholders — point them
# at real sample files before relying on the TTS tab.
celebrity_voices = {
    "Morgan Freeman": "path/to/morgan_freeman_sample.wav",
    "Scarlett Johansson": "path/to/scarlett_johansson_sample.wav",
    "David Attenborough": "path/to/david_attenborough_sample.wav",
}

@spaces.GPU(enable_queue=True)
def tts_generate(text, voice, language):
    """Speak ``text`` in one of the preset voices and return the WAV path.

    Decorated with ``spaces.GPU`` like ``clone_voice`` so both handlers that
    run the model request a GPU the same way.

    Args:
        text: Text to synthesize.
        voice: Key of ``celebrity_voices`` selecting the reference sample.
        language: Language code forwarded to the TTS model (e.g. ``"en"``).

    Returns:
        Path of a newly created temporary ``.wav`` file with the audio.

    Raises:
        gr.Error: If ``text`` is blank or ``voice`` is not a known preset, so
            the problem is reported in the UI instead of a raw ``KeyError``.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to speak.")

    # An unselected dropdown arrives as None; .get() covers that case too.
    speaker_wav = celebrity_voices.get(voice)
    if speaker_wav is None:
        raise gr.Error("Please select a voice.")

    # Only reserve a unique file name here; the handle is closed right away
    # and the TTS call below writes to the path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio_path = temp_audio.name

    try:
        tts.tts_to_file(
            text=text,
            speaker_wav=speaker_wav,
            language=language,
            file_path=temp_audio_path,
        )
    except Exception:
        # delete=False means nothing else removes the file when synthesis
        # fails, so drop it here before propagating the original error.
        try:
            os.remove(temp_audio_path)
        except OSError:
            pass
        raise

    return temp_audio_path
    
@spaces.GPU(enable_queue=True)
def clone_voice(text, audio_file, language):
    """Speak ``text`` in the voice of an uploaded sample and return the WAV path.

    Args:
        text: Text to synthesize.
        audio_file: Path of the reference recording (the UI component is
            configured with ``type="filepath"``); ``None`` when nothing was
            uploaded.
        language: Language code forwarded to the TTS model (e.g. ``"en"``).

    Returns:
        Path of a newly created temporary ``.wav`` file with the audio.

    Raises:
        gr.Error: If ``text`` is blank or no reference audio was provided.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to speak.")
    if not audio_file:
        raise gr.Error("Please provide a voice reference audio file.")

    # Only reserve a unique file name here; the handle is closed right away
    # and the TTS call below writes to the path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio_path = temp_audio.name

    try:
        tts.tts_to_file(
            text=text,
            speaker_wav=audio_file,
            language=language,
            file_path=temp_audio_path,
        )
    except Exception:
        # delete=False means nothing else removes the file when synthesis
        # fails, so drop it here before propagating the original error.
        try:
            os.remove(temp_audio_path)
        except OSError:
            pass
        raise

    return temp_audio_path

# Placeholder function for Talking Image tab
def talking_image_placeholder():
    """Return the fixed notice that the Talking Image feature is not built yet."""
    notice = "Talking Image functionality not implemented yet."
    return notice

# Define Gradio interface

# Language codes offered in both synthesis tabs; defined once so the two
# dropdowns cannot drift apart.
_LANGUAGE_CHOICES = ["en", "es", "fr", "de", "it"]

with gr.Blocks() as demo:
    gr.Markdown("# Advanced Voice Synthesis")

    with gr.Tabs():
        # Tab 1: speak text in one of the preset voices.
        with gr.TabItem("TTS"):
            with gr.Row():
                tts_text = gr.Textbox(label="Text to speak")
                tts_voice = gr.Dropdown(
                    choices=list(celebrity_voices),
                    # Pre-select the first voice so "Generate" never sends
                    # None to tts_generate when the user skips the dropdown.
                    value=next(iter(celebrity_voices), None),
                    label="Celebrity Voice",
                )
                tts_language = gr.Dropdown(
                    _LANGUAGE_CHOICES, label="Language", value="en"
                )
            tts_generate_btn = gr.Button("Generate")
            tts_output = gr.Audio(label="Generated Audio")

            tts_generate_btn.click(
                tts_generate,
                inputs=[tts_text, tts_voice, tts_language],
                outputs=tts_output,
            )

        # Tab 2: static notice only; no handler is wired up.
        with gr.TabItem("Talking Image"):
            gr.Markdown("Talking Image functionality coming soon!")

        # Tab 3: speak text in the voice of an uploaded reference recording.
        with gr.TabItem("Clone Voice"):
            with gr.Row():
                clone_text = gr.Textbox(label="Text to speak")
                # type="filepath" hands the handler a path string, which it
                # forwards to the model as the speaker reference.
                clone_audio = gr.Audio(
                    label="Voice reference audio file", type="filepath"
                )
                clone_language = gr.Dropdown(
                    _LANGUAGE_CHOICES, label="Language", value="en"
                )
            clone_generate_btn = gr.Button("Generate")
            clone_output = gr.Audio(label="Generated Audio")

            clone_generate_btn.click(
                clone_voice,
                inputs=[clone_text, clone_audio, clone_language],
                outputs=clone_output,
            )

# Launch the interface
demo.launch()

# Clean up temporary files (this will run after the Gradio server is closed).
# The handlers create their output with tempfile.NamedTemporaryFile, which
# writes to tempfile.gettempdir() using the default "tmp" prefix, so that
# directory is scanned rather than the current working directory.
# NOTE(review): this removes every top-level "tmp*.wav" in the temp directory,
# including any left by other programs — confirm that is acceptable.
temp_dir = tempfile.gettempdir()
for file_name in os.listdir(temp_dir):
    if file_name.startswith("tmp") and file_name.endswith(".wav"):
        try:
            os.remove(os.path.join(temp_dir, file_name))
        except OSError:
            # Best-effort cleanup: skip entries that are already gone, are
            # directories, or are not ours to delete.
            pass