import gradio as gr
from TTS.api import TTS
import os
import tempfile
import sounddevice as sd
from scipy.io.wavfile import write

# Agree to Coqui's terms
os.environ["COQUI_TOS_AGREED"] = "1"
os.environ["OMP_NUM_THREADS"] = "2"  # Set CPU threads to 8 (adjust based on your CPU cores)

# Load the XTTS-v2 model and run inference on the CPU
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
tts.to("cpu")
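# Note: on the first run, Coqui TTS downloads the XTTS-v2 weights to a local
# cache, so initial startup can take a while.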

# Supported languages by the model
LANGUAGES = {
    "English": "en",
    "Spanish": "es",
    "German": "de",
    "French": "fr",
    "Italian": "it",
    "Hindi" : "hi",
    "Russian": "ru", 
    "Turkish": "tr",
    "Japanese": "ja", 
    "Korean": "ko",
    "Hungarian": "hu"
}

# Function to generate voice
def generate_voice(text, speaker_audio, language):
    # Create a unique output file; NamedTemporaryFile replaces the deprecated
    # tempfile.mktemp
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    # XTTS-v2 renders audio at its native sample rate, so no sample_rate
    # argument is passed to tts_to_file
    tts.tts_to_file(
        text=text,
        speaker_wav=speaker_audio,
        file_path=output_path,
        language=LANGUAGES.get(language, "en"),
    )
    return output_path
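
# A quick sanity check without launching the UI (the speaker file path below is
# only a placeholder for illustration):
#   generate_voice("Hello from XTTS.", "path/to/speaker_sample.wav", "English")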

# Function to record audio from the mic
def record_audio(duration=10, filename="mic_input.wav"):
    fs = 44100  # Sample rate in Hz
    print("Recording...")
    # Record as 16-bit PCM so the saved WAV is widely readable by audio loaders
    audio_data = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype="int16")
    sd.wait()  # Wait until recording is finished
    write(filename, fs, audio_data)
    print(f"Recording saved as {filename}")
    return filename

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 🗣️ Voice Cloning with Coqui XTTS-v2")

    with gr.Row():
        text_input = gr.Textbox(label="Enter Text", placeholder="Type the text you want to synthesize...")
        speaker_audio_input = gr.Audio(label="Upload Speaker Audio (WAV)", type="filepath")
        language_dropdown = gr.Dropdown(
            label="Select Output Language",
            choices=list(LANGUAGES.keys()),
            value="English"
        )
        mic_button = gr.Button("Record from Mic")

    output_audio = gr.Audio(label="Generated Voice", type="filepath")

    generate_button = gr.Button("Generate Voice")

    mic_button.click(
        fn=lambda: record_audio(duration=10),
        inputs=[],
        outputs=speaker_audio_input,
    )

    generate_button.click(
        fn=generate_voice,
        inputs=[text_input, speaker_audio_input, language_dropdown],
        outputs=output_audio
    )

# Launch the app
demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
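
# A minimal way to run this script locally. The package names below are my best
# guess for this setup and "app.py" is a placeholder filename; adjust both to
# match your environment:
#   pip install TTS gradio sounddevice scipy
#   python app.py
# The UI is then reachable at http://localhost:7860, plus a temporary public
# link because share=True.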