|
import gradio as gr |
|
import torch |
|
from TTS.api import TTS |
|
import os |
|
import tempfile |
|
|
|
# Set the COQUI_TOS_AGREED flag in the process environment before any model is
# loaded. NOTE(review): presumably the TTS library reads this to skip its
# interactive licence prompt -- confirm against the installed TTS version.
os.environ["COQUI_TOS_AGREED"] = "1"

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
|
|
|
|
|
def load_tts_model():
    """Instantiate the ``xtts_v2`` multilingual model and move it to ``device``.

    Returns:
        The object returned by calling ``.to(device)`` on the freshly
        constructed ``TTS`` instance.
    """
    model = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
    return model.to(device)


# Build the model once at import time; ``clone`` reuses this module-level
# instance for every request.
tts = load_tts_model()
|
|
|
def clone(text, audio_file, language, speaking_rate, pitch, volume,
          emotion, sample_rate, temperature, seed):
    """Synthesize ``text`` in the voice of ``audio_file`` and return a WAV path.

    Args:
        text: The text to speak. Must contain non-whitespace characters.
        audio_file: Path to the reference recording of the voice to clone.
        language: Language code forwarded to the model.
        speaking_rate: Accepted for signature compatibility with the UI;
            currently NOT forwarded to the model.
        pitch: Accepted but currently NOT forwarded to the model.
        volume: Accepted but currently NOT forwarded to the model.
        emotion: Accepted but currently NOT forwarded to the model.
        sample_rate: Accepted but currently NOT forwarded to the model.
        temperature: Accepted but currently NOT forwarded to the model.
        seed: Optional number. When not ``None`` it is converted to ``int``
            and used to seed torch's random number generator.

    Returns:
        Path of a newly created ``.wav`` temporary file containing the
        generated audio. The file is not deleted by this function.

    Raises:
        gr.Error: If ``text`` is empty/blank or no reference audio was given.
        Exception: Any error raised by the model is re-raised after the
            partially created temporary file has been removed.
    """
    # Fail early with a readable message instead of an opaque model error.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not audio_file:
        raise gr.Error("Please provide a voice reference audio file.")

    if seed is not None:
        # The seed may arrive as a float (e.g. from a numeric UI field);
        # make the integer conversion explicit.
        torch.manual_seed(int(seed))

    # Only the unique name is needed here; the handle is closed right away so
    # the model can write to the path itself. delete=False keeps the file
    # around for the caller after this function returns.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio_path = temp_audio.name

    try:
        tts.tts_to_file(
            text=text,
            speaker_wav=audio_file,
            language=language,
            file_path=temp_audio_path,
        )
    except Exception:
        # Synthesis failed: do not leave an orphaned temp file behind.
        try:
            os.remove(temp_audio_path)
        except OSError:
            pass  # best effort; the original error is the one that matters
        raise

    return temp_audio_path
|
|
|
|
|
# Web UI definition. The input components are listed in the same order as the
# parameters of ``clone`` (named in the trailing comments).
iface = gr.Interface(
    title="Advanced Voice Clone",
    description="Customize your voice cloning experience with various parameters.",
    fn=clone,
    inputs=[
        gr.Textbox(label="Text"),  # text
        gr.Audio(type="filepath", label="Voice reference audio file"),  # audio_file
        gr.Dropdown(  # language
            choices=["en", "es", "fr", "de", "it"],
            value="en",
            label="Language",
        ),
        gr.Slider(minimum=0.5, maximum=2.0, value=1.0, label="Speaking Rate"),  # speaking_rate
        gr.Slider(minimum=-10, maximum=10, value=0, label="Pitch Adjustment"),  # pitch
        gr.Slider(minimum=0.1, maximum=2.0, value=1.0, label="Volume"),  # volume
        gr.Dropdown(  # emotion
            choices=["neutral", "happy", "sad", "angry"],
            value="neutral",
            label="Emotion",
        ),
        gr.Dropdown(  # sample_rate
            choices=[22050, 24000, 44100, 48000],
            value=24000,
            label="Sample Rate",
        ),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.8, label="Temperature"),  # temperature
        gr.Number(label="Seed (optional)"),  # seed
    ],
    outputs=gr.Audio(label="Generated Audio"),
)
|
|
|
|
|
try:
    # Start the web UI. NOTE(review): when run as a script this call is
    # expected to block until the server stops -- confirm for the installed
    # Gradio version.
    iface.launch()
finally:
    # Remove generated audio once the UI is done, even if launch() raised.
    # NamedTemporaryFile puts its files in the system temp directory with a
    # "tmp" prefix, so that directory is swept, not the current working
    # directory.
    # NOTE(review): this also matches "tmp*.wav" files created there by other
    # programs; track the generated paths explicitly if that is a concern.
    temp_dir = tempfile.gettempdir()
    for name in os.listdir(temp_dir):
        if name.startswith('tmp') and name.endswith('.wav'):
            try:
                os.remove(os.path.join(temp_dir, name))
            except OSError:
                # Best-effort cleanup: a file that is locked or already gone
                # must not abort removal of the remaining files.
                pass