Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,030 Bytes
aad075c 69b575f b858a70 8a4576f 9217229 69b575f eb30231 69b575f eb30231 0053e1b eb30231 08942fb eb30231 69b575f a26f4fc 69b575f b858a70 a26f4fc 69b575f fff9e6e 69b575f a2590ba 69b575f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import spaces
import torch
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from pathlib import Path
import gradio as gr
# Remote assets: XTTS config/vocab/checkpoint plus a default speaker clip.
CONFIG_URL = 'https://huggingface.co/medmac01/darija_xtt_2.0/resolve/main/config.json'
VOCAB_URL = 'https://huggingface.co/medmac01/darija_xtt_2.0/resolve/main/vocab.json'
MODEL_URL = 'https://huggingface.co/medmac01/darija_xtt_2.0/resolve/main/model_2.1.pth'
SPEAKER_AUDIO_URL = 'https://huggingface.co/medmac01/xtt2_darija_v0.1/resolve/main/speaker_reference.wav'

base_path = Path(__file__).parent


def _ensure_downloaded(url, destination):
    # Download the asset once; later runs reuse the cached local copy.
    if not destination.exists():
        torch.hub.download_url_to_file(url, destination)
    return str(destination)


# Fetch each asset into the app directory and keep string paths for the loaders.
config_path = _ensure_downloaded(CONFIG_URL, base_path / 'config.json')
vocab_path = _ensure_downloaded(VOCAB_URL, base_path / 'vocab.json')
model_path = _ensure_downloaded(MODEL_URL, base_path / 'model.pth')
speaker_audio_path = _ensure_downloaded(SPEAKER_AUDIO_URL, base_path / 'speaker_reference.wav')

config = XttsConfig()
config.load_json(config_path)

print("Loading model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Build the XTTS model from its config, load the checkpoint, move to GPU if available.
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_path=model_path, use_deepspeed=False, vocab_path=vocab_path, eval=True)
model.to(device)
@spaces.GPU
def infer_EGTTS(text: str, speaker_audio_path: str, temperature: float = 0.75, language: str = "ar"):
    """Synthesize *text* in the voice of the given reference speaker.

    Args:
        text: Text to synthesize.
        speaker_audio_path: Path to a short (~4-5 s) audio clip of the
            speaker whose voice should be cloned.
        temperature: Sampling temperature for generation; higher values
            produce more varied but less stable speech.
        language: XTTS language code (defaults to "ar", the code this app
            uses for Darija).

    Returns:
        A ``(sample_rate, waveform)`` tuple suitable for ``gr.Audio``;
        the sample rate is 24000 Hz.
    """
    print("Computing speaker latents...")
    # Condition the model on the reference clip (GPT latents + speaker embedding).
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[speaker_audio_path])
    print("Inference...")
    out = model.inference(
        text,
        language,
        gpt_cond_latent,
        speaker_embedding,
        temperature=temperature,
    )
    return 24000, out["wav"]
markdown_description = """## Instructions:
1. Enter the text you want to synthesize.
2. Upload a 4-5 seconds audio file of the speaker you want to clone.
3. Click on the "Generate" button.
"""

# Gradio UI: text + speaker reference clip in, synthesized audio out.
with gr.Blocks(title="EGTTS") as app:
    gr.HTML("<center><h1>Moroccan-Darija-TTS </h1></center>")
    gr.Markdown(markdown_description)
    with gr.Row():
        with gr.Column():
            # Default text is Arabic-script Darija, so render right-to-left.
            text = gr.Textbox(label="Text to synthesize", value="السلام عليكم ورحمة الله", rtl=True, text_align="right", lines=3)
            speaker_reference = gr.Audio(label="Speaker reference", value=speaker_audio_path, type="filepath")
            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.75, step=0.05)
            generate_btn = gr.Button(value="Generate", variant="primary")
        output = gr.Audio(label="Synthesized audio")
    # infer_EGTTS returns (sample_rate, waveform), which gr.Audio accepts directly.
    generate_btn.click(infer_EGTTS, inputs=[text, speaker_reference, temperature], outputs=output)

app.launch()