File size: 3,030 Bytes
aad075c
69b575f
 
 
 
 
 
b858a70
 
8a4576f
9217229
69b575f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb30231
69b575f
 
eb30231
 
 
 
 
0053e1b
eb30231
08942fb
eb30231
69b575f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a26f4fc
69b575f
 
 
 
 
 
 
b858a70
a26f4fc
69b575f
 
fff9e6e
69b575f
a2590ba
69b575f
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import spaces
import torch
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from pathlib import Path
import gradio as gr

CONFIG_URL = 'https://huggingface.co/medmac01/darija_xtt_2.0/resolve/main/config.json'
VOCAB_URL = 'https://huggingface.co/medmac01/darija_xtt_2.0/resolve/main/vocab.json'
MODEL_URL = 'https://huggingface.co/medmac01/darija_xtt_2.0/resolve/main/model_2.1.pth'
SPEAKER_AUDIO_URL = 'https://huggingface.co/medmac01/xtt2_darija_v0.1/resolve/main/speaker_reference.wav'

base_path = Path(__file__).parent


def _ensure_downloaded(url: str, destination: Path) -> str:
    """Download *url* to *destination* unless it is already cached.

    Returns the destination as a plain string, which is what the TTS APIs
    below expect.
    """
    if not destination.exists():
        torch.hub.download_url_to_file(url, str(destination))
    return str(destination)


# Fetch the model assets into the app directory (skipped when already present).
config_path = _ensure_downloaded(CONFIG_URL, base_path / 'config.json')
vocab_path = _ensure_downloaded(VOCAB_URL, base_path / 'vocab.json')
# NOTE: the remote checkpoint is named model_2.1.pth but is stored locally
# as model.pth — keep the local name stable across model version bumps.
model_path = _ensure_downloaded(MODEL_URL, base_path / 'model.pth')
speaker_audio_path = _ensure_downloaded(SPEAKER_AUDIO_URL, base_path / 'speaker_reference.wav')

config = XttsConfig()
config.load_json(config_path)

print("Loading model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_path=model_path, use_deepspeed=False, vocab_path=vocab_path, eval=True)
model.to(device)

@spaces.GPU
def infer_EGTTS(text: str, speaker_audio_path: str, temperature: float = 0.75):
    """Synthesize *text* in the voice cloned from *speaker_audio_path*.

    Returns a ``(sample_rate, waveform)`` tuple directly consumable by a
    ``gr.Audio`` output component; the model emits audio at 24 kHz.
    """
    print("Computing speaker latents...")
    latents, embedding = model.get_conditioning_latents(audio_path=[speaker_audio_path])

    print("Inference...")
    result = model.inference(
        text,
        "ar",
        latents,
        embedding,
        temperature=temperature,
    )

    return 24000, result["wav"]

# Usage notes rendered at the top of the interface.
markdown_description = """## Instructions:

1. Enter the text you want to synthesize.
2. Upload a 4-5 seconds audio file of the speaker you want to clone.
3. Click on the "Generate" button.

"""

with gr.Blocks(title="EGTTS") as app:
    gr.HTML("<center><h1>Moroccan-Darija-TTS </h1></center>")
    gr.Markdown(markdown_description)
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to synthesize",
                value="السلام عليكم ورحمة الله",
                rtl=True,
                text_align="right",
                lines=3,
            )
            voice_reference = gr.Audio(
                label="Speaker reference",
                value=speaker_audio_path,
                type="filepath",
            )
            temperature_slider = gr.Slider(
                label="Temperature",
                minimum=0.1,
                maximum=1.0,
                value=0.75,
                step=0.05,
            )
            synthesize_button = gr.Button(value="Generate", variant="primary")
        audio_output = gr.Audio(label="Synthesized audio")

    # Wire the button to the TTS inference entry point.
    synthesize_button.click(
        infer_EGTTS,
        inputs=[text_input, voice_reference, temperature_slider],
        outputs=audio_output,
    )

app.launch()