File size: 5,996 Bytes
47103bb
 
810c095
47103bb
 
 
 
 
810c095
47103bb
810c095
 
 
 
 
 
 
 
 
47103bb
 
 
810c095
 
47103bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d7da474
47103bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70da9fa
47103bb
810c095
47103bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fcf4cd0
47103bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fcf4cd0
47103bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
810c095
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import sys
import os
from fastapi import Request
# By using XTTS you agree to CPML license https://coqui.ai/cpml
os.environ["COQUI_TOS_AGREED"] = "1"

import gradio as gr
from TTS.api import TTS
from TTS.utils.manage import ModelManager
model_names = TTS().list_models()
print(model_names.__dict__)
print(model_names.__dir__())
model_name = "tts_models/multilingual/multi-dataset/xtts_v2" # move in v2, since xtts_v1 is generated keyerror, I guess you can select it with old github's release.

#m = ModelManager().download_model(model_name)
#print(m)
m = model_name

tts = TTS(model_name, gpu=False)
tts.to("cpu") # no GPU or Amd
#tts.to("cuda") # cuda only


def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, agree, request: gr.Request):
    if agree == True:
        if use_mic == True:
            if mic_file_path is not None:
                speaker_wav=mic_file_path
            else:
                gr.Warning("Please record your voice with Microphone, or uncheck Use Microphone to use reference audios")
                return (
                    None,
                    None,
                ) 
                
        else:
            speaker_wav=audio_file_pth

        if len(prompt)<2:
            gr.Warning("Please give a longer prompt text")
            return (
                    None,
                    None,
                )
        if len(prompt)>100000:
            gr.Warning("Text length limited to 10000 characters for this demo, please try shorter text")
            return (
                    None,
                    None,
                )  
        try:
            if language == "fr":
                if m.find("your") != -1:
                    language = "fr-fr"
            if m.find("/fr/") != -1:
                language = None
            tts.tts_to_file(
                text=prompt,
                file_path="output.wav",
                speaker_wav=speaker_wav,
                language=language
            )
        except RuntimeError as e :
            if "device-assert" in str(e):
                # cannot do anything on cuda device side error, need tor estart
                gr.Warning("Unhandled Exception encounter, please retry in a minute")
                print("Cuda device-assert Runtime encountered need restart")
                sys.exit("Exit due to cuda device-assert")
            else:
                raise e
            
        return (
            gr.make_waveform(
                audio="output.wav",
            ),
            "output.wav",
        )
    else:
        gr.Warning("Please accept the Terms & Condition!")
        return (
                None,
                None,
            ) 


title = "XTTS Voice Cloning"

description = f"""
<a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 3-second audio clip. 
<br/>
XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible. 
<br/>
This is the same model that powers our creator application <a href="https://coqui.ai">Coqui Studio</a> as well as the <a href="https://docs.coqui.ai">Coqui API</a>. In production we apply modifications to make low-latency streaming possible.
<br/>
Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">TTS</a>, where our open-source inference and training code lives.
<br/>
<p>For faster inference without waiting in the queue, you should duplicate this space and upgrade to GPU via the settings.
<br/>
<a href="https://huggingface.co/spaces/coqui/xtts?duplicate=true">
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
</p>
"""

article = """
<div style='margin:20px auto;'>
<p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
</div>
"""


gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(
            label="Text Prompt",
            info="One or two sentences at a time is better",
            value="Buen día!!!,  que hagas ,  un buen dia!!!! este otra parte , es para generar un poco mas de texto, y entonaciones, para que quede un audio ,un poco mas largo , asi ,nos queda una buena muestra para cuando se necesita audios mas largos , lamejor calidad de audio va a brindar mejores resultaods ,hola mundo  !, te deseo un dia maravilloso  ...",
        ),
        gr.Dropdown(
            label="Language",
            info="Select an output language for the synthesised speech",
            choices=[
                "en",
                "es",
                "fr",
                "de",
                "it",
                "pt",
                "pl",
                "tr",
                "ru",
                "nl",
                "cs",
                "ar",
                "zh-cn",
            ],
            max_choices=1,
            value="es",
        ),
        gr.Audio(
            label="Reference Audio",
            info="Click on the ✎ button to upload your own target speaker audio",
            type="filepath",
        ),
        gr.Audio(source="microphone",
                 type="filepath",
                 info="Use your microphone to record audio",
                 label="Use Microphone for Reference"),
        gr.Checkbox(label="Check to use Microphone as Reference",
                    value=False,
                    info="Notice: Microphone input may not work properly under traffic",),
        gr.Checkbox(
            label="Agree",
            value=True,
            info="I agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml",
        ),
    ],
    outputs=[
        gr.Video(label="Waveform Visual"),
        gr.Audio(label="Synthesised Audio"),
    ],
    title=title,
    description=description,
    article=article,
).queue().launch(debug=True)