Spaces: Running on Zero
Update app.py
Browse files
app.py
CHANGED
@@ -12,7 +12,7 @@ from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
|
|
12 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
13 |
|
14 |
|
15 |
-
repo_id = "
|
16 |
|
17 |
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
|
18 |
tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
@@ -25,26 +25,54 @@ SEED = 42
|
|
25 |
default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
|
26 |
default_description = "A male voice speaks slowly with a very noisy background, displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue."
|
27 |
examples = [
|
28 |
-
|
29 |
-
|
30 |
-
"
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
]
|
49 |
number_normalizer = EnglishNumberNormalizer()
|
50 |
|
@@ -134,16 +162,13 @@ with gr.Blocks(css=css) as block:
|
|
134 |
"""
|
135 |
)
|
136 |
gr.HTML(
|
137 |
-
|
138 |
<p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
|
139 |
high-fidelity text-to-speech (TTS) models.</p>
|
140 |
-
<p>
|
141 |
-
has been fine-tuned on a French dataset. It generates high-quality speech with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation).
|
142 |
-
Due to limitations on the dataset, this model might underperform for female voices (we recommend using male voices only).</p>
|
143 |
|
144 |
-
<p>By default, Parler-TTS generates 🎲 random
|
145 |
-
<p><b>Note:</b> do
|
146 |
-
<p><b>Important note:</b> this model does NOT work in english, it will generate incoherent audios. But you can still use the original Parler TTS model for that. </p>
|
147 |
"""
|
148 |
)
|
149 |
with gr.Row():
|
|
|
12 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
13 |
|
14 |
|
15 |
+
repo_id = "https://huggingface.co/ylacombe/p-m-e"
|
16 |
|
17 |
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
|
18 |
tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
|
|
25 |
default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
|
26 |
default_description = "A male voice speaks slowly with a very noisy background, displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue."
|
27 |
examples = [
|
28 |
+
# French
|
29 |
+
[
|
30 |
+
"La voix humaine est un instrument de musique au-dessus de tous les autres.",
|
31 |
+
"A male voice speaks slowly with a very noisy background, displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue.",
|
32 |
+
None,
|
33 |
+
],
|
34 |
+
# Spanish
|
35 |
+
[
|
36 |
+
"La voz es el reflejo del alma en el espejo del tiempo.",
|
37 |
+
"A female voice speaks with moderate speed, showing warmth and clarity. The recording is clean with minimal background noise and has natural resonance.",
|
38 |
+
None,
|
39 |
+
],
|
40 |
+
# Italian
|
41 |
+
[
|
42 |
+
"La voce umana è la più bella musica che esista al mondo.",
|
43 |
+
"A male voice delivers the message with passion and depth. The recording has good clarity with slight room acoustics and a medium-distance perspective.",
|
44 |
+
None,
|
45 |
+
],
|
46 |
+
# Portuguese
|
47 |
+
[
|
48 |
+
"A voz é o espelho da alma e o som do coração.",
|
49 |
+
"A young female voice speaks with enthusiasm and energy. The recording is close-miked with crisp audio quality and subtle room ambiance.",
|
50 |
+
None,
|
51 |
+
],
|
52 |
+
# Polish
|
53 |
+
[
|
54 |
+
"Głos ludzki jest najpiękniejszym instrumentem świata.",
|
55 |
+
"An elderly male voice speaks with wisdom and gravitas. The recording has a vintage quality with some characteristic analog warmth.",
|
56 |
+
None,
|
57 |
+
],
|
58 |
+
# German
|
59 |
+
[
|
60 |
+
"Die menschliche Stimme ist das schönste Instrument der Welt.",
|
61 |
+
"A mature female voice speaks with authority and precision. The recording is studio-quality with perfect clarity and no background noise.",
|
62 |
+
None,
|
63 |
+
],
|
64 |
+
# Dutch
|
65 |
+
[
|
66 |
+
"De menselijke stem is het mooiste instrument dat er bestaat.",
|
67 |
+
"A middle-aged male voice speaks with gentle inflection and warmth. The recording has natural room acoustics and balanced frequency response.",
|
68 |
+
None,
|
69 |
+
],
|
70 |
+
# English
|
71 |
+
[
|
72 |
+
"The human voice is nature's most perfect instrument.",
|
73 |
+
"A young male voice speaks with dynamic expression and energy. The recording is professional quality with subtle environmental ambiance.",
|
74 |
+
None,
|
75 |
+
],
|
76 |
]
|
77 |
number_normalizer = EnglishNumberNormalizer()
|
78 |
|
|
|
162 |
"""
|
163 |
)
|
164 |
gr.HTML(
|
165 |
+
f"""
|
166 |
<p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
|
167 |
high-fidelity text-to-speech (TTS) models.</p>
|
168 |
+
<p>This multilingual model supports French, Spanish, Italian, Portuguese, Polish, German, Dutch, and English. It generates high-quality speech with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation). </p>
|
|
|
|
|
169 |
|
170 |
+
<p>By default, Parler-TTS generates 🎲 random voice characteristics. To ensure 🎯 <b>speaker consistency</b> across generations, try to use consistent descriptions in your prompts.</p>
|
171 |
+
<p><b>Note:</b> you do not need to specify the nationality of the speaker in the description (do: "a male speaker", don't: "a french male speaker") </p>
|
|
|
172 |
"""
|
173 |
)
|
174 |
with gr.Row():
|