multi_parler_tts

Running on Zero

App Files Files Community

PHBJT committed on Oct 30, 2024

Commit

a1cb9c1

verified ·

1 Parent(s): 453ef66

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -28

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
-repo_id =  "PHBJT/french_parler_tts_mini_v0.1"
 model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
 tokenizer = AutoTokenizer.from_pretrained(repo_id)
@@ -25,26 +25,54 @@ SEED = 42
 default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
 default_description = "A male voice speaks slowly with a very noisy background, displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue."
 examples = [
-[
-"La voix humaine est un instrument de musique au-dessus de tous les autres.",
-"A male voice speaks slowly with a very noisy background, displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue.",
-None,
-],
-[
-"Tout ce qu'un homme est capable d'imaginer, d'autres hommes seront capables de le réaliser.",
-"A male voice delivers a slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice, creating a close-sounding audio experience.",
-None,
-],
-[
-"La machine elle-même, si perfectionnée qu'on la suppose, n'est qu'un outil.",
-"A male voice provides a monotone yet slightly fast delivery, with a very close recording that almost has no background noise.",
-None,
-],
-[
-"Le progrès fait naître plus de besoins qu'il n'en satisfait.",
-"A female voice, in a very poor recording quality, delivers slightly expressive and animated words with a fast pace. There's a high level of background noise and a very distant-sounding reverberation. The voice is slightly higher pitched than average.",
-None,
-],
 ]
 number_normalizer = EnglishNumberNormalizer()
@@ -134,16 +162,13 @@ with gr.Blocks(css=css) as block:
         """
     )
     gr.HTML(
-        f"""
        <p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
 high-fidelity text-to-speech (TTS) models.</p>
-<p>The model demonstrated here, French Parler-TTS <a href="https://huggingface.co/PHBJT/french_parler_tts_mini_v0.1">Mini v0.1 French</a>,
-has been fine-tuned on a French dataset. It generates high-quality speech with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation).
-Due to limitations on the dataset, this model might underperform for female voices (we recommend using male voices only).</p>
-<p>By default, Parler-TTS generates 🎲 random male voice characteristics. To ensure 🎯 <b>speaker consistency</b> across generations, try to use consistent descriptions in your prompts.</p>
-<p><b>Note:</b> do NOT specify the nationnality of the speaker it will cause inconsistent audio generation (do: "a male speaker", don't: "a french male speaker") </p>
-<p><b>Important note:</b> this model does NOT work in english, it will generate incoherent audios. But you can still use the original Parler TTS model for that. </p>
         """
     )
     with gr.Row():

 device = "cuda:0" if torch.cuda.is_available() else "cpu"
+repo_id =  "https://huggingface.co/ylacombe/p-m-e"
 model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
 tokenizer = AutoTokenizer.from_pretrained(repo_id)
 default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
 default_description = "A male voice speaks slowly with a very noisy background, displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue."
 examples = [
+    # French
+    [
+        "La voix humaine est un instrument de musique au-dessus de tous les autres.",
+        "A male voice speaks slowly with a very noisy background, displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue.",
+        None,
+    ],
+    # Spanish
+    [
+        "La voz es el reflejo del alma en el espejo del tiempo.",
+        "A female voice speaks with moderate speed, showing warmth and clarity. The recording is clean with minimal background noise and has natural resonance.",
+        None,
+    ],
+    # Italian
+    [
+        "La voce umana è la più bella musica che esista al mondo.",
+        "A male voice delivers the message with passion and depth. The recording has good clarity with slight room acoustics and a medium-distance perspective.",
+        None,
+    ],
+    # Portuguese
+    [
+        "A voz é o espelho da alma e o som do coração.",
+        "A young female voice speaks with enthusiasm and energy. The recording is close-miked with crisp audio quality and subtle room ambiance.",
+        None,
+    ],
+    # Polish
+    [
+        "Głos ludzki jest najpiękniejszym instrumentem świata.",
+        "An elderly male voice speaks with wisdom and gravitas. The recording has a vintage quality with some characteristic analog warmth.",
+        None,
+    ],
+    # German
+    [
+        "Die menschliche Stimme ist das schönste Instrument der Welt.",
+        "A mature female voice speaks with authority and precision. The recording is studio-quality with perfect clarity and no background noise.",
+        None,
+    ],
+    # Dutch
+    [
+        "De menselijke stem is het mooiste instrument dat er bestaat.",
+        "A middle-aged male voice speaks with gentle inflection and warmth. The recording has natural room acoustics and balanced frequency response.",
+        None,
+    ],
+    # English
+    [
+        "The human voice is nature's most perfect instrument.",
+        "A young male voice speaks with dynamic expression and energy. The recording is professional quality with subtle environmental ambiance.",
+        None,
+    ],
 ]
 number_normalizer = EnglishNumberNormalizer()
         """
     )
     gr.HTML(
+f"""
        <p><a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> is a training and inference library for
 high-fidelity text-to-speech (TTS) models.</p>
+<p>This multilingual model supports French, Spanish, Italian, Portuguese, Polish, German, Dutch, and English. It generates high-quality speech with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation). </p>
+<p>By default, Parler-TTS generates 🎲 random voice characteristics. To ensure 🎯 <b>speaker consistency</b> across generations, try to use consistent descriptions in your prompts.</p>
+<p><b>Note:</b> you do not need to specify the nationality of the speaker in the description (do: "a male speaker", don't: "a french male speaker") </p>
         """
     )
     with gr.Row():