Update app.py
app.py CHANGED
@@ -54,17 +54,18 @@ pipe_dict = {
     "language": "english",
 }
 
-title = """
-…
+title = """
+# Explore MMS finetuning
+## Or how to access truly multilingual TTS
+
+Massively Multilingual Speech (MMS) models are lightweight, low-latency TTS models based on the [VITS architecture](https://huggingface.co/docs/transformers/model_doc/vits).
+
+Meta's [MMS](https://arxiv.org/abs/2305.13516) project aims to provide speech technology across a diverse range of languages. You can find more details about the supported languages and their ISO 639-3 codes in the [MMS Language Coverage Overview](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html),
+and see all MMS-TTS checkpoints on the Hugging Face Hub: [facebook/mms-tts](https://huggingface.co/models?sort=trending&search=facebook%2Fmms-tts).
+
+Coupled with the right data and the right training recipe, you can get an excellent finetuned version of any MMS checkpoint in **20 minutes** with as little as **80 to 150 samples**.
+
+Stay tuned, the training recipe is coming soon!
 """
 
 max_speakers = 15
@@ -150,42 +151,62 @@ with gr.Blocks(css=css) as demo_blocks:
         out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
         outputs.append(out_audio)
 
-    gr.Markdown("""
-…
-### Spanish
-
-* **Model**: [Spanish MMS TTS](https://huggingface.co/facebook/mms-tts-spa).
-* **Datasets**:
-  - [Chilean Spanish TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-spanish).
-
-### Tamil
-
-* **Model**: [Tamil MMS TTS](https://huggingface.co/facebook/mms-tts-tam).
-* **Datasets**:
-  - [Tamil TTS dataset](https://huggingface.co/datasets/ylacombe/google-tamil).
-
-…
-
-### Marathi
-
-* **Model**: [Marathi MMS TTS](https://huggingface.co/facebook/mms-tts-mar).
-* **Datasets**:
-  - [Marathi TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-marathi).
-
-### English
-
-* **Model**: [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs)
-* **Dataset**: [British Isles Accent](https://huggingface.co/datasets/ylacombe/english_dialects). For each accent, we used 100 to 150 samples of a single speaker to finetune [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs).
-
-""")
+    with gr.Accordion("Datasets and models details", open=False):
+        gr.Markdown("""
+
+For each language, we used 100 to 150 samples of a single speaker to finetune the model.
+
+### Spanish
+
+* **Model**: [Spanish MMS TTS](https://huggingface.co/facebook/mms-tts-spa).
+* **Datasets**:
+  - [Chilean Spanish TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-spanish).
+
+### Tamil
+
+* **Model**: [Tamil MMS TTS](https://huggingface.co/facebook/mms-tts-tam).
+* **Datasets**:
+  - [Tamil TTS dataset](https://huggingface.co/datasets/ylacombe/google-tamil).
+
+### Gujarati
+
+* **Model**: [Gujarati MMS TTS](https://huggingface.co/facebook/mms-tts-guj).
+* **Datasets**:
+  - [Gujarati TTS dataset](https://huggingface.co/datasets/ylacombe/google-gujarati).
+
+### Marathi
+
+* **Model**: [Marathi MMS TTS](https://huggingface.co/facebook/mms-tts-mar).
+* **Datasets**:
+  - [Marathi TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-marathi).
+
+### English
+
+* **Model**: [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs)
+* **Dataset**: [British Isles Accent](https://huggingface.co/datasets/ylacombe/english_dialects). For each accent, we used 100 to 150 samples of a single speaker to finetune [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs).
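+
+All of these datasets are hosted on the Hugging Face Hub, so you can inspect them directly. A minimal sketch, assuming the `datasets` library is installed and that the dataset exposes a default config with a `train` split:
+
+```py
+from datasets import load_dataset
+
+# assumption: default config and "train" split; adjust to the dataset's actual layout
+ds = load_dataset("ylacombe/google-chilean-spanish", split="train")
+print(ds)  # column names and number of rows
+```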
+
+
+""")
+
+    with gr.Accordion("Run VITS and MMS with transformers", open=False):
+        gr.Markdown(
+            """
+```bash
+pip install transformers
+```
+```py
+from transformers import pipeline
+import scipy.io.wavfile
+
+pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=0)
+
+results = pipe("A cinematic shot of a baby racoon wearing an intricate italian priest robe")
+
+# write to a wav file
+scipy.io.wavfile.write("audio_vits.wav", rate=results["sampling_rate"], data=results["audio"].squeeze())
+```
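+
+The same pipeline call should also work with any of the MMS-TTS checkpoints listed above; a minimal sketch, assuming the Spanish checkpoint:
+```py
+# assumption: MMS-TTS checkpoints return the same {"audio", "sampling_rate"} dict
+mms_pipe = pipeline("text-to-speech", model="facebook/mms-tts-spa", device=0)
+mms_out = mms_pipe("Hola, ¿cómo estás?")
+scipy.io.wavfile.write("audio_mms.wav", rate=mms_out["sampling_rate"], data=mms_out["audio"].squeeze())
+```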
+            """
+        )
+
 
     language.change(lambda language: gr.Dropdown(
         models_per_language[language],