ylacombe committed on
Commit
b3965f2
·
1 Parent(s): e6561b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -43
app.py CHANGED
@@ -54,17 +54,18 @@ pipe_dict = {
54
  "language": "english",
55
  }
56
 
57
- title = """# Explore MMS finetuning
58
- ## Or how to access truely multilingual TTS
 
59
 
60
- Massively Multilingual Speech (MMS) models are light-weight, low-latency TTS models based on the [VITS architecture](https://huggingface.co/docs/transformers/model_doc/vits).
61
-
62
- Meta's [MMS](https://arxiv.org/abs/2305.13516) project, aiming to provide speech technology across a diverse range of languages. You can find more details about the supported languages and their ISO 639-3 codes in the [MMS Language Coverage Overview](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html),
63
- and see all MMS-TTS checkpoints on the Hugging Face Hub: [facebook/mms-tts](https://huggingface.co/models?sort=trending&search=facebook%2Fmms-tts).
64
-
65
- Coupled with the right data and the right training recipe, you can get an excellent finetuned version of every MMS checkpoints in **20 minutes** with as little as **80 to 150 samples**.
66
-
67
- Stay tuned, the training recipe is coming soon!
68
  """
69
 
70
  max_speakers = 15
@@ -150,42 +151,62 @@ with gr.Blocks(css=css) as demo_blocks:
150
  out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
151
  outputs.append(out_audio)
152
 
153
- gr.Markdown("""
154
- ## Datasets and models details
155
-
156
- For each language, we used 100 to 150 samples of a single speaker to finetune the model.
157
-
158
- ### Spanish
159
-
160
- * **Model**: [Spanish MMS TTS](https://huggingface.co/facebook/mms-tts-spa).
161
- * **Datasets**:
162
- - [Chilean Spanish TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-spanish).
163
 
164
- ### Tamil
165
-
166
- * **Model**: [Tamil MMS TTS](https://huggingface.co/facebook/mms-tts-tam).
167
- * **Datasets**:
168
- - [Tamil TTS dataset](https://huggingface.co/datasets/ylacombe/google-tamil).
169
 
170
- ### Gujarati
171
-
172
- * **Model**: [Gujarati MMS TTS](https://huggingface.co/facebook/mms-tts-guj).
173
- * **Datasets**:
174
- - [Gujarati TTS dataset](https://huggingface.co/datasets/ylacombe/google-gujarati).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
- ### Marathi
177
-
178
- * **Model**: [Marathi MMS TTS](https://huggingface.co/facebook/mms-tts-mar).
179
- * **Datasets**:
180
- - [Marathi TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-marathi).
181
-
182
- ### English
183
-
184
- * **Model**: [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs)
185
- * **Dataset**: [British Isles Accent](https://huggingface.co/datasets/ylacombe/english_dialects). For each accent, we used 100 to 150 samples of a single speaker to finetune [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs).
186
-
187
-
188
- """)
189
 
190
  language.change(lambda language: gr.Dropdown(
191
  models_per_language[language],
 
54
  "language": "english",
55
  }
56
 
57
+ title = """
58
+ # Explore MMS finetuning
59
+ ## Or how to access truly multilingual TTS
60
 
61
+ Massively Multilingual Speech (MMS) models are light-weight, low-latency TTS models based on the [VITS architecture](https://huggingface.co/docs/transformers/model_doc/vits).
62
+
63
+ These models are part of Meta's [MMS](https://arxiv.org/abs/2305.13516) project, which aims to provide speech technology across a diverse range of languages. You can find more details about the supported languages and their ISO 639-3 codes in the [MMS Language Coverage Overview](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html),
64
+ and see all MMS-TTS checkpoints on the Hugging Face Hub: [facebook/mms-tts](https://huggingface.co/models?sort=trending&search=facebook%2Fmms-tts).
65
+
66
+ Coupled with the right data and the right training recipe, you can get an excellent finetuned version of every MMS checkpoint in **20 minutes** with as little as **80 to 150 samples**.
67
+
68
+ Stay tuned, the training recipe is coming soon!
69
  """
70
 
71
  max_speakers = 15
 
151
  out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
152
  outputs.append(out_audio)
153
 
154
+ with gr.Accordion("Datasets and models details", open=False):
155
+ gr.Markdown("""
156
+
157
+ For each language, we used 100 to 150 samples of a single speaker to finetune the model.
 
 
 
 
 
 
158
 
159
+ ### Spanish
 
 
 
 
160
 
161
+ * **Model**: [Spanish MMS TTS](https://huggingface.co/facebook/mms-tts-spa).
162
+ * **Datasets**:
163
+ - [Chilean Spanish TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-spanish).
164
+
165
+ ### Tamil
166
+
167
+ * **Model**: [Tamil MMS TTS](https://huggingface.co/facebook/mms-tts-tam).
168
+ * **Datasets**:
169
+ - [Tamil TTS dataset](https://huggingface.co/datasets/ylacombe/google-tamil).
170
+
171
+ ### Gujarati
172
+
173
+ * **Model**: [Gujarati MMS TTS](https://huggingface.co/facebook/mms-tts-guj).
174
+ * **Datasets**:
175
+ - [Gujarati TTS dataset](https://huggingface.co/datasets/ylacombe/google-gujarati).
176
+
177
+ ### Marathi
178
+
179
+ * **Model**: [Marathi MMS TTS](https://huggingface.co/facebook/mms-tts-mar).
180
+ * **Datasets**:
181
+ - [Marathi TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-marathi).
182
+
183
+ ### English
184
+
185
+ * **Model**: [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs)
186
+ * **Dataset**: [British Isles Accent](https://huggingface.co/datasets/ylacombe/english_dialects). For each accent, we used 100 to 150 samples of a single speaker to finetune [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs).
187
+
188
+
189
+ """)
190
+
191
+ with gr.Accordion("Run VITS and MMS with transformers", open=False):
192
+ gr.Markdown(
193
+ """
194
+ ```bash
195
+ pip install transformers
196
+ ```
197
+ ```py
198
+ from transformers import pipeline
199
+ import scipy
200
+ pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=0)
201
+
202
+ results = pipe("A cinematic shot of a baby racoon wearing an intricate italian priest robe")
203
+
204
+ # write to a wav file
205
+ scipy.io.wavfile.write("audio_vits.wav", rate=results["sampling_rate"], data=results["audio"].squeeze())
206
+ ```
207
+ """
208
+ )
209
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
  language.change(lambda language: gr.Dropdown(
212
  models_per_language[language],