ylacombe commited on
Commit
e6561b0
·
1 Parent(s): 1bef11b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -11
app.py CHANGED
@@ -54,13 +54,20 @@ pipe_dict = {
54
  "language": "english",
55
  }
56
 
57
- title = "# 🐶 VITS"
 
58
 
59
- max_speakers = 15
60
-
61
- description = """
 
 
 
 
 
 
62
 
63
- """
64
 
65
 
66
  # Inference
@@ -104,13 +111,24 @@ def generate_audio(text, model_id, language):
104
  return out
105
 
106
 
 
 
 
 
 
 
 
 
 
 
 
107
  # Gradio blocks demo
108
- with gr.Blocks() as demo_blocks:
109
- gr.Markdown(title)
110
- gr.Markdown(description)
111
  with gr.Row():
112
  with gr.Column():
113
- inp_text = gr.Textbox(label="Input Text", info="What would you like VITS to synthesise?")
114
  btn = gr.Button("Generate Audio!")
115
  language = gr.Dropdown(
116
  default_model_per_language.keys(),
@@ -120,18 +138,55 @@ with gr.Blocks() as demo_blocks:
120
  )
121
 
122
  model_id = gr.Dropdown(
123
- models_per_language["english"],
124
  value="ylacombe/mms-spa-finetuned-chilean-monospeaker",
125
  label="Model",
126
  info="Model you want to test",
127
  )
128
-
129
  with gr.Column():
130
  outputs = []
131
  for i in range(max_speakers):
132
  out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
133
  outputs.append(out_audio)
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  language.change(lambda language: gr.Dropdown(
136
  models_per_language[language],
137
  value=models_per_language[language][0],
 
54
  "language": "english",
55
  }
56
 
57
+ title = """# Explore MMS finetuning
58
+ ## Or how to access truly multilingual TTS
59
 
60
+ Massively Multilingual Speech (MMS) models are light-weight, low-latency TTS models based on the [VITS architecture](https://huggingface.co/docs/transformers/model_doc/vits).
61
+
62
+ Meta's [MMS](https://arxiv.org/abs/2305.13516) project aims to provide speech technology across a diverse range of languages. You can find more details about the supported languages and their ISO 639-3 codes in the [MMS Language Coverage Overview](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html),
63
+ and see all MMS-TTS checkpoints on the Hugging Face Hub: [facebook/mms-tts](https://huggingface.co/models?sort=trending&search=facebook%2Fmms-tts).
64
+
65
+ Coupled with the right data and the right training recipe, you can get an excellent finetuned version of every MMS checkpoint in **20 minutes** with as little as **80 to 150 samples**.
66
+
67
+ Stay tuned, the training recipe is coming soon!
68
+ """
69
 
70
+ max_speakers = 15
71
 
72
 
73
  # Inference
 
111
  return out
112
 
113
 
114
+ css = """
115
+ #container{
116
+ margin: 0 auto;
117
+ max-width: 80rem;
118
+ }
119
+ #intro{
120
+ max-width: 100%;
121
+ text-align: center;
122
+ margin: 0 auto;
123
+ }
124
+ """
125
  # Gradio blocks demo
126
+ with gr.Blocks(css=css) as demo_blocks:
127
+ gr.Markdown(title, elem_id="intro")
128
+
129
  with gr.Row():
130
  with gr.Column():
131
+ inp_text = gr.Textbox(label="Input Text", info="What sentence would you like to synthesise?")
132
  btn = gr.Button("Generate Audio!")
133
  language = gr.Dropdown(
134
  default_model_per_language.keys(),
 
138
  )
139
 
140
  model_id = gr.Dropdown(
141
+ models_per_language["spanish"],
142
  value="ylacombe/mms-spa-finetuned-chilean-monospeaker",
143
  label="Model",
144
  info="Model you want to test",
145
  )
146
+
147
  with gr.Column():
148
  outputs = []
149
  for i in range(max_speakers):
150
  out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
151
  outputs.append(out_audio)
152
 
153
+ gr.Markdown("""
154
+ ## Datasets and models details
155
+
156
+ For each language, we used 100 to 150 samples of a single speaker to finetune the model.
157
+
158
+ ### Spanish
159
+
160
+ * **Model**: [Spanish MMS TTS](https://huggingface.co/facebook/mms-tts-spa).
161
+ * **Datasets**:
162
+ - [Chilean Spanish TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-spanish).
163
+
164
+ ### Tamil
165
+
166
+ * **Model**: [Tamil MMS TTS](https://huggingface.co/facebook/mms-tts-tam).
167
+ * **Datasets**:
168
+ - [Tamil TTS dataset](https://huggingface.co/datasets/ylacombe/google-tamil).
169
+
170
+ ### Gujarati
171
+
172
+ * **Model**: [Gujarati MMS TTS](https://huggingface.co/facebook/mms-tts-guj).
173
+ * **Datasets**:
174
+ - [Gujarati TTS dataset](https://huggingface.co/datasets/ylacombe/google-gujarati).
175
+
176
+ ### Marathi
177
+
178
+ * **Model**: [Marathi MMS TTS](https://huggingface.co/facebook/mms-tts-mar).
179
+ * **Datasets**:
180
+ - [Marathi TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-marathi).
181
+
182
+ ### English
183
+
184
+ * **Model**: [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs)
185
+ * **Dataset**: [British Isles Accent](https://huggingface.co/datasets/ylacombe/english_dialects). For each accent, we used 100 to 150 samples of a single speaker to finetune [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs).
186
+
187
+
188
+ """)
189
+
190
  language.change(lambda language: gr.Dropdown(
191
  models_per_language[language],
192
  value=models_per_language[language][0],