Duy-NM committed
Commit 70ea763 · 1 Parent(s): c578873
app.py ADDED
@@ -0,0 +1,723 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import annotations

import gradio as gr
import numpy as np
import torch
import torchaudio
from huggingface_hub import hf_hub_download
from seamless_communication.models.inference.translator import Translator

DESCRIPTION = """

# SM4T

This app translates speech or text into speech or text in another language.
\nSM4T currently supports 94 different languages.

"""

TASK_NAMES = [
    "S2ST (Speech to Speech translation)",
    "S2TT (Speech to Text translation)",
    "T2ST (Text to Speech translation)",
    "T2TT (Text to Text translation)",
    "ASR (Automatic Speech Recognition)",
]

# Language dict
language_code_to_name = {
    "afr": "Afrikaans",
    "amh": "Amharic",
    "arb": "Modern Standard Arabic",
    "ary": "Moroccan Arabic",
    "arz": "Egyptian Arabic",
    "asm": "Assamese",
    "ast": "Asturian",
    "azj": "North Azerbaijani",
    "bel": "Belarusian",
    "ben": "Bengali",
    "bos": "Bosnian",
    "bul": "Bulgarian",
    "cat": "Catalan",
    "ceb": "Cebuano",
    "ces": "Czech",
    "ckb": "Central Kurdish",
    "cmn": "Mandarin Chinese",
    "cym": "Welsh",
    "dan": "Danish",
    "deu": "German",
    "ell": "Greek",
    "eng": "English",
    "est": "Estonian",
    "eus": "Basque",
    "fin": "Finnish",
    "fra": "French",
    "gaz": "West Central Oromo",
    "gle": "Irish",
    "glg": "Galician",
    "guj": "Gujarati",
    "heb": "Hebrew",
    "hin": "Hindi",
    "hrv": "Croatian",
    "hun": "Hungarian",
    "hye": "Armenian",
    "ibo": "Igbo",
    "ind": "Indonesian",
    "isl": "Icelandic",
    "ita": "Italian",
    "jav": "Javanese",
    "jpn": "Japanese",
    "kam": "Kamba",
    "kan": "Kannada",
    "kat": "Georgian",
    "kaz": "Kazakh",
    "kea": "Kabuverdianu",
    "khk": "Halh Mongolian",
    "khm": "Khmer",
    "kir": "Kyrgyz",
    "kor": "Korean",
    "lao": "Lao",
    "lit": "Lithuanian",
    "ltz": "Luxembourgish",
    "lug": "Ganda",
    "luo": "Luo",
    "lvs": "Standard Latvian",
    "mai": "Maithili",
    "mal": "Malayalam",
    "mar": "Marathi",
    "mkd": "Macedonian",
    "mlt": "Maltese",
    "mni": "Meitei",
    "mya": "Burmese",
    "nld": "Dutch",
    "nno": "Norwegian Nynorsk",
    "nob": "Norwegian Bokm\u00e5l",
    "npi": "Nepali",
    "nya": "Nyanja",
    "oci": "Occitan",
    "ory": "Odia",
    "pan": "Punjabi",
    "pbt": "Southern Pashto",
    "pes": "Western Persian",
    "pol": "Polish",
    "por": "Portuguese",
    "ron": "Romanian",
    "rus": "Russian",
    "slk": "Slovak",
    "slv": "Slovenian",
    "sna": "Shona",
    "snd": "Sindhi",
    "som": "Somali",
    "spa": "Spanish",
    "srp": "Serbian",
    "swe": "Swedish",
    "swh": "Swahili",
    "tam": "Tamil",
    "tel": "Telugu",
    "tgk": "Tajik",
    "tgl": "Tagalog",
    "tha": "Thai",
    "tur": "Turkish",
    "ukr": "Ukrainian",
    "urd": "Urdu",
    "uzn": "Northern Uzbek",
    "vie": "Vietnamese",
    "xho": "Xhosa",
    "yor": "Yoruba",
    "yue": "Cantonese",
    "zlm": "Colloquial Malay",
    "zsm": "Standard Malay",
    "zul": "Zulu",
}
LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()}

# Source langs: S2ST / S2TT / ASR don't need source lang
# T2TT / T2ST use this
text_source_language_codes = [
    "afr",
    "amh",
    "arb",
    "ary",
    "arz",
    "asm",
    "azj",
    "bel",
    "ben",
    "bos",
    "bul",
    "cat",
    "ceb",
    "ces",
    "ckb",
    "cmn",
    "cym",
    "dan",
    "deu",
    "ell",
    "eng",
    "est",
    "eus",
    "fin",
    "fra",
    "gaz",
    "gle",
    "glg",
    "guj",
    "heb",
    "hin",
    "hrv",
    "hun",
    "hye",
    "ibo",
    "ind",
    "isl",
    "ita",
    "jav",
    "jpn",
    "kan",
    "kat",
    "kaz",
    "khk",
    "khm",
    "kir",
    "kor",
    "lao",
    "lit",
    "lug",
    "luo",
    "lvs",
    "mai",
    "mal",
    "mar",
    "mkd",
    "mlt",
    "mni",
    "mya",
    "nld",
    "nno",
    "nob",
    "npi",
    "nya",
    "ory",
    "pan",
    "pbt",
    "pes",
    "pol",
    "por",
    "ron",
    "rus",
    "slk",
    "slv",
    "sna",
    "snd",
    "som",
    "spa",
    "srp",
    "swe",
    "swh",
    "tam",
    "tel",
    "tgk",
    "tgl",
    "tha",
    "tur",
    "ukr",
    "urd",
    "uzn",
    "vie",
    "yor",
    "yue",
    "zsm",
    "zul",
]
TEXT_SOURCE_LANGUAGE_NAMES = sorted(
    [language_code_to_name[code] for code in text_source_language_codes]
)

# Target langs:
# S2ST / T2ST
s2st_target_language_codes = [
    "eng",
    "arb",
    "ben",
    "cat",
    "ces",
    "cmn",
    "cym",
    "dan",
    "deu",
    "est",
    "fin",
    "fra",
    "hin",
    "ind",
    "ita",
    "jpn",
    "kor",
    "mlt",
    "nld",
    "pes",
    "pol",
    "por",
    "ron",
    "rus",
    "slk",
    "spa",
    "swe",
    "swh",
    "tel",
    "tgl",
    "tha",
    "tur",
    "ukr",
    "urd",
    "uzn",
    "vie",
]
S2ST_TARGET_LANGUAGE_NAMES = sorted(
    [language_code_to_name[code] for code in s2st_target_language_codes]
)
# S2TT / ASR
S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
# T2TT
T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES

# Download sample input audio files
filenames = ["assets/sample_input.mp3", "assets/sample_input_2.mp3"]
for filename in filenames:
    hf_hub_download(
        repo_id="facebook/seamless_m4t",
        repo_type="space",
        filename=filename,
        local_dir=".",
    )

AUDIO_SAMPLE_RATE = 16000.0
MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
DEFAULT_TARGET_LANGUAGE = "French"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
translator = Translator(
    model_name_or_card="seamlessM4T_large",
    vocoder_name_or_card="vocoder_36langs",
    device=device,
    dtype=torch.float16 if "cuda" in device.type else torch.float32,
)
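For reference, a minimal smoke-test sketch for the loaded model, assuming only what `predict()` below already relies on: `Translator.predict(input, task_str, tgt_lang, src_lang)` returning `(text_out, wav, sr)`, with language codes drawn from `language_code_to_name` above.

```python
# Hedged sanity check: call the Translator directly once it has loaded.
# Assumption: same call signature and return shape as used in predict() below.
text_out, wav, sr = translator.predict(
    input="Hello, world!",
    task_str="T2TT",   # text-to-text translation
    tgt_lang="vie",    # Vietnamese (see language_code_to_name)
    src_lang="eng",    # English
)
print(text_out)        # translated text; wav/sr are unused for text-only tasks
```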


def predict(
    task_name: str,
    audio_source: str,
    input_audio_mic: str | None,
    input_audio_file: str | None,
    input_text: str | None,
    source_language: str | None,
    target_language: str,
) -> tuple[tuple[int, np.ndarray] | None, str]:
    task_name = task_name.split()[0]
    source_language_code = (
        LANGUAGE_NAME_TO_CODE[source_language] if source_language else None
    )
    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]

    if task_name in ["S2ST", "S2TT", "ASR"]:
        if audio_source == "microphone":
            input_data = input_audio_mic
        else:
            input_data = input_audio_file

        arr, org_sr = torchaudio.load(input_data)
        new_arr = torchaudio.functional.resample(
            arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE
        )
        max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
        if new_arr.shape[1] > max_length:
            new_arr = new_arr[:, :max_length]
            gr.Warning(
                f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds are used."
            )
        # Write the resampled (and possibly truncated) audio back over the input path.
        torchaudio.save(input_data, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))
    else:
        input_data = input_text
    text_out, wav, sr = translator.predict(
        input=input_data,
        task_str=task_name,
        tgt_lang=target_language_code,
        src_lang=source_language_code,
        ngram_filtering=True,
    )
    if task_name in ["S2ST", "T2ST"]:
        return (sr, wav.cpu().detach().numpy()), text_out
    else:
        return None, text_out
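The audio branch above resamples every input to 16 kHz and keeps at most the first 60 seconds before writing it back to disk. A self-contained sketch of just that preprocessing step, run on a synthetic two-minute tone as a stand-in for `torchaudio.load()` (torchaudio only, no model required):

```python
import torch
import torchaudio

AUDIO_SAMPLE_RATE = 16000.0
MAX_INPUT_AUDIO_LENGTH = 60  # seconds

# Synthetic input: 120 s of a 440 Hz tone sampled at 44.1 kHz.
org_sr = 44100
t = torch.arange(120 * org_sr) / org_sr
arr = torch.sin(2 * torch.pi * 440.0 * t).unsqueeze(0)  # shape (1, samples)

# Resample to the model rate, then truncate to the first 60 s.
new_arr = torchaudio.functional.resample(
    arr, orig_freq=org_sr, new_freq=int(AUDIO_SAMPLE_RATE)
)
max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
if new_arr.shape[1] > max_length:
    new_arr = new_arr[:, :max_length]
print(new_arr.shape)  # torch.Size([1, 960000]) == 60 s * 16000 Hz
```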


def process_s2st_example(
    input_audio_file: str, target_language: str
) -> tuple[tuple[int, np.ndarray] | None, str]:
    return predict(
        task_name="S2ST",
        audio_source="file",
        input_audio_mic=None,
        input_audio_file=input_audio_file,
        input_text=None,
        source_language=None,
        target_language=target_language,
    )


def process_s2tt_example(
    input_audio_file: str, target_language: str
) -> tuple[tuple[int, np.ndarray] | None, str]:
    return predict(
        task_name="S2TT",
        audio_source="file",
        input_audio_mic=None,
        input_audio_file=input_audio_file,
        input_text=None,
        source_language=None,
        target_language=target_language,
    )


def process_t2st_example(
    input_text: str, source_language: str, target_language: str
) -> tuple[tuple[int, np.ndarray] | None, str]:
    return predict(
        task_name="T2ST",
        audio_source="",
        input_audio_mic=None,
        input_audio_file=None,
        input_text=input_text,
        source_language=source_language,
        target_language=target_language,
    )


def process_t2tt_example(
    input_text: str, source_language: str, target_language: str
) -> tuple[tuple[int, np.ndarray] | None, str]:
    return predict(
        task_name="T2TT",
        audio_source="",
        input_audio_mic=None,
        input_audio_file=None,
        input_text=input_text,
        source_language=source_language,
        target_language=target_language,
    )


def process_asr_example(
    input_audio_file: str, target_language: str
) -> tuple[tuple[int, np.ndarray] | None, str]:
    return predict(
        task_name="ASR",
        audio_source="file",
        input_audio_mic=None,
        input_audio_file=input_audio_file,
        input_text=None,
        source_language=None,
        target_language=target_language,
    )


def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
    mic = audio_source == "microphone"
    return (
        gr.update(visible=mic, value=None),  # input_audio_mic
        gr.update(visible=not mic, value=None),  # input_audio_file
    )


def update_input_ui(task_name: str) -> tuple[dict, dict, dict, dict]:
    task_name = task_name.split()[0]
    if task_name == "S2ST":
        return (
            gr.update(visible=True),  # audio_box
            gr.update(visible=False),  # input_text
            gr.update(visible=False),  # source_language
            gr.update(
                visible=True,
                choices=S2ST_TARGET_LANGUAGE_NAMES,
                value=DEFAULT_TARGET_LANGUAGE,
            ),  # target_language
        )
    elif task_name == "S2TT":
        return (
            gr.update(visible=True),  # audio_box
            gr.update(visible=False),  # input_text
            gr.update(visible=False),  # source_language
            gr.update(
                visible=True,
                choices=S2TT_TARGET_LANGUAGE_NAMES,
                value=DEFAULT_TARGET_LANGUAGE,
            ),  # target_language
        )
    elif task_name == "T2ST":
        return (
            gr.update(visible=False),  # audio_box
            gr.update(visible=True),  # input_text
            gr.update(visible=True),  # source_language
            gr.update(
                visible=True,
                choices=S2ST_TARGET_LANGUAGE_NAMES,
                value=DEFAULT_TARGET_LANGUAGE,
            ),  # target_language
        )
    elif task_name == "T2TT":
        return (
            gr.update(visible=False),  # audio_box
            gr.update(visible=True),  # input_text
            gr.update(visible=True),  # source_language
            gr.update(
                visible=True,
                choices=T2TT_TARGET_LANGUAGE_NAMES,
                value=DEFAULT_TARGET_LANGUAGE,
            ),  # target_language
        )
    elif task_name == "ASR":
        return (
            gr.update(visible=True),  # audio_box
            gr.update(visible=False),  # input_text
            gr.update(visible=False),  # source_language
            gr.update(
                visible=True,
                choices=S2TT_TARGET_LANGUAGE_NAMES,
                value=DEFAULT_TARGET_LANGUAGE,
            ),  # target_language
        )
    else:
        raise ValueError(f"Unknown task: {task_name}")


def update_output_ui(task_name: str) -> tuple[dict, dict]:
    task_name = task_name.split()[0]
    if task_name in ["S2ST", "T2ST"]:
        return (
            gr.update(visible=True, value=None),  # output_audio
            gr.update(value=None),  # output_text
        )
    elif task_name in ["S2TT", "T2TT", "ASR"]:
        return (
            gr.update(visible=False, value=None),  # output_audio
            gr.update(value=None),  # output_text
        )
    else:
        raise ValueError(f"Unknown task: {task_name}")


def update_example_ui(task_name: str) -> tuple[dict, dict, dict, dict, dict]:
    task_name = task_name.split()[0]
    return (
        gr.update(visible=task_name == "S2ST"),  # s2st_example_row
        gr.update(visible=task_name == "S2TT"),  # s2tt_example_row
        gr.update(visible=task_name == "T2ST"),  # t2st_example_row
        gr.update(visible=task_name == "T2TT"),  # t2tt_example_row
        gr.update(visible=task_name == "ASR"),  # asr_example_row
    )


css = """
h1 {
  text-align: center;
}

.contain {
  max-width: 730px;
  margin: auto;
  padding-top: 1.5rem;
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Group():
        task_name = gr.Dropdown(
            label="Task",
            choices=TASK_NAMES,
            value=TASK_NAMES[0],
        )
        with gr.Row():
            source_language = gr.Dropdown(
                label="Source language",
                choices=TEXT_SOURCE_LANGUAGE_NAMES,
                value="English",
                visible=False,
            )
            target_language = gr.Dropdown(
                label="Target language",
                choices=S2ST_TARGET_LANGUAGE_NAMES,
                value=DEFAULT_TARGET_LANGUAGE,
            )
        with gr.Row() as audio_box:
            audio_source = gr.Radio(
                label="Audio source",
                choices=["file", "microphone"],
                value="file",
            )
            input_audio_mic = gr.Audio(
                label="Input speech",
                type="filepath",
                source="microphone",
                visible=False,
            )
            input_audio_file = gr.Audio(
                label="Input speech",
                type="filepath",
                source="upload",
                visible=True,
            )
        input_text = gr.Textbox(label="Input text", visible=False)
        with gr.Row():
            btn = gr.Button("Translate")
            btn_clean = gr.ClearButton([input_audio_mic, input_audio_file])
        # gr.Markdown("## Text Examples")
        with gr.Column():
            output_audio = gr.Audio(
                label="Translated speech",
                autoplay=False,
                streaming=False,
                type="numpy",
            )
            output_text = gr.Textbox(label="Translated text")

    with gr.Row(visible=True) as s2st_example_row:
        s2st_examples = gr.Examples(
            examples=[
                ["assets/sample_input.mp3", "French"],
                ["assets/sample_input.mp3", "Mandarin Chinese"],
                ["assets/sample_input_2.mp3", "Hindi"],
                ["assets/sample_input_2.mp3", "Spanish"],
            ],
            inputs=[input_audio_file, target_language],
            outputs=[output_audio, output_text],
            fn=process_s2st_example,
        )
    with gr.Row(visible=False) as s2tt_example_row:
        s2tt_examples = gr.Examples(
            examples=[
                ["assets/sample_input.mp3", "French"],
                ["assets/sample_input.mp3", "Mandarin Chinese"],
                ["assets/sample_input_2.mp3", "Hindi"],
                ["assets/sample_input_2.mp3", "Spanish"],
            ],
            inputs=[input_audio_file, target_language],
            outputs=[output_audio, output_text],
            fn=process_s2tt_example,
        )
    with gr.Row(visible=False) as t2st_example_row:
        t2st_examples = gr.Examples(
            examples=[
                ["My favorite animal is the elephant.", "English", "French"],
                ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
                [
                    "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
                    "English",
                    "Hindi",
                ],
                [
                    "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
                    "English",
                    "Spanish",
                ],
            ],
            inputs=[input_text, source_language, target_language],
            outputs=[output_audio, output_text],
            fn=process_t2st_example,
        )
    with gr.Row(visible=False) as t2tt_example_row:
        t2tt_examples = gr.Examples(
            examples=[
                ["My favorite animal is the elephant.", "English", "French"],
                ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
                [
                    "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
                    "English",
                    "Hindi",
                ],
                [
                    "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
                    "English",
                    "Spanish",
                ],
            ],
            inputs=[input_text, source_language, target_language],
            outputs=[output_audio, output_text],
            fn=process_t2tt_example,
        )
    with gr.Row(visible=False) as asr_example_row:
        asr_examples = gr.Examples(
            examples=[
                ["assets/sample_input.mp3", "English"],
                ["assets/sample_input_2.mp3", "English"],
            ],
            inputs=[input_audio_file, target_language],
            outputs=[output_audio, output_text],
            fn=process_asr_example,
        )

    audio_source.change(
        fn=update_audio_ui,
        inputs=audio_source,
        outputs=[
            input_audio_mic,
            input_audio_file,
        ],
        queue=False,
        api_name=False,
    )
    task_name.change(
        fn=update_input_ui,
        inputs=task_name,
        outputs=[
            audio_box,
            input_text,
            source_language,
            target_language,
        ],
        queue=False,
        api_name=False,
    ).then(
        fn=update_output_ui,
        inputs=task_name,
        outputs=[output_audio, output_text],
        queue=False,
        api_name=False,
    ).then(
        fn=update_example_ui,
        inputs=task_name,
        outputs=[
            s2st_example_row,
            s2tt_example_row,
            t2st_example_row,
            t2tt_example_row,
            asr_example_row,
        ],
        queue=False,
        api_name=False,
    )

    btn.click(
        fn=predict,
        inputs=[
            task_name,
            audio_source,
            input_audio_mic,
            input_audio_file,
            input_text,
            source_language,
            target_language,
        ],
        outputs=[output_audio, output_text],
        api_name="run",
    )

if __name__ == "__main__":
    demo.queue().launch()
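Because the translate button is registered with `api_name="run"`, the Space also exposes a programmatic endpoint. A hedged sketch using `gradio_client`, where the Space id is a hypothetical placeholder and the positional arguments mirror the `inputs` list wired to `btn.click` above:

```python
from gradio_client import Client

client = Client("Duy-NM/SM4T")  # hypothetical Space id; substitute the real deployment
audio_out, text_out = client.predict(
    "T2TT (Text to Text translation)",      # task_name
    "file",                                 # audio_source (ignored for text tasks)
    None,                                   # input_audio_mic
    None,                                   # input_audio_file
    "My favorite animal is the elephant.",  # input_text
    "English",                              # source_language
    "French",                               # target_language
    api_name="/run",
)
print(text_out)  # audio_out is a file path for speech tasks, None otherwise
```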
assets/sample_input.mp3 ADDED
Binary file (10.3 kB)

assets/sample_input_2.mp3 ADDED
Binary file (30.6 kB)
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ fairseq2
+ git+https://github.com/facebookresearch/seamless_communication
+ gradio
+ huggingface_hub
+ torch
+ torchaudio