Spaces:

datnth1709
/

FantasticFour-S2T-MT-demo

Build error

App Files Files Community

datnth1709 committed on Sep 21, 2022

Commit

85cd50e

1 Parent(s): 765e08f

revert

Browse files

Files changed (2) hide show

app.py +46 -107
app_old.py +0 -362

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import gradio as gr
 import nltk
 import librosa
-import soundfile as sf
 from transformers import pipeline
 from transformers.file_utils import cached_path, hf_bucket_url
 import os, zipfile
@@ -80,7 +79,9 @@ def speech2text_vi(audio):
 """English speech2text"""
 nltk.download("punkt")
 # Loading the model and the tokenizer
-eng_asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
 def load_data(input_file):
     """ Function for resampling to ensure that the speech input is sampled at 16KHz.
@@ -93,7 +94,7 @@ def load_data(input_file):
     # Resampling at 16KHz since wav2vec2-base-960h is pretrained and fine-tuned on speech audio sampled at 16 KHz.
     if sample_rate != 16000:
         speech = librosa.resample(speech, sample_rate, 16000)
-    return speech, sample_rate
 def correct_casing(input_sentence):
     """ This function is for correcting the casing of the generated transcribed text
@@ -105,10 +106,18 @@ def correct_casing(input_sentence):
 def speech2text_en(input_file):
     """This function generates transcripts for the provided audio input
     """
-    speech, samplerate = load_data(input_file)
     # Tokenize
-    text = eng_asr(speech)["text"]
-    return text
 """Machine translation"""
@@ -138,33 +147,6 @@ def inference_envi(audio):
     return en_text, vi_text
 def transcribe_vi(audio, state_vi="", state_en=""):
-    ds = speech_file_to_array_fn(audio.name)
-    # infer model
-    input_values = processor(
-          ds["speech"],
-          sampling_rate=ds["sampling_rate"],
-          return_tensors="pt"
-    ).input_values
-    # decode ctc output
-    logits = vi_model(input_values).logits[0]
-    pred_ids = torch.argmax(logits, dim=-1)
-    greedy_search_output = processor.decode(pred_ids)
-    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
-    state_vi += beam_search_output + " "
-    en_text = translate_vi2en(beam_search_output)
-    state_en += en_text + " "
-    return state_vi, state_en
-def transcribe_en(audio, state_en="", state_vi=""):
-    speech, samplerate = load_data(audio)
-    # Tokenize
-    transcription = eng_asr(speech)["text"]
-    state_en += transcription + " "
-    vi_text = translate_en2vi(transcription)
-    state_vi += vi_text + " "
-    return state_en, state_vi
-def transcribe_vi_rm(audio, state_vi="", state_en=""):
     ds = speech_file_to_array_fn(audio.name)
     # infer model
     input_values = processor(
@@ -182,41 +164,23 @@ def transcribe_vi_rm(audio, state_vi="", state_en=""):
     state_en += en_text + " "
     return state_vi, state_en, state_vi, state_en
-def transcribe_en_rm(audio, state_en="", state_vi=""):
-    speech, samplerate = load_data(audio)
     # Tokenize
-    transcription = eng_asr(speech)["text"]
     state_en += transcription + " "
     vi_text = translate_en2vi(transcription)
     state_vi += vi_text + " "
     return state_en, state_vi, state_en, state_vi
-def transcribe_vi_rd(audio, state=""):
-    ds = speech_file_to_array_fn(audio.name)
-    # infer model
-    input_values = processor(
-          ds["speech"],
-          sampling_rate=ds["sampling_rate"],
-          return_tensors="pt"
-    ).input_values
-    # decode ctc output
-    logits = vi_model(input_values).logits[0]
-    pred_ids = torch.argmax(logits, dim=-1)
-    greedy_search_output = processor.decode(pred_ids)
-    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
-    en_text = translate_vi2en(beam_search_output)
-    state += en_text + " "
-    return state, state
-def transcribe_en_rd(audio, state=""):
-    speech, samplerate = load_data(audio)
-    # Tokenize
-    transcription = eng_asr(speech)["text"]
-    transcription = correct_casing(transcription.lower())
-    vi_text = translate_en2vi(transcription)
-    state += vi_text + " "
-    return state, state
 """Gradio demo"""
 vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
@@ -243,39 +207,27 @@ with gr.Blocks() as demo:
             translate_button_vien_1.click(lambda text: translate_vi2en(text), inputs=vietnamese_text, outputs=english_out_1)
             gr.Examples(examples=vi_example_text,
                         inputs=[vietnamese_text])
         with gr.TabItem("Speech2text and Vi-En Translation"):
             with gr.Row():
                 with gr.Column():
-                    vi_audio_1 = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=False)
                     translate_button_vien_2 = gr.Button(value="Translate To English")
                 with gr.Column():
                     speech2text_vi1 = gr.Textbox(label="Vietnamese Text")
                     english_out_2 = gr.Textbox(label="English Text")
-            translate_button_vien_2.click(lambda vi_voice: inference_vien(vi_voice), inputs=vi_audio_1, outputs=[speech2text_vi1, english_out_2])
             gr.Examples(examples=vi_example_voice,
-                        inputs=[vi_audio_1])
         with gr.TabItem("Vi-En Realtime Translation"):
-            # with gr.Row():
-            #     with gr.Column():
-            #         vi_audio_2 = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True)
-            #     with gr.Column():
-            #         speech2text_vi2 = gr.Textbox(label="Vietnamese Text")
-            #         english_out_3 = gr.Textbox(label="English Text")
-            # vi_audio_2.change(transcribe_vi, [vi_audio_2, speech2text_vi2, english_out_3], [speech2text_vi2, english_out_3])
-            gr.Interface(
-                fn=transcribe_vi_rd,
-                inputs=[
-                    gr.Audio(source="microphone", type="file", streaming=True),
-                    "state"
-                ],
-                outputs=[
-                    "textbox",
-                    "state"
-                ],
-                live=True).launch()
     with gr.Tabs():
@@ -303,27 +255,14 @@ with gr.Blocks() as demo:
                         inputs=[en_audio_1])
         with gr.TabItem("En-Vi Realtime Translation"):
-            # with gr.Row():
-            #     with gr.Column():
-            #         en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
-            #     with gr.Column():
-            #         speech2text_en2 = gr.Textbox(label="English Text")
-            #         vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
-            # en_audio_2.change(transcribe_en, [en_audio_2, speech2text_en2, vietnamese_out_3], [speech2text_en2, vietnamese_out_3])
-            # speech2text_en2, vietnamese_out_3 = transcribe_en(en_audio_2, speech2text_en2, vietnamese_out_3)
-            gr.Interface(
-                fn=transcribe_en_rd,
-                inputs=[
-                    gr.Audio(source="microphone", type="filepath", streaming=True),
-                    "state"
-                ],
-                outputs=[
-                    "textbox",
-                    "state"
-                ],
-                live=True).launch()
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
 import nltk
 import librosa
 from transformers import pipeline
 from transformers.file_utils import cached_path, hf_bucket_url
 import os, zipfile
 """English speech2text"""
 nltk.download("punkt")
 # Loading the model and the tokenizer
+model_name = "facebook/wav2vec2-base-960h"
+eng_tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
+eng_model = Wav2Vec2ForCTC.from_pretrained(model_name)
 def load_data(input_file):
     """ Function for resampling to ensure that the speech input is sampled at 16KHz.
     # Resampling at 16KHz since wav2vec2-base-960h is pretrained and fine-tuned on speech audio sampled at 16 KHz.
     if sample_rate != 16000:
         speech = librosa.resample(speech, sample_rate, 16000)
+    return speech
 def correct_casing(input_sentence):
     """ This function is for correcting the casing of the generated transcribed text
 def speech2text_en(input_file):
     """This function generates transcripts for the provided audio input
     """
+    speech = load_data(input_file)
     # Tokenize
+    input_values = eng_tokenizer(speech, return_tensors="pt").input_values
+    # Take logits
+    logits = eng_model(input_values).logits
+    # Take argmax
+    predicted_ids = torch.argmax(logits, dim=-1)
+    # Get the words from predicted word ids
+    transcription = eng_tokenizer.decode(predicted_ids[0])
+    # Output is all upper case
+    transcription = correct_casing(transcription.lower())
+    return transcription
 """Machine translation"""
     return en_text, vi_text
 def transcribe_vi(audio, state_vi="", state_en=""):
     ds = speech_file_to_array_fn(audio.name)
     # infer model
     input_values = processor(
     state_en += en_text + " "
     return state_vi, state_en, state_vi, state_en
+def transcribe_en(audio, state_en="", state_vi=""):
+    speech = load_data(audio)
     # Tokenize
+    input_values = eng_tokenizer(speech, return_tensors="pt").input_values
+    # Take logits
+    logits = eng_model(input_values).logits
+    # Take argmax
+    predicted_ids = torch.argmax(logits, dim=-1)
+    # Get the words from predicted word ids
+    transcription = eng_tokenizer.decode(predicted_ids[0])
+    # Output is all upper case
+    transcription = correct_casing(transcription.lower())
     state_en += transcription + " "
     vi_text = translate_en2vi(transcription)
     state_vi += vi_text + " "
     return state_en, state_vi, state_en, state_vi
 """Gradio demo"""
 vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
             translate_button_vien_1.click(lambda text: translate_vi2en(text), inputs=vietnamese_text, outputs=english_out_1)
             gr.Examples(examples=vi_example_text,
                         inputs=[vietnamese_text])
         with gr.TabItem("Speech2text and Vi-En Translation"):
             with gr.Row():
                 with gr.Column():
+                    vi_audio = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=False)
                     translate_button_vien_2 = gr.Button(value="Translate To English")
                 with gr.Column():
                     speech2text_vi1 = gr.Textbox(label="Vietnamese Text")
                     english_out_2 = gr.Textbox(label="English Text")
+            translate_button_vien_2.click(lambda vi_voice: inference_vien(vi_voice), inputs=vi_audio, outputs=[speech2text_vi1, english_out_2])
             gr.Examples(examples=vi_example_voice,
+                        inputs=[vi_audio])
         with gr.TabItem("Vi-En Realtime Translation"):
+            with gr.Row():
+                with gr.Column():
+                    vi_audio = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True)
+                    translate_button_vien_2 = gr.Button(value="Translate To English")
+                with gr.Column():
+                    speech2text_vi2 = gr.Textbox(label="Vietnamese Text")
+                    english_out_3 = gr.Textbox(label="English Text")
+            vi_audio.change(transcribe_vi, [vi_audio, "state_vi", "state_en"], [speech2text_vi2, english_out_3, "state_vi", "state_en"])
     with gr.Tabs():
                         inputs=[en_audio_1])
         with gr.TabItem("En-Vi Realtime Translation"):
+            with gr.Row():
+                with gr.Column():
+                    en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
+                    # translate_button_envi_2 = gr.Button(value="Translate To Vietnamese")
+                with gr.Column():
+                    speech2text_en2 = gr.Textbox(label="English Text")
+                    vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
+            en_audio_2.change(transcribe_en, [en_audio_2, "state_en", "state_vi"], [speech2text_en2, vietnamese_out_3, "state_en", "state_vi"])
 if __name__ == "__main__":
     demo.launch()

app_old.py DELETED Viewed

@@ -1,362 +0,0 @@
-import gradio as gr
-import nltk
-import librosa
-import soundfile as sf
-from transformers import pipeline
-from transformers.file_utils import cached_path, hf_bucket_url
-import os, zipfile
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2Tokenizer
-from datasets import load_dataset
-import torch
-import kenlm
-import torchaudio
-from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel
-"""Vietnamese speech2text"""
-cache_dir = './cache/'
-processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
-vi_model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
-lm_file = hf_bucket_url("nguyenvulebinh/wav2vec2-base-vietnamese-250h", filename='vi_lm_4grams.bin.zip')
-lm_file = cached_path(lm_file,cache_dir=cache_dir)
-with zipfile.ZipFile(lm_file, 'r') as zip_ref:
-    zip_ref.extractall(cache_dir)
-lm_file = cache_dir + 'vi_lm_4grams.bin'
-def get_decoder_ngram_model(tokenizer, ngram_lm_path):
-    vocab_dict = tokenizer.get_vocab()
-    sort_vocab = sorted((value, key) for (key, value) in vocab_dict.items())
-    vocab = [x[1] for x in sort_vocab][:-2]
-    vocab_list = vocab
-    # convert ctc blank character representation
-    vocab_list[tokenizer.pad_token_id] = ""
-    # replace special characters
-    vocab_list[tokenizer.unk_token_id] = ""
-    # vocab_list[tokenizer.bos_token_id] = ""
-    # vocab_list[tokenizer.eos_token_id] = ""
-    # convert space character representation
-    vocab_list[tokenizer.word_delimiter_token_id] = " "
-    # specify ctc blank char index, since conventially it is the last entry of the logit matrix
-    alphabet = Alphabet.build_alphabet(vocab_list, ctc_token_idx=tokenizer.pad_token_id)
-    lm_model = kenlm.Model(ngram_lm_path)
-    decoder = BeamSearchDecoderCTC(alphabet,
-                                   language_model=LanguageModel(lm_model))
-    return decoder
-ngram_lm_model = get_decoder_ngram_model(processor.tokenizer, lm_file)
-# define function to read in sound file
-def speech_file_to_array_fn(path, max_seconds=10):
-    batch = {"file": path}
-    speech_array, sampling_rate = torchaudio.load(batch["file"])
-    if sampling_rate != 16000:
-      transform = torchaudio.transforms.Resample(orig_freq=sampling_rate,
-                                                 new_freq=16000)
-      speech_array = transform(speech_array)
-    speech_array = speech_array[0]
-    if max_seconds > 0:
-      speech_array = speech_array[:max_seconds*16000]
-    batch["speech"] = speech_array.numpy()
-    batch["sampling_rate"] = 16000
-    return batch
-# tokenize
-def speech2text_vi(audio):
-   # read in sound file
-    # load dummy dataset and read soundfiles
-    ds = speech_file_to_array_fn(audio.name)
-    # infer model
-    input_values = processor(
-          ds["speech"],
-          sampling_rate=ds["sampling_rate"],
-          return_tensors="pt"
-    ).input_values
-    # decode ctc output
-    logits = vi_model(input_values).logits[0]
-    pred_ids = torch.argmax(logits, dim=-1)
-    greedy_search_output = processor.decode(pred_ids)
-    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
-    return beam_search_output
-"""English speech2text"""
-nltk.download("punkt")
-# Loading the model and the tokenizer
-model_name = "facebook/wav2vec2-base-960h"
-eng_tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
-eng_model = Wav2Vec2ForCTC.from_pretrained(model_name)
-def load_data(input_file):
-    """ Function for resampling to ensure that the speech input is sampled at 16KHz.
-    """
-    # read the file
-    speech, sample_rate = librosa.load(input_file)
-    # make it 1-D
-    if len(speech.shape) > 1:
-        speech = speech[:, 0] + speech[:, 1]
-    # Resampling at 16KHz since wav2vec2-base-960h is pretrained and fine-tuned on speech audio sampled at 16 KHz.
-    if sample_rate != 16000:
-        speech = librosa.resample(speech, sample_rate, 16000)
-    return speech, sample_rate
-def correct_casing(input_sentence):
-    """ This function is for correcting the casing of the generated transcribed text
-    """
-    sentences = nltk.sent_tokenize(input_sentence)
-    return (' '.join([s.replace(s[0], s[0].capitalize(), 1) for s in sentences]))
-def speech2text_en(input_file):
-    """This function generates transcripts for the provided audio input
-    """
-    speech, samplerate = load_data(input_file)
-    # Tokenize
-    input_values = eng_tokenizer(speech, sampling_rate = samplerate, return_tensors="pt").input_values
-    # Take logits
-    logits = eng_model(input_values).logits
-    # Take argmax
-    predicted_ids = torch.argmax(logits, dim=-1)
-    # Get the words from predicted word ids
-    transcription = eng_tokenizer.decode(predicted_ids[0])
-    # Output is all upper case
-    transcription = correct_casing(transcription.lower())
-    return transcription
-"""Machine translation"""
-vien_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-vi-en_PhoMT"
-envi_model_checkpoint = "datnth1709/finetuned_HelsinkiNLP-opus-mt-en-vi_PhoMT"
-vien_translator = pipeline("translation", model=vien_model_checkpoint)
-envi_translator = pipeline("translation", model=envi_model_checkpoint)
-def translate_vi2en(Vietnamese):
-    return vien_translator(Vietnamese)[0]['translation_text']
-def translate_en2vi(English):
-    return envi_translator(English)[0]['translation_text']
-""" Inference"""
-def inference_vien(audio):
-    vi_text = speech2text_vi(audio)
-    en_text = translate_vi2en(vi_text)
-    return vi_text, en_text
-def inference_envi(audio):
-    en_text = speech2text_en(audio)
-    vi_text = translate_en2vi(en_text)
-    return en_text, vi_text
-def transcribe_vi(audio, state_vi="", state_en=""):
-    ds = speech_file_to_array_fn(audio.name)
-    # infer model
-    input_values = processor(
-          ds["speech"],
-          sampling_rate=ds["sampling_rate"],
-          return_tensors="pt"
-    ).input_values
-    # decode ctc output
-    logits = vi_model(input_values).logits[0]
-    pred_ids = torch.argmax(logits, dim=-1)
-    greedy_search_output = processor.decode(pred_ids)
-    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
-    state_vi += beam_search_output + " "
-    en_text = translate_vi2en(beam_search_output)
-    state_en += en_text + " "
-    return state_vi, state_en
-def transcribe_en(audio, state_en="", state_vi=""):
-    speech, samplerate = load_data(audio)
-    # Tokenize
-    input_values = eng_tokenizer(speech, sampling_rate = samplerate, return_tensors="pt").input_values
-    # Take logits
-    logits = eng_model(input_values).logits
-    # Take argmax
-    predicted_ids = torch.argmax(logits, dim=-1)
-    # Get the words from predicted word ids
-    transcription = eng_tokenizer.decode(predicted_ids[0])
-    # Output is all upper case
-    transcription = correct_casing(transcription.lower())
-    state_en += transcription + " "
-    vi_text = translate_en2vi(transcription)
-    state_vi += vi_text + " "
-    return state_en, state_vi
-def transcribe_vi_rm(audio, state_vi="", state_en=""):
-    ds = speech_file_to_array_fn(audio.name)
-    # infer model
-    input_values = processor(
-          ds["speech"],
-          sampling_rate=ds["sampling_rate"],
-          return_tensors="pt"
-    ).input_values
-    # decode ctc output
-    logits = vi_model(input_values).logits[0]
-    pred_ids = torch.argmax(logits, dim=-1)
-    greedy_search_output = processor.decode(pred_ids)
-    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
-    state_vi += beam_search_output + " "
-    en_text = translate_vi2en(beam_search_output)
-    state_en += en_text + " "
-    return state_vi, state_en, state_vi, state_en
-def transcribe_en_rm(audio, state_en="", state_vi=""):
-    speech, samplerate = load_data(audio)
-    # Tokenize
-    input_values = eng_tokenizer(speech, sampling_rate = samplerate, return_tensors="pt").input_values
-    # Take logits
-    logits = eng_model(input_values).logits
-    # Take argmax
-    predicted_ids = torch.argmax(logits, dim=-1)
-    # Get the words from predicted word ids
-    transcription = eng_tokenizer.decode(predicted_ids[0])
-    # Output is all upper case
-    transcription = correct_casing(transcription.lower())
-    state_en += transcription + " "
-    vi_text = translate_en2vi(transcription)
-    state_vi += vi_text + " "
-    return state_en, state_vi, state_en, state_vi
-def transcribe_vi_rd(audio, state=""):
-    ds = speech_file_to_array_fn(audio.name)
-    # infer model
-    input_values = processor(
-          ds["speech"],
-          sampling_rate=ds["sampling_rate"],
-          return_tensors="pt"
-    ).input_values
-    # decode ctc output
-    logits = vi_model(input_values).logits[0]
-    pred_ids = torch.argmax(logits, dim=-1)
-    greedy_search_output = processor.decode(pred_ids)
-    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
-    en_text = translate_vi2en(beam_search_output)
-    state += en_text + " "
-    return state, state
-def transcribe_en_rd(audio, state=""):
-    speech, samplerate = load_data(audio)
-    # Tokenize
-    input_values = eng_tokenizer(speech, sampling_rate = samplerate, return_tensors="pt").input_values
-    # Take logits
-    logits = eng_model(input_values).logits
-    # Take argmax
-    predicted_ids = torch.argmax(logits, dim=-1)
-    # Get the words from predicted word ids
-    transcription = eng_tokenizer.decode(predicted_ids[0])
-    # Output is all upper case
-    transcription = correct_casing(transcription.lower())
-    vi_text = translate_en2vi(transcription)
-    state += vi_text + " "
-    return state, state
-"""Gradio demo"""
-vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
-                   "Ánh mắt ta chạm nhau. Chỉ muốn ngắm anh lâu thật lâu.",
-                   "Nếu như một câu nói có thể khiến em vui."]
-vi_example_voice =[['vi_speech_01.wav'], ['vi_speech_02.wav'], ['vi_speech_03.wav']]
-en_example_text = ["According to a study by Statista, the global AI market is set to grow up to 54 percent every single year.",
-                   "As one of the world's greatest cities, Air New Zealand is proud to add the Big Apple to its list of 29 international destinations.",
-                   "And yet, earlier this month, I found myself at Halloween Horror Nights at Universal Orlando Resort, one of the most popular Halloween events in the US among hardcore horror buffs."
-                   ]
-en_example_voice =[['en_speech_01.wav'], ['en_speech_02.wav'], ['en_speech_03.wav']]
-with gr.Blocks() as demo:
-    with gr.Tabs():
-        with gr.TabItem("Translation: Vietnamese to English"):
-            with gr.Row():
-                with gr.Column():
-                    vietnamese_text = gr.Textbox(label="Vietnamese Text")
-                    translate_button_vien_1 = gr.Button(value="Translate To English")
-                with gr.Column():
-                    english_out_1 = gr.Textbox(label="English Text")
-            translate_button_vien_1.click(lambda text: translate_vi2en(text), inputs=vietnamese_text, outputs=english_out_1)
-            gr.Examples(examples=vi_example_text,
-                        inputs=[vietnamese_text])
-        with gr.TabItem("Speech2text and Vi-En Translation"):
-            with gr.Row():
-                with gr.Column():
-                    vi_audio_1 = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=False)
-                    translate_button_vien_2 = gr.Button(value="Translate To English")
-                with gr.Column():
-                    speech2text_vi1 = gr.Textbox(label="Vietnamese Text")
-                    english_out_2 = gr.Textbox(label="English Text")
-            translate_button_vien_2.click(lambda vi_voice: inference_vien(vi_voice), inputs=vi_audio_1, outputs=[speech2text_vi1, english_out_2])
-            gr.Examples(examples=vi_example_voice,
-                        inputs=[vi_audio_1])
-        with gr.TabItem("Vi-En Realtime Translation"):
-            # with gr.Row():
-            #     with gr.Column():
-            #         vi_audio_2 = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True)
-            #     with gr.Column():
-            #         speech2text_vi2 = gr.Textbox(label="Vietnamese Text")
-            #         english_out_3 = gr.Textbox(label="English Text")
-            # vi_audio_2.change(transcribe_vi, [vi_audio_2, speech2text_vi2, english_out_3], [speech2text_vi2, english_out_3])
-            gr.Interface(
-                fn=transcribe_vi_rd,
-                inputs=[
-                    gr.Audio(source="microphone", type="file", streaming=True),
-                    "state"
-                ],
-                outputs=[
-                    "textbox",
-                    "state"
-                ],
-                live=True).launch()
-    with gr.Tabs():
-        with gr.TabItem("Translation: English to Vietnamese"):
-            with gr.Row():
-                with gr.Column():
-                    english_text = gr.Textbox(label="English Text")
-                    translate_button_envi_1 = gr.Button(value="Translate To Vietnamese")
-                with gr.Column():
-                    vietnamese_out_1 = gr.Textbox(label="Vietnamese Text")
-            translate_button_envi_1.click(lambda text: translate_en2vi(text), inputs=english_text, outputs=vietnamese_out_1)
-            gr.Examples(examples=en_example_text,
-                        inputs=[english_text])
-        with gr.TabItem("Speech2text and En-Vi Translation"):
-            with gr.Row():
-                with gr.Column():
-                    en_audio_1 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=False)
-                    translate_button_envi_2 = gr.Button(value="Translate To Vietnamese")
-                with gr.Column():
-                    speech2text_en1 = gr.Textbox(label="English Text")
-                    vietnamese_out_2 = gr.Textbox(label="Vietnamese Text")
-            translate_button_envi_2.click(lambda en_voice: inference_envi(en_voice), inputs=en_audio_1, outputs=[speech2text_en1, vietnamese_out_2])
-            gr.Examples(examples=en_example_voice,
-                        inputs=[en_audio_1])
-        with gr.TabItem("En-Vi Realtime Translation"):
-            # with gr.Row():
-            #     with gr.Column():
-            #         en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
-            #     with gr.Column():
-            #         speech2text_en2 = gr.Textbox(label="English Text")
-            #         vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
-            # en_audio_2.change(transcribe_en, [en_audio_2, speech2text_en2, vietnamese_out_3], [speech2text_en2, vietnamese_out_3])
-            # speech2text_en2, vietnamese_out_3 = transcribe_en(en_audio_2, speech2text_en2, vietnamese_out_3)
-            gr.Interface(
-                fn=transcribe_en_rd,
-                inputs=[
-                    gr.Audio(source="microphone", type="filepath", streaming=True),
-                    "state"
-                ],
-                outputs=[
-                    "textbox",
-                    "state"
-                ],
-                live=True).launch()
-if __name__ == "__main__":
-    demo.launch()