new dependency
- app.py +31 -3
- requirements.txt +2 -1
app.py
CHANGED
@@ -13,7 +13,6 @@ language_id_lookup = {
     "Arabic" : "ar",
     "English" : "en",
     "Chinese" : "zh",
-    "German" : "de",
     "Spanish" : "es",
     "Russian" : "ru",
     "French" : "fr",
@@ -33,6 +32,18 @@ os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/acl202
 os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/emnlp2020/mrasp/pretrain/dataset/codes.bpe.32000")
 
 
+# load tts
+os.system("apt-get install espeak -y")
+tts_model_name = {
+    "ar": "facebook/tts_transformer-ar-cv7",
+    "en": "facebook/tts_transformer-en-200_speaker-cv4",
+    "zh": "facebook/tts_transformer-zh-cv7_css10",
+    "es": "facebook/tts_transformer-es-css10",
+    "ru": "facebook/tts_transformer-ru-cv7_css10",
+    "fr": "facebook/tts_transformer-fr-cv7_css10"
+}
+
+
 # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
 # which means they are user inputted. They are specified in gr.inputs[] block at the bottom. The
 # gr.outputs[] block will specify the output type.
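
An aside on this table: it is only consulted inside predict() further down, so every request reloads a checkpoint from the hub. A minimal sketch of a cached loader (a hypothetical helper, not part of this commit; it reuses the same fairseq calls the diff adds below):

# Hypothetical caching helper -- not in this commit. It wraps the same
# fairseq hub calls the diff adds inside predict(), so each checkpoint is
# downloaded and initialized only once per target language.
from functools import lru_cache

from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface

@lru_cache(maxsize=None)
def get_tts(lang_code):
    models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
        tts_model_name[lang_code],  # lookup table from the hunk above
        arg_overrides={"vocoder": "hifigan", "fp16": True},
    )
    model = models[0]
    TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
    generator = task.build_generator([model], cfg)
    return model, task, generator
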
@@ -64,12 +75,15 @@ def predict(audio, src_language, tgt_language, mic_audio=None):
     # Runs the audio through the whisper model and gets the DecodingResult object, which has the features:
     # audio_features (Tensor), language, language_probs, tokens, text, avg_logprob, no_speech_prob, temperature, compression_ratio
 
-
+    # asr
+    options = whisper.DecodingOptions(fp16 = True, language = src_language)
     result = whisper.decode(model, mel, options)
     if src_language is None:
         src_language = result.language
 
     transcript = result.text
+
+    # mt
     with open("input." + src_language, 'w') as w:
         w.write(result.text)
     with open("input." + tgt_language, 'w') as w:
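
For context, the `model` and `mel` this hunk uses are built earlier in predict(). A minimal sketch of the standard openai-whisper chain that produces them (the checkpoint name and file path are placeholders, not taken from this commit):

# Standard openai-whisper decoding chain, per the library's README; the app
# builds `model` and `mel` this way before the hunk above runs.
import whisper

model = whisper.load_model("base")            # checkpoint name assumed
audio = whisper.load_audio("speech.wav")      # placeholder path
audio = whisper.pad_or_trim(audio)            # fit the 30-second context window
mel = whisper.log_mel_spectrogram(audio).to(model.device)

options = whisper.DecodingOptions(fp16=True, language="en")
result = whisper.decode(model, mel, options)
print(result.text)
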
@@ -100,8 +114,21 @@ def predict(audio, src_language, tgt_language, mic_audio=None):
     with open("output", 'r') as r:
         translation = (' '.join(r.readline().split(' ')[3:])).strip()
 
+    # tts
+    from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
+    from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
+    tts_models, tts_cfg, tts_task = load_model_ensemble_and_task_from_hf_hub(
+        tts_model_name[tgt_language],
+        arg_overrides={"vocoder": "hifigan", "fp16": True}
+    )
+    tts_model = tts_models[0]
+    TTSHubInterface.update_cfg_with_data_cfg(tts_cfg, tts_task.data_cfg)
+    tts_generator = tts_task.build_generator(tts_model, tts_cfg)
+    tts_sample = TTSHubInterface.get_model_input(tts_task, translation)
+    wav, rate = TTSHubInterface.get_prediction(tts_task, tts_model, tts_generator, tts_sample)
+
     # Returns the text
-    return transcript, translation
+    return transcript, translation, wav
 
 
 
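
Two details in this hunk are worth checking against the snippet fairseq publishes on these model cards: that example passes the ensemble as a list, task.build_generator([model], cfg), where this commit passes tts_model directly; and get_prediction hands back a 1-D waveform tensor plus an integer sample rate. A consumer that needs NumPy audio would convert explicitly, e.g. (assuming wav and rate are exactly what get_prediction returned above):

# Hedged sketch, not part of the commit: convert the torch waveform into the
# (sample_rate, samples) pair most NumPy consumers (including Gradio) expect.
audio_out = (rate, wav.cpu().numpy())
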
@@ -139,6 +166,7 @@ gr.Interface(
     outputs=[
         gr.Text(label="Transcript"),
         gr.Text(label="Translation"),
+        gr.outputs.Audio(type="numpy", label="Translation Speech")
     ],
     title=title,
     description=description,
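A compatibility note on the new output slot: gr.outputs.Audio(type="numpy") expects the wrapped function to return a (sample_rate, numpy_array) pair in this position, while the predict() change above returns the raw wav tensor, so the (rate, wav.cpu().numpy()) conversion sketched after the previous hunk would likely be needed here.
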
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ sacrebleu
 sacremoses
 kytea
 six
-
+phonemizer
+sentencepiece