Spaces:

owaski-demo
/

Demo

Sleeping

App Files Community

owaski committed on Feb 27, 2023

Commit

ce0d186

1 Parent(s): 8d8f4e3

use Toucan tts

Browse files

Files changed (1) hide show

app.py +20 -42

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 # imports
 import os
 os.system("pip install git+https://github.com/openai/whisper.git")
 import gradio as gr
 import whisper
@@ -35,22 +36,23 @@ os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/emnlp2
 # load tts
 os.system("apt-get install espeak -y")
-tts_model_name = {
-    "ar": "facebook/tts_transformer-ar-cv7",
-    "en": "facebook/tts_transformer-en-ljspeech",
-    "zh": "facebook/tts_transformer-zh-cv7_css10",
-    "es": "facebook/tts_transformer-es-css10",
-    "ru": "facebook/tts_transformer-ru-cv7_css10",
-    "fr": "facebook/tts_transformer-fr-cv7_css10"
-}
-os.system("git clone https://github.com/Kyubyong/g2pC.git")
-os.system("cd g2pC; sed -i 's/pkuseg/spacy_pkuseg/g' setup.py; \
-          sed -i 's/import pkuseg/import spacy_pkuseg as pkuseg/g' g2pc/g2pc.py; \
-          sed -i 's/package_data={/# package_data={/g' setup.py; \
-          pip install ./; cd ..")
 # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
 # which means they are user inputted. They are specified in gr.inputs[] block at the bottom. The
@@ -123,35 +125,11 @@ def predict(audio, src_language, tgt_language, mic_audio=None):
         translation = (' '.join(r.readline().split(' ')[3:])).strip()
     # tts
-    from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
-    from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
-    tts_models, tts_cfg, tts_task = load_model_ensemble_and_task_from_hf_hub(
-        tts_model_name[tgt_language],
-        arg_overrides={"vocoder": "hifigan", "fp16": False}
-    )
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    tts_model = tts_models[0].to(device)
-    TTSHubInterface.update_cfg_with_data_cfg(tts_cfg, tts_task.data_cfg)
-    tts_generator = tts_task.build_generator(tts_models, tts_cfg)
-    tts_sample = TTSHubInterface.get_model_input(tts_task, translation)
-    tts_sample = {
-        'net_input': {
-            'src_tokens': tts_sample['net_input']['src_tokens'].to(device),
-            'src_lengths': tts_sample['net_input']['src_lengths'].to(device),
-            'prev_output_tokens': None,
-        },
-        'target_lengths': None,
-        'speaker': tts_sample['speaker'].to(device) if tts_sample['speaker'] is not None else None,
-    }
-    wav, rate = TTSHubInterface.get_prediction(tts_task, tts_model, tts_generator, tts_sample)
-    wav = wav.cpu().numpy()
     # Returns the text
-    return transcript, translation, (rate, wav)
@@ -175,8 +153,8 @@ gr.Interface(
                             'Russian',
                             'French',
                             'Detect Language'], type="value", default='English', label="Select the language of input"),
-        gr.inputs.Dropdown(['Arabic',
-                            'Chinese',
                             'English',
                             'Spanish',
                             'Russian',

 # imports
 import os
+import sys
 os.system("pip install git+https://github.com/openai/whisper.git")
 import gradio as gr
 import whisper
 # load tts
 os.system("apt-get install espeak -y")
+# os.system("git clone https://github.com/Kyubyong/g2pC.git")
+# os.system("cd g2pC; sed -i 's/pkuseg/spacy_pkuseg/g' setup.py; \
+#           sed -i 's/import pkuseg/import spacy_pkuseg as pkuseg/g' g2pc/g2pc.py; \
+#           sed -i 's/package_data={/# package_data={/g' setup.py; \
+#           pip install ./; cd ..")
+os.system("git clone https://github.com/DigitalPhonetics/IMS-Toucan.git")
+sys.path.append('./IMS-Toucan')
+os.system("cd IMS-Toucan; pip install --no-cache-dir -r requirements.txt")
+os.system("python run_model_downloader.py; cd ..")
+from InferenceInterfaces.PortaSpeechInterface import PortaSpeechInterface
+cwd = os.getcwd()
+os.chdir('./IMS-Toucan')
+tts = PortaSpeechInterface(device='cpu', tts_model_path='Meta')
+os.chdir(cwd)
 # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
 # which means they are user inputted. They are specified in gr.inputs[] block at the bottom. The
         translation = (' '.join(r.readline().split(' ')[3:])).strip()
     # tts
+    tts.set_language(tgt_language)
+    tts.read_to_file(text_list=[translation], file_location='output.wav')
     # Returns the text
+    return transcript, translation, 'output.wav'
                             'Russian',
                             'French',
                             'Detect Language'], type="value", default='English', label="Select the language of input"),
+        gr.inputs.Dropdown([# 'Arabic',
+                            # 'Chinese',
                             'English',
                             'Spanish',
                             'Russian',