owaski committed on
Commit
87d6e1d
·
1 Parent(s): 566d6f4
Files changed (2) hide show
  1. app.py +28 -5
  2. requirements.txt +2 -1
app.py CHANGED
@@ -7,6 +7,7 @@ import whisper
7
  # the model we are using for ASR, options are small, medium, large and largev2 (large and largev2 don't fit on huggingface cpu)
8
  model = whisper.load_model("small")
9
 
 
10
 
11
  # A table to look up all the languages
12
  language_id_lookup = {
@@ -43,6 +44,13 @@ tts_model_name = {
43
  "fr": "facebook/tts_transformer-fr-cv7_css10"
44
  }
45
 
 
 
 
 
 
 
 
46
 
47
  # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
48
  # which means they are user inputted. They are specified in gr.inputs[] block at the bottom. The
@@ -119,16 +127,31 @@ def predict(audio, src_language, tgt_language, mic_audio=None):
119
  from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
120
  tts_models, tts_cfg, tts_task = load_model_ensemble_and_task_from_hf_hub(
121
  tts_model_name[tgt_language],
122
- arg_overrides={"vocoder": "hifigan", "fp16": True}
123
  )
124
- tts_model = tts_models[0]
 
125
  TTSHubInterface.update_cfg_with_data_cfg(tts_cfg, tts_task.data_cfg)
126
- tts_generator = tts_task.build_generator(tts_model, tts_cfg)
127
  tts_sample = TTSHubInterface.get_model_input(tts_task, translation)
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  wav, rate = TTSHubInterface.get_prediction(tts_task, tts_model, tts_generator, tts_sample)
 
129
 
130
  # Returns the text
131
- return transcript, translation, wav
132
 
133
 
134
 
@@ -166,7 +189,7 @@ gr.Interface(
166
  outputs=[
167
  gr.Text(label="Transcript"),
168
  gr.Text(label="Translation"),
169
- gr.outputs.Audio(type="numpy", label="Translation Speech")
170
  ],
171
  title=title,
172
  description=description,
 
7
  # the model we are using for ASR, options are small, medium, large and largev2 (large and largev2 don't fit on huggingface cpu)
8
  model = whisper.load_model("small")
9
 
10
+ import torch
11
 
12
  # A table to look up all the languages
13
  language_id_lookup = {
 
44
  "fr": "facebook/tts_transformer-fr-cv7_css10"
45
  }
46
 
47
+ os.system("git clone https://github.com/Kyubyong/g2pC.git")
48
+ os.system("cd g2pC; sed -i 's/pkuseg/spacy_pkuseg/g' setup.py; \
49
+ sed -i 's/import pkuseg/import spacy_pkuseg as pkuseg/g' g2pc/g2pc.py; \
50
+ sed -i 's/package_data={/# package_data={/g' setup.py; \
51
+ pip install ./; cd ..")
52
+
53
+
54
 
55
  # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
56
  # which means they are user inputted. They are specified in gr.inputs[] block at the bottom. The
 
127
  from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
128
  tts_models, tts_cfg, tts_task = load_model_ensemble_and_task_from_hf_hub(
129
  tts_model_name[tgt_language],
130
+ arg_overrides={"vocoder": "hifigan", "fp16": False}
131
  )
132
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
133
+ tts_model = tts_models[0].to(device)
134
  TTSHubInterface.update_cfg_with_data_cfg(tts_cfg, tts_task.data_cfg)
135
+ tts_generator = tts_task.build_generator(tts_models, tts_cfg)
136
  tts_sample = TTSHubInterface.get_model_input(tts_task, translation)
137
+
138
+
139
+
140
+ tts_sample = {
141
+ 'net_input': {
142
+ 'src_tokens': tts_sample['net_input']['src_tokens'].to(device),
143
+ 'src_lengths': tts_sample['net_input']['src_lengths'].to(device),
144
+ 'prev_output_tokens': None,
145
+ },
146
+ 'target_lengths': None,
147
+ 'speaker': tts_sample['speaker'].to(device)
148
+ }
149
+
150
  wav, rate = TTSHubInterface.get_prediction(tts_task, tts_model, tts_generator, tts_sample)
151
+ wav = wav.cpu().numpy()
152
 
153
  # Returns the text
154
+ return transcript, translation, (rate, wav)
155
 
156
 
157
 
 
189
  outputs=[
190
  gr.Text(label="Transcript"),
191
  gr.Text(label="Translation"),
192
+ gr.Audio(label="Translation Speech")
193
  ],
194
  title=title,
195
  description=description,
requirements.txt CHANGED
@@ -7,4 +7,5 @@ sacremoses
7
  kytea
8
  six
9
  phonemizer
10
- sentencepiece
 
 
7
  kytea
8
  six
9
  phonemizer
10
+ sentencepiece
11
+ hanziconv