owaski committed on
Commit ce0d186 · 1 Parent(s): 8d8f4e3

use Toucan tts

Files changed (1): app.py (+20 -42)
app.py CHANGED
@@ -1,5 +1,6 @@
 # imports
 import os
+import sys
 os.system("pip install git+https://github.com/openai/whisper.git")
 import gradio as gr
 import whisper
@@ -35,22 +36,23 @@ os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/emnlp2
 
 # load tts
 os.system("apt-get install espeak -y")
-tts_model_name = {
-    "ar": "facebook/tts_transformer-ar-cv7",
-    "en": "facebook/tts_transformer-en-ljspeech",
-    "zh": "facebook/tts_transformer-zh-cv7_css10",
-    "es": "facebook/tts_transformer-es-css10",
-    "ru": "facebook/tts_transformer-ru-cv7_css10",
-    "fr": "facebook/tts_transformer-fr-cv7_css10"
-}
 
-os.system("git clone https://github.com/Kyubyong/g2pC.git")
-os.system("cd g2pC; sed -i 's/pkuseg/spacy_pkuseg/g' setup.py; \
-    sed -i 's/import pkuseg/import spacy_pkuseg as pkuseg/g' g2pc/g2pc.py; \
-    sed -i 's/package_data={/# package_data={/g' setup.py; \
-    pip install ./; cd ..")
+# os.system("git clone https://github.com/Kyubyong/g2pC.git")
+# os.system("cd g2pC; sed -i 's/pkuseg/spacy_pkuseg/g' setup.py; \
+#     sed -i 's/import pkuseg/import spacy_pkuseg as pkuseg/g' g2pc/g2pc.py; \
+#     sed -i 's/package_data={/# package_data={/g' setup.py; \
+#     pip install ./; cd ..")
 
+os.system("git clone https://github.com/DigitalPhonetics/IMS-Toucan.git")
+sys.path.append('./IMS-Toucan')
+os.system("cd IMS-Toucan; pip install --no-cache-dir -r requirements.txt")
+os.system("python run_model_downloader.py; cd ..")
 
+from InferenceInterfaces.PortaSpeechInterface import PortaSpeechInterface
+cwd = os.getcwd()
+os.chdir('./IMS-Toucan')
+tts = PortaSpeechInterface(device='cpu', tts_model_path='Meta')
+os.chdir(cwd)
 
 # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
 # which means they are user inputted. They are specified in gr.inputs[] block at the bottom. The
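Below is a minimal sketch (not part of the commit) of the same IMS-Toucan startup, guarded so that a restart does not re-clone or re-download, and with the device chosen at runtime instead of hard-coding 'cpu'. It assumes only the paths and PortaSpeechInterface arguments visible in the hunk above; note that each os.system() call runs in its own shell, so a cd in one call does not carry over to the next.

# Sketch only; not part of the committed app.py.
import os
import sys
import torch

if not os.path.isdir('./IMS-Toucan'):
    os.system("git clone https://github.com/DigitalPhonetics/IMS-Toucan.git")
    os.system("cd IMS-Toucan; pip install --no-cache-dir -r requirements.txt")
    # chain the cd and the downloader into one shell so the script is found
    os.system("cd IMS-Toucan; python run_model_downloader.py")

sys.path.append('./IMS-Toucan')
from InferenceInterfaces.PortaSpeechInterface import PortaSpeechInterface

cwd = os.getcwd()
os.chdir('./IMS-Toucan')  # mirror the commit: build the interface from inside the repo
tts = PortaSpeechInterface(device='cuda' if torch.cuda.is_available() else 'cpu',
                           tts_model_path='Meta')
os.chdir(cwd)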
@@ -123,35 +125,11 @@ def predict(audio, src_language, tgt_language, mic_audio=None):
     translation = (' '.join(r.readline().split(' ')[3:])).strip()
 
     # tts
-    from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
-    from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
-    tts_models, tts_cfg, tts_task = load_model_ensemble_and_task_from_hf_hub(
-        tts_model_name[tgt_language],
-        arg_overrides={"vocoder": "hifigan", "fp16": False}
-    )
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    tts_model = tts_models[0].to(device)
-    TTSHubInterface.update_cfg_with_data_cfg(tts_cfg, tts_task.data_cfg)
-    tts_generator = tts_task.build_generator(tts_models, tts_cfg)
-    tts_sample = TTSHubInterface.get_model_input(tts_task, translation)
-
-
-
-    tts_sample = {
-        'net_input': {
-            'src_tokens': tts_sample['net_input']['src_tokens'].to(device),
-            'src_lengths': tts_sample['net_input']['src_lengths'].to(device),
-            'prev_output_tokens': None,
-        },
-        'target_lengths': None,
-        'speaker': tts_sample['speaker'].to(device) if tts_sample['speaker'] is not None else None,
-    }
-
-    wav, rate = TTSHubInterface.get_prediction(tts_task, tts_model, tts_generator, tts_sample)
-    wav = wav.cpu().numpy()
+    tts.set_language(tgt_language)
+    tts.read_to_file(text_list=[translation], file_location='output.wav')
 
     # Returns the text
-    return transcript, translation, (rate, wav)
+    return transcript, translation, 'output.wav'
 
 
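A minimal sketch (not part of the commit) of the new synthesis path inside predict(). It assumes tgt_language has already been mapped to a short code such as 'en' (that mapping sits outside this hunk) and that the Audio output component accepts a file path rather than a (rate, array) tuple.

# Sketch only; `tts` is the module-level PortaSpeechInterface built at startup.
def synthesize(translation, tgt_language, out_path='output.wav'):
    tts.set_language(tgt_language)      # select the target language for synthesis
    tts.read_to_file(text_list=[translation], file_location=out_path)
    return out_path

# In predict(), the third return value changes from (rate, wav) to a path:
#     return transcript, translation, synthesize(translation, tgt_language)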
 
@@ -175,8 +153,8 @@ gr.Interface(
                         'Russian',
                         'French',
                         'Detect Language'], type="value", default='English', label="Select the language of input"),
-    gr.inputs.Dropdown(['Arabic',
-                        'Chinese',
+    gr.inputs.Dropdown([# 'Arabic',
+                        # 'Chinese',
                         'English',
                         'Spanish',
                         'Russian',
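Arabic and Chinese are commented out of the target-language dropdown. A hypothetical mapping from the remaining labels to the short codes that set_language() expects might look like the following; the real mapping, if any, lives outside this diff, and only the labels visible in this hunk are listed.

# Hypothetical sketch, not in the diff: dropdown label -> language code.
TARGET_LANG_CODE = {
    'English': 'en',
    'Spanish': 'es',
    'Russian': 'ru',
}
tgt_code = TARGET_LANG_CODE.get(tgt_language, 'en')  # default to English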
 