Spaces:
Sleeping
Sleeping
use Toucan tts
Browse files
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
# imports
|
2 |
import os
|
|
|
3 |
os.system("pip install git+https://github.com/openai/whisper.git")
|
4 |
import gradio as gr
|
5 |
import whisper
|
@@ -35,22 +36,23 @@ os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/emnlp2
|
|
35 |
|
36 |
# load tts
|
37 |
os.system("apt-get install espeak -y")
|
38 |
-
tts_model_name = {
|
39 |
-
"ar": "facebook/tts_transformer-ar-cv7",
|
40 |
-
"en": "facebook/tts_transformer-en-ljspeech",
|
41 |
-
"zh": "facebook/tts_transformer-zh-cv7_css10",
|
42 |
-
"es": "facebook/tts_transformer-es-css10",
|
43 |
-
"ru": "facebook/tts_transformer-ru-cv7_css10",
|
44 |
-
"fr": "facebook/tts_transformer-fr-cv7_css10"
|
45 |
-
}
|
46 |
|
47 |
-
os.system("git clone https://github.com/Kyubyong/g2pC.git")
|
48 |
-
os.system("cd g2pC; sed -i 's/pkuseg/spacy_pkuseg/g' setup.py; \
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
|
|
|
|
|
|
|
|
|
53 |
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
# The predict function. audio, language and mic_audio are all parameters directly passed by gradio
|
56 |
# which means they are user-supplied. They are specified in the gr.inputs[] block at the bottom. The
|
@@ -123,35 +125,11 @@ def predict(audio, src_language, tgt_language, mic_audio=None):
|
|
123 |
translation = (' '.join(r.readline().split(' ')[3:])).strip()
|
124 |
|
125 |
# tts
|
126 |
-
|
127 |
-
|
128 |
-
tts_models, tts_cfg, tts_task = load_model_ensemble_and_task_from_hf_hub(
|
129 |
-
tts_model_name[tgt_language],
|
130 |
-
arg_overrides={"vocoder": "hifigan", "fp16": False}
|
131 |
-
)
|
132 |
-
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
133 |
-
tts_model = tts_models[0].to(device)
|
134 |
-
TTSHubInterface.update_cfg_with_data_cfg(tts_cfg, tts_task.data_cfg)
|
135 |
-
tts_generator = tts_task.build_generator(tts_models, tts_cfg)
|
136 |
-
tts_sample = TTSHubInterface.get_model_input(tts_task, translation)
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
tts_sample = {
|
141 |
-
'net_input': {
|
142 |
-
'src_tokens': tts_sample['net_input']['src_tokens'].to(device),
|
143 |
-
'src_lengths': tts_sample['net_input']['src_lengths'].to(device),
|
144 |
-
'prev_output_tokens': None,
|
145 |
-
},
|
146 |
-
'target_lengths': None,
|
147 |
-
'speaker': tts_sample['speaker'].to(device) if tts_sample['speaker'] is not None else None,
|
148 |
-
}
|
149 |
-
|
150 |
-
wav, rate = TTSHubInterface.get_prediction(tts_task, tts_model, tts_generator, tts_sample)
|
151 |
-
wav = wav.cpu().numpy()
|
152 |
|
153 |
# Returns the text
|
154 |
-
return transcript, translation,
|
155 |
|
156 |
|
157 |
|
@@ -175,8 +153,8 @@ gr.Interface(
|
|
175 |
'Russian',
|
176 |
'French',
|
177 |
'Detect Language'], type="value", default='English', label="Select the language of input"),
|
178 |
-
gr.inputs.Dropdown(['Arabic',
|
179 |
-
'Chinese',
|
180 |
'English',
|
181 |
'Spanish',
|
182 |
'Russian',
|
|
|
1 |
# imports
|
2 |
import os
|
3 |
+
import sys
|
4 |
os.system("pip install git+https://github.com/openai/whisper.git")
|
5 |
import gradio as gr
|
6 |
import whisper
|
|
|
36 |
|
37 |
# load tts
|
38 |
os.system("apt-get install espeak -y")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
+
# os.system("git clone https://github.com/Kyubyong/g2pC.git")
|
41 |
+
# os.system("cd g2pC; sed -i 's/pkuseg/spacy_pkuseg/g' setup.py; \
|
42 |
+
# sed -i 's/import pkuseg/import spacy_pkuseg as pkuseg/g' g2pc/g2pc.py; \
|
43 |
+
# sed -i 's/package_data={/# package_data={/g' setup.py; \
|
44 |
+
# pip install ./; cd ..")
|
45 |
|
46 |
+
os.system("git clone https://github.com/DigitalPhonetics/IMS-Toucan.git")
|
47 |
+
# Register the cloned repo by absolute path: a relative sys.path entry is resolved
# against the current working directory, which changes at the os.chdir() call below.
sys.path.append(os.path.abspath('./IMS-Toucan'))
|
48 |
+
os.system("cd IMS-Toucan; pip install --no-cache-dir -r requirements.txt")
|
49 |
+
# os.system() spawns a fresh shell for every call, so the `cd` from the previous
# command does not carry over; enter the repo here so run_model_downloader.py is found.
os.system("cd IMS-Toucan && python run_model_downloader.py")
|
50 |
|
51 |
+
from InferenceInterfaces.PortaSpeechInterface import PortaSpeechInterface
|
52 |
+
cwd = os.getcwd()
|
53 |
+
os.chdir('./IMS-Toucan')
|
54 |
+
tts = PortaSpeechInterface(device='cpu', tts_model_path='Meta')
|
55 |
+
os.chdir(cwd)
|
56 |
|
57 |
# The predict function. audio, language and mic_audio are all parameters directly passed by gradio
|
58 |
# which means they are user-supplied. They are specified in the gr.inputs[] block at the bottom. The
|
|
|
125 |
translation = (' '.join(r.readline().split(' ')[3:])).strip()
|
126 |
|
127 |
# tts
|
128 |
+
tts.set_language(tgt_language)
|
129 |
+
tts.read_to_file(text_list=[translation], file_location='output.wav')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
|
131 |
# Returns the text
|
132 |
+
return transcript, translation, 'output.wav'
|
133 |
|
134 |
|
135 |
|
|
|
153 |
'Russian',
|
154 |
'French',
|
155 |
'Detect Language'], type="value", default='English', label="Select the language of input"),
|
156 |
+
gr.inputs.Dropdown([# 'Arabic',
|
157 |
+
# 'Chinese',
|
158 |
'English',
|
159 |
'Spanish',
|
160 |
'Russian',
|