Spaces:
Sleeping
Sleeping
add tts
Browse files- app.py +28 -5
- requirements.txt +2 -1
app.py
CHANGED
@@ -7,6 +7,7 @@ import whisper
|
|
7 |
# The model we are using for ASR. Options are small, medium, large and large-v2 (large and large-v2 don't fit on the Hugging Face CPU).
|
8 |
model = whisper.load_model("small")
|
9 |
|
|
|
10 |
|
11 |
# A table to look up all the languages
|
12 |
language_id_lookup = {
|
@@ -43,6 +44,13 @@ tts_model_name = {
|
|
43 |
"fr": "facebook/tts_transformer-fr-cv7_css10"
|
44 |
}
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
# The predict function. audio, src_language, tgt_language and mic_audio are all parameters directly passed by gradio
|
48 |
# which means they are user inputted. They are specified in gr.inputs[] block at the bottom. The
|
@@ -119,16 +127,31 @@ def predict(audio, src_language, tgt_language, mic_audio=None):
|
|
119 |
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
|
120 |
tts_models, tts_cfg, tts_task = load_model_ensemble_and_task_from_hf_hub(
|
121 |
tts_model_name[tgt_language],
|
122 |
-
arg_overrides={"vocoder": "hifigan", "fp16":
|
123 |
)
|
124 |
-
|
|
|
125 |
TTSHubInterface.update_cfg_with_data_cfg(tts_cfg, tts_task.data_cfg)
|
126 |
-
tts_generator = tts_task.build_generator(
|
127 |
tts_sample = TTSHubInterface.get_model_input(tts_task, translation)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
wav, rate = TTSHubInterface.get_prediction(tts_task, tts_model, tts_generator, tts_sample)
|
|
|
129 |
|
130 |
# Returns the transcript, the translation and the synthesized waveform
|
131 |
-
return transcript, translation, wav
|
132 |
|
133 |
|
134 |
|
@@ -166,7 +189,7 @@ gr.Interface(
|
|
166 |
outputs=[
|
167 |
gr.Text(label="Transcript"),
|
168 |
gr.Text(label="Translation"),
|
169 |
-
gr.
|
170 |
],
|
171 |
title=title,
|
172 |
description=description,
|
|
|
7 |
# The model we are using for ASR. Options are small, medium, large and large-v2 (large and large-v2 don't fit on the Hugging Face CPU).
|
8 |
model = whisper.load_model("small")
|
9 |
|
10 |
+
import torch
|
11 |
|
12 |
# A table to look up all the languages
|
13 |
language_id_lookup = {
|
|
|
44 |
"fr": "facebook/tts_transformer-fr-cv7_css10"
|
45 |
}
|
46 |
|
47 |
+
os.system("git clone https://github.com/Kyubyong/g2pC.git")
|
48 |
+
os.system("cd g2pC; sed -i 's/pkuseg/spacy_pkuseg/g' setup.py; \
|
49 |
+
sed -i 's/import pkuseg/import spacy_pkuseg as pkuseg/g' g2pc/g2pc.py; \
|
50 |
+
sed -i 's/package_data={/# package_data={/g' setup.py; \
|
51 |
+
pip install ./; cd ..")
|
52 |
+
|
53 |
+
|
54 |
|
55 |
# The predict function. audio, src_language, tgt_language and mic_audio are all parameters directly passed by gradio
|
56 |
# which means they are user inputted. They are specified in gr.inputs[] block at the bottom. The
|
|
|
127 |
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
|
128 |
tts_models, tts_cfg, tts_task = load_model_ensemble_and_task_from_hf_hub(
|
129 |
tts_model_name[tgt_language],
|
130 |
+
arg_overrides={"vocoder": "hifigan", "fp16": False}
|
131 |
)
|
132 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
133 |
+
tts_model = tts_models[0].to(device)
|
134 |
TTSHubInterface.update_cfg_with_data_cfg(tts_cfg, tts_task.data_cfg)
|
135 |
+
tts_generator = tts_task.build_generator(tts_models, tts_cfg)
|
136 |
tts_sample = TTSHubInterface.get_model_input(tts_task, translation)
|
137 |
+
|
138 |
+
|
139 |
+
|
140 |
+
tts_sample = {
|
141 |
+
'net_input': {
|
142 |
+
'src_tokens': tts_sample['net_input']['src_tokens'].to(device),
|
143 |
+
'src_lengths': tts_sample['net_input']['src_lengths'].to(device),
|
144 |
+
'prev_output_tokens': None,
|
145 |
+
},
|
146 |
+
'target_lengths': None,
|
147 |
+
'speaker': tts_sample['speaker'].to(device)
|
148 |
+
}
|
149 |
+
|
150 |
wav, rate = TTSHubInterface.get_prediction(tts_task, tts_model, tts_generator, tts_sample)
|
151 |
+
wav = wav.cpu().numpy()
|
152 |
|
153 |
# Returns the transcript, the translation, and the synthesized speech as a (rate, wav) tuple
|
154 |
+
return transcript, translation, (rate, wav)
|
155 |
|
156 |
|
157 |
|
|
|
189 |
outputs=[
|
190 |
gr.Text(label="Transcript"),
|
191 |
gr.Text(label="Translation"),
|
192 |
+
gr.Audio(label="Translation Speech")
|
193 |
],
|
194 |
title=title,
|
195 |
description=description,
|
requirements.txt
CHANGED
@@ -7,4 +7,5 @@ sacremoses
|
|
7 |
kytea
|
8 |
six
|
9 |
phonemizer
|
10 |
-
sentencepiece
|
|
|
|
7 |
kytea
|
8 |
six
|
9 |
phonemizer
|
10 |
+
sentencepiece
|
11 |
+
hanziconv
|