new dependency
- app.py +31 -3
- requirements.txt +2 -1
app.py
CHANGED
@@ -13,7 +13,6 @@ language_id_lookup = {
     "Arabic" : "ar",
     "English" : "en",
     "Chinese" : "zh",
-    "German" : "de",
     "Spanish" : "es",
     "Russian" : "ru",
     "French" : "fr",
@@ -33,6 +32,18 @@ os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/acl202
 os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/emnlp2020/mrasp/pretrain/dataset/codes.bpe.32000")
 
 
+# load tts
+os.system("apt-get install espeak -y")
+tts_model_name = {
+    "ar": "facebook/tts_transformer-ar-cv7",
+    "en": "facebook/tts_transformer-en-200_speaker-cv4",
+    "zh": "facebook/tts_transformer-zh-cv7_css10",
+    "es": "facebook/tts_transformer-es-css10",
+    "ru": "facebook/tts_transformer-ru-cv7_css10",
+    "fr": "facebook/tts_transformer-fr-cv7_css10"
+}
+
+
 # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
 # which means they are user inputted. They are specified in gr.inputs[] block at the bottom. The
 # gr.outputs[] block will specify the output type.
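
An aside on this table: it is only consulted inside predict() further down, so every request reloads a checkpoint from the hub. A minimal sketch of a cached loader (a hypothetical helper, not part of this commit; it reuses the same fairseq calls the diff adds below):

# Hypothetical caching helper -- not in this commit. It wraps the same
# fairseq hub calls the diff adds inside predict(), so each checkpoint is
# downloaded and initialized only once per target language.
from functools import lru_cache

from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface

@lru_cache(maxsize=None)
def get_tts(lang_code):
    models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
        tts_model_name[lang_code],  # lookup table from the hunk above
        arg_overrides={"vocoder": "hifigan", "fp16": True},
    )
    model = models[0]
    TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
    generator = task.build_generator([model], cfg)
    return model, task, generator
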
@@ -64,12 +75,15 @@ def predict(audio, src_language, tgt_language, mic_audio=None):
     # Runs the audio through the whisper model and gets the DecodingResult object, which has the features:
     # audio_features (Tensor), language, language_probs, tokens, text, avg_logprob, no_speech_prob, temperature, compression_ratio
 
-
+    # asr
+    options = whisper.DecodingOptions(fp16 = True, language = src_language)
     result = whisper.decode(model, mel, options)
     if src_language is None:
         src_language = result.language
 
     transcript = result.text
+
+    # mt
     with open("input." + src_language, 'w') as w:
         w.write(result.text)
     with open("input." + tgt_language, 'w') as w:
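
For context, the `model` and `mel` this hunk uses are built earlier in predict(). A minimal sketch of the standard openai-whisper chain that produces them (the checkpoint name and file path are placeholders, not taken from this commit):

# Standard openai-whisper decoding chain, per the library's README; the app
# builds `model` and `mel` this way before the hunk above runs.
import whisper

model = whisper.load_model("base")            # checkpoint name assumed
audio = whisper.load_audio("speech.wav")      # placeholder path
audio = whisper.pad_or_trim(audio)            # fit the 30-second context window
mel = whisper.log_mel_spectrogram(audio).to(model.device)

options = whisper.DecodingOptions(fp16=True, language="en")
result = whisper.decode(model, mel, options)
print(result.text)
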
@@ -100,8 +114,21 @@ def predict(audio, src_language, tgt_language, mic_audio=None):
     with open("output", 'r') as r:
         translation = (' '.join(r.readline().split(' ')[3:])).strip()
 
+    # tts
+    from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
+    from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
+    tts_models, tts_cfg, tts_task = load_model_ensemble_and_task_from_hf_hub(
+        tts_model_name[tgt_language],
+        arg_overrides={"vocoder": "hifigan", "fp16": True}
+    )
+    tts_model = tts_models[0]
+    TTSHubInterface.update_cfg_with_data_cfg(tts_cfg, tts_task.data_cfg)
+    tts_generator = tts_task.build_generator(tts_model, tts_cfg)
+    tts_sample = TTSHubInterface.get_model_input(tts_task, translation)
+    wav, rate = TTSHubInterface.get_prediction(tts_task, tts_model, tts_generator, tts_sample)
+
     # Returns the text
-    return transcript, translation
+    return transcript, translation, wav
 
 
 
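
Two details in this hunk are worth checking against the snippet fairseq publishes on these model cards: that example passes the ensemble as a list, task.build_generator([model], cfg), where this commit passes tts_model directly; and get_prediction hands back a 1-D waveform tensor plus an integer sample rate. A consumer that needs NumPy audio would convert explicitly, e.g. (assuming wav and rate are exactly what get_prediction returned above):

# Hedged sketch, not part of the commit: convert the torch waveform into the
# (sample_rate, samples) pair most NumPy consumers (including Gradio) expect.
audio_out = (rate, wav.cpu().numpy())
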
@@ -139,6 +166,7 @@ gr.Interface(
     outputs=[
         gr.Text(label="Transcript"),
         gr.Text(label="Translation"),
+        gr.outputs.Audio(type="numpy", label="Translation Speech")
     ],
     title=title,
     description=description,
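A compatibility note on the new output slot: gr.outputs.Audio(type="numpy") expects the wrapped function to return a (sample_rate, numpy_array) pair in this position, while the predict() change above returns the raw wav tensor, so the (rate, wav.cpu().numpy()) conversion sketched after the previous hunk would likely be needed here.
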
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ sacrebleu
 sacremoses
 kytea
 six
-
+phonemizer
+sentencepiece