add mRASP2
app.py
CHANGED
@@ -19,12 +19,24 @@ language_id_lookup = {
     "French" : "fr",
 }

+# load mRASP2
+os.system("git clone https://github.com/PANXiao1994/mRASP2.git")
+os.system('mv -n mRASP2/* ./')
+os.system("rm -rf mRASP2")
+os.system("pip install -r requirements.txt")
+os.system("git clone https://github.com/pytorch/fairseq")
+os.system("pip install --editable ./fairseq/")
+
+model_name = "6e6d_no_mono.pt"
+os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/acl2021/mrasp2/" + model_name)
+os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/acl2021/mrasp2/bpe_vocab")
+os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/emnlp2020/mrasp/pretrain/dataset/codes.bpe.32000")


 # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
 # which means they are user inputted. They are specified in gr.inputs[] block at the bottom. The
 # gr.outputs[] block will specify the output type.
-def predict(audio, language, mic_audio=None):
+def predict(audio, src_language, tgt_language, mic_audio=None):

     # checks if mic_audio is used, otherwise feeds model uploaded audio
     if mic_audio is not None:
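Note on the setup block above: every os.system call runs unconditionally each time the Space starts, so the repositories, the checkpoint and the BPE files are fetched again on every restart. Below is a minimal sketch of how the downloads could be skipped when the files are already on disk; the filenames and URLs are the ones from the commit, but the guard itself and the use of subprocess are assumptions, not part of app.py.

import os
import subprocess

# Filenames and URLs exactly as in the commit above.
FILES = {
    "6e6d_no_mono.pt": "https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/acl2021/mrasp2/6e6d_no_mono.pt",
    "bpe_vocab": "https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/acl2021/mrasp2/bpe_vocab",
    "codes.bpe.32000": "https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/emnlp2020/mrasp/pretrain/dataset/codes.bpe.32000",
}

for fname, url in FILES.items():
    if not os.path.exists(fname):
        # Shell out the same way app.py does, but fail loudly and
        # only re-download files that are missing.
        subprocess.run(f"wget {url}", shell=True, check=True)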
@@ -43,22 +55,51 @@ def predict(audio, language, mic_audio=None):

     # if model is supposed to detect language, set outLanguage to None
     # otherwise set to specified language
-    if(
-    [old line not recoverable from this render]
+    if(src_language == "Detect Language"):
+        src_language = None
     else:
-    [old line not recoverable from this render]
+        src_language = language_id_lookup[src_language.split()[0]]

     # Runs the audio through the whisper model and gets the DecodingResult object, which has the features:
     # audio_features (Tensor), language, language_probs, tokens, text, avg_logprob, no_speech_prob, temperature, compression_ratio

-    options = whisper.DecodingOptions(fp16 = False, language =
+    options = whisper.DecodingOptions(fp16 = False, language = src_language)
     result = whisper.decode(model, mel, options)
-    [six old lines not recoverable from this render]
+    if src_language is None:
+        src_language = result.language
+
+    with open("input." + src_language, 'w') as w:
+        w.write(result.text)
+    with open("input." + tgt_language, 'w') as w:
+        w.write('LANG_TOK_' + src_language.upper())
+
+    os.system("python fairseq/fairseq_cli/preprocess.py --dataset-impl raw \
+        --srcdict bpe_vocab --tgtdict bpe_vocab --testpref input -s {} -t {}".format( \
+        src_language, tgt_language))
+
+    os.system("python fairseq/fairseq_cli/interactive.py /mnt/data2/siqiouyang/demo/mRASP2/data-bin \
+        --user-dir mcolt \
+        -s zh \
+        -t en \
+        --skip-invalid-size-inputs-valid-test \
+        --path {} \
+        --max-tokens 1024 \
+        --task translation_w_langtok \
+        --lang-prefix-tok \"LANG_TOK_{}\" \
+        --max-source-positions 1024 \
+        --max-target-positions 1024 \
+        --nbest 1 \
+        --bpe subword_nmt \
+        --bpe-codes codes.bpe.32000 \
+        --post-process --tokenizer moses \
+        --input input.{} | grep -E '[D]-[0-9]+' > output".format(
+        model_name, tgt_language.upper(), src_language))
+
+    with open("output", 'r') as r:
+        translation = (' '.join(r.readline().split(' ')[3:])).strip()
+
+    # Returns the text
+    return translation



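The last step of the new predict() reads the first grep-matched hypothesis line from the file "output" and keeps everything after the first three space-separated fields. A self-contained illustration of just that string slicing follows; the sample line is invented for the example, and the real fields emitted by fairseq may differ.

# Invented sample line, only to show what split(' ')[3:] keeps.
sample = "D-0 -0.41 LANG_TOK_FR Bonjour tout le monde"
translation = (' '.join(sample.split(' ')[3:])).strip()
print(translation)   # -> "Bonjour tout le monde"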
@@ -75,14 +116,20 @@ gr.Interface(
     fn=predict,
     inputs=[
         gr.Audio(label="Upload Speech", source="upload", type="filepath"),
-        gr.inputs.Dropdown(['Arabic
-                            'Chinese
-                            'English
-        [five more old lines not recoverable from this render]
+        gr.inputs.Dropdown(['Arabic',
+                            'Chinese',
+                            'English',
+                            'Spanish',
+                            'Russian',
+                            'French',
+                            'Detect Language'], type="value", default='English', label="Select the language of input"),
+        gr.inputs.Dropdown(['Arabic',
+                            'Chinese',
+                            'English',
+                            'Spanish',
+                            'Russian',
+                            'French',
+                            'Detect Language'], type="value", default='English', label="Select the language of output"),
         gr.Audio(label="Record Speech", source="microphone", type="filepath"),
     ],
     # To change to output audio, replace the outputs line with
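The dropdown values added above are full language names such as 'French'. Inside predict(), the source language is reduced to a Whisper language code through the language_id_lookup dict defined near the top of app.py; only its "French": "fr" entry is visible in this diff's context lines. A tiny, self-contained illustration of that mapping, with a one-entry stand-in for the real dict:

# Stand-in for the real language_id_lookup; only the "French" entry
# appears in the context lines of this diff.
language_id_lookup = {"French": "fr"}

label = "French"                                # value coming from the dropdown
code = language_id_lookup[label.split()[0]]     # "fr", later passed to whisper.DecodingOptions
print(code)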
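For completeness, here is a hypothetical way to exercise the new predict() outside the Gradio UI, assuming app.py has already been executed, that a local file "sample.wav" exists, and that the argument strings mirror the dropdown choices; none of this is part of the commit.

# Hypothetical smoke test; "sample.wav" and the argument values are assumptions.
result_text = predict("sample.wav", "English", "French")
print(result_text)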