owaski committed
Commit 766a3e3 · 1 Parent(s): 843cbe3

add mRASP2

Files changed (1): app.py +66 -19
app.py CHANGED
@@ -19,12 +19,24 @@ language_id_lookup = {
     "French"  : "fr",
     }
 
+# load mRASP2
+os.system("git clone https://github.com/PANXiao1994/mRASP2.git")
+os.system('mv -n mRASP2/* ./')
+os.system("rm -rf mRASP2")
+os.system("pip install -r requirements.txt")
+os.system("git clone https://github.com/pytorch/fairseq")
+os.system("pip install --editable ./fairseq/")
+
+model_name = "6e6d_no_mono.pt"
+os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/acl2021/mrasp2/" + model_name)
+os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/acl2021/mrasp2/bpe_vocab")
+os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/emnlp2020/mrasp/pretrain/dataset/codes.bpe.32000")
 
 
 # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
 # which means they are user inputted. They are specified in gr.inputs[] block at the bottom. The
 # gr.outputs[] block will specify the output type.
-def predict(audio, language, mic_audio=None):
+def predict(audio, src_language, tgt_language, mic_audio=None):
 
     # checks if mic_audio is used, otherwise feeds model uploaded audio
     if mic_audio is not None:
@@ -43,22 +55,51 @@ def predict(audio, language, mic_audio=None):
 
     # if model is supposed to detect language, set outLanguage to None
     # otherwise set to specified language
-    if(language == "Detect Language"):
-        outLanguage = None
+    if(src_language == "Detect Language"):
+        src_language = None
     else:
-        outLanguage = language_id_lookup[language.split()[0]]
+        src_language = language_id_lookup[src_language.split()[0]]
 
     # Runs the audio through the whisper model and gets the DecodingResult object, which has the features:
     # audio_features (Tensor), language, language_probs, tokens, text, avg_logprob, no_speech_prob, temperature, compression_ratio
 
-    options = whisper.DecodingOptions(fp16 = False, language = outLanguage)
+    options = whisper.DecodingOptions(fp16 = False, language = src_language)
     result = whisper.decode(model, mel, options)
-    outLanguage = result.language
-
-
-
-    # Returns the text and the language
-    return result.text, outLanguage
+    if src_language is None:
+        src_language = result.language
+
+    with open("input." + src_language, 'w') as w:
+        w.write(result.text)
+    with open("input." + tgt_language, 'w') as w:
+        w.write('LANG_TOK_' + src_language.upper())
+
+    os.system("python fairseq/fairseq_cli/preprocess.py --dataset-impl raw \
+        --srcdict bpe_vocab --tgtdict bpe_vocab --testpref input -s {} -t {}".format( \
+        src_language, tgt_language))
+
+    os.system("python fairseq/fairseq_cli/interactive.py /mnt/data2/siqiouyang/demo/mRASP2/data-bin \
+        --user-dir mcolt \
+        -s zh \
+        -t en \
+        --skip-invalid-size-inputs-valid-test \
+        --path {} \
+        --max-tokens 1024 \
+        --task translation_w_langtok \
+        --lang-prefix-tok \"LANG_TOK_{}\" \
+        --max-source-positions 1024 \
+        --max-target-positions 1024 \
+        --nbest 1 \
+        --bpe subword_nmt \
+        --bpe-codes codes.bpe.32000 \
+        --post-process --tokenizer moses \
+        --input input.{} | grep -E '[D]-[0-9]+' > output".format(
+        model_name, tgt_language.upper(), src_language))
+
+    with open("output", 'r') as r:
+        translation = (' '.join(r.readline().split(' ')[3:])).strip()
+
+    # Returns the text
+    return translation
 
 
 
@@ -75,14 +116,20 @@ gr.Interface(
     fn=predict,
     inputs=[
         gr.Audio(label="Upload Speech", source="upload", type="filepath"),
-        gr.inputs.Dropdown(['Arabic Text',
-                            'Chinese Text',
-                            'English Text',
-                            'German Text',
-                            'Spanish Text',
-                            'Russian Text',
-                            'French Text',
-                            'Detect Language'], type="value", default='English Text', label="Select the Language of the that you are speaking in."),
+        gr.inputs.Dropdown(['Arabic',
+                            'Chinese',
+                            'English',
+                            'Spanish',
+                            'Russian',
+                            'French',
+                            'Detect Language'], type="value", default='English', label="Select the language of input"),
+        gr.inputs.Dropdown(['Arabic',
+                            'Chinese',
+                            'English',
+                            'Spanish',
+                            'Russian',
+                            'French',
+                            'Detect Language'], type="value", default='English', label="Select the language of output"),
         gr.Audio(label="Record Speech", source="microphone", type="filepath"),
     ],
     # To change to output audio, replace the outputs line with
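
Note: fairseq's interactive.py prints one D-<id> line per best hypothesis, which the grep in the diff captures into the "output" file. A minimal sketch of the parsing step that follows, assuming (as the split(' ')[3:] in the diff implies) that each captured line carries three space-separated header fields (hypothesis id, score, and the LANG_TOK_* tag added by --lang-prefix-tok) before the translated text; the sample line below is hypothetical, not taken from a real run:

def parse_translation(line):
    # Drop the three leading fields; the remainder is the translation.
    return ' '.join(line.split(' ')[3:]).strip()

sample = "D-0 -0.41 LANG_TOK_EN Hello world"   # hypothetical D-line
assert parse_translation(sample) == "Hello world"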