owaski committed
Commit 566d6f4 · 1 Parent(s): b90393d

new dependency

Files changed (2):
  1. app.py +31 -3
  2. requirements.txt +2 -1
app.py CHANGED
@@ -13,7 +13,6 @@ language_id_lookup = {
     "Arabic"  : "ar",
     "English" : "en",
     "Chinese" : "zh",
-    "German"  : "de",
     "Spanish" : "es",
     "Russian" : "ru",
     "French"  : "fr",
@@ -33,6 +32,18 @@ os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/acl202
 os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/emnlp2020/mrasp/pretrain/dataset/codes.bpe.32000")
 
 
+# load tts
+os.system("apt-get install espeak -y")
+tts_model_name = {
+    "ar": "facebook/tts_transformer-ar-cv7",
+    "en": "facebook/tts_transformer-en-200_speaker-cv4",
+    "zh": "facebook/tts_transformer-zh-cv7_css10",
+    "es": "facebook/tts_transformer-es-css10",
+    "ru": "facebook/tts_transformer-ru-cv7_css10",
+    "fr": "facebook/tts_transformer-fr-cv7_css10"
+}
+
+
 # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
 # which means they are user inputted. They are specified in gr.inputs[] block at the bottom. The
 # gr.outputs[] block will specify the output type.
@@ -64,12 +75,15 @@ def predict(audio, src_language, tgt_language, mic_audio=None):
     # Runs the audio through the whisper model and gets the DecodingResult object, which has the features:
     # audio_features (Tensor), language, language_probs, tokens, text, avg_logprob, no_speech_prob, temperature, compression_ratio
 
-    options = whisper.DecodingOptions(fp16 = False, language = src_language)
+    # asr
+    options = whisper.DecodingOptions(fp16 = True, language = src_language)
     result = whisper.decode(model, mel, options)
     if src_language is None:
         src_language = result.language
 
     transcript = result.text
+
+    # mt
     with open("input." + src_language, 'w') as w:
         w.write(result.text)
     with open("input." + tgt_language, 'w') as w:
@@ -100,8 +114,21 @@ def predict(audio, src_language, tgt_language, mic_audio=None):
     with open("output", 'r') as r:
         translation = (' '.join(r.readline().split(' ')[3:])).strip()
 
+    # tts
+    from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
+    from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
+    tts_models, tts_cfg, tts_task = load_model_ensemble_and_task_from_hf_hub(
+        tts_model_name[tgt_language],
+        arg_overrides={"vocoder": "hifigan", "fp16": True}
+    )
+    tts_model = tts_models[0]
+    TTSHubInterface.update_cfg_with_data_cfg(tts_cfg, tts_task.data_cfg)
+    tts_generator = tts_task.build_generator(tts_model, tts_cfg)
+    tts_sample = TTSHubInterface.get_model_input(tts_task, translation)
+    wav, rate = TTSHubInterface.get_prediction(tts_task, tts_model, tts_generator, tts_sample)
+
     # Returns the text
-    return transcript, translation
+    return transcript, translation, wav
 
 
 
@@ -139,6 +166,7 @@ gr.Interface(
     outputs=[
         gr.Text(label="Transcript"),
         gr.Text(label="Translation"),
+        gr.outputs.Audio(type="numpy", label="Translation Speech")
     ],
     title=title,
     description=description,
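
For reference, the TTS path this commit adds follows fairseq's text-to-speech hub interface. Below is a minimal standalone sketch of that flow; the model name, input text, and the print at the end are illustrative, not part of the commit:

# Standalone sketch of the fairseq TTS flow used in the diff above.
# Assumes fairseq is installed with its TTS dependencies; the model name
# and input text here are examples, not taken from the commit.
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface

models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
    "facebook/tts_transformer-en-200_speaker-cv4",
    arg_overrides={"vocoder": "hifigan", "fp16": False},  # fp16=False for CPU
)
model = models[0]
TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
generator = task.build_generator(model, cfg)

sample = TTSHubInterface.get_model_input(task, "Hello, this is a test.")
wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
print(wav.shape, rate)  # waveform tensor and its sample rate

One caveat worth noting: gr.outputs.Audio(type="numpy") conventionally expects a (sample_rate, waveform_array) pair, so the predict function may need to return rate alongside wav for playback to work as intended.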
requirements.txt CHANGED
@@ -6,4 +6,5 @@ sacrebleu
 sacremoses
 kytea
 six
-TTS
+phonemizer
+sentencepiece
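
The dropped TTS package is presumably no longer needed once synthesis goes through the fairseq checkpoints above; phonemizer handles grapheme-to-phoneme conversion (backed by the espeak binary that app.py now installs via apt-get), and sentencepiece likely covers the subword tokenization those checkpoints expect. A quick sanity check that the new dependencies resolve (the text and language here are examples):

# Verify the new requirements import and that phonemizer can reach the
# espeak backend installed by app.py; text/language are illustrative.
import sentencepiece  # noqa: F401
from phonemizer import phonemize

print(phonemize("hello world", language="en-us", backend="espeak"))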