chinmaydan committed on
Commit b7a34b6
1 Parent(s): 5f50e60

Removed ConST

Files changed (1)
  1. app.py +2 -133
app.py CHANGED
@@ -93,9 +93,6 @@ os.system("pip install git+https://github.com/openai/whisper.git")
 #os.system("mkdir -p data checkpoint")


-huggingface_model_dir = snapshot_download(repo_id="ReneeYe/ConST_en2x_models")
-print(huggingface_model_dir)
-


 def restrict_src_options(model_type):
@@ -225,141 +222,13 @@ def predictWithmRASP2(input_audio, src_language, tgt_language):
     translation = (' '.join(translation.split(' ')[1:])).strip()

     mt_time = time.time() - mt_start_time
-    print(f"Took {mt_time} to do Machine Translation")
-    #print(model_name)

-    #with open("output", 'r') as r:
-    #    translation = "Undefined"
-    #    translation = (' '.join(r.readline().split(' ')[1:])).strip()
-    #    print(translation)

     # Returns the text
-    print("returning transcript: " + transcript + " and the translation: " + translation)
     return transcript, translation



-# Helper methods for ConST (as written in https://huggingface.co/spaces/ReneeYe/ConST-speech2text-translator/blob/main/app.py)
-
-
-def convert_audio_to_16k_wav(audio_input):
-    sound = AudioSegment.from_file(audio_input)
-    sample_rate = sound.frame_rate
-    num_channels = sound.channels
-    num_frames = int(sound.frame_count())
-    filename = audio_input.split("/")[-1]
-    print("original file is at:", audio_input)
-    if (num_channels > 1) or (sample_rate != 16000):  # convert to mono-channel 16k wav
-        if num_channels > 1:
-            sound = sound.set_channels(1)
-        if sample_rate != 16000:
-            sound = sound.set_frame_rate(16000)
-            num_frames = int(sound.frame_count())
-        filename = filename.replace(".wav", "") + "_16k.wav"
-        sound.export(f"data/{filename}", format="wav")
-    else:
-        shutil.copy(audio_input, f'data/{filename}')
-    return filename, num_frames
-
-
-def prepare_tsv(file_name, n_frame, language, task="ST"):
-    tgt_lang = language_id_lookup[language]
-    with open("data/test_case.tsv", "w") as f:
-        f.write("id\taudio\tn_frames\ttgt_text\tspeaker\tsrc_lang\ttgt_lang\tsrc_text\n")
-        f.write(f"sample\t{file_name}\t{n_frame}\tThis is in {tgt_lang}.\tspk.1\ten\t{tgt_lang}\tThis is English.\n")
-
-
-def get_vocab_and_yaml(language):
-    tgt_lang = language_id_lookup[language]
-    # get: spm_ende.model and spm_ende.txt, and save to data/xxx
-    # if exist, no need to download
-    shutil.copy(os.path.join(huggingface_model_dir, f"vocabulary/spm_en{tgt_lang}.model"), "./data")
-    shutil.copy(os.path.join(huggingface_model_dir, f"vocabulary/spm_en{tgt_lang}.txt"), "./data")
-
-    # write yaml file
-    abs_path = os.popen("pwd").read().strip()
-    yaml_dict = LANG_GEN_SETUPS[tgt_lang]
-    yaml_dict["input_channels"] = 1
-    yaml_dict["use_audio_input"] = True
-    yaml_dict["prepend_tgt_lang_tag"] = True
-    yaml_dict["prepend_src_lang_tag"] = True
-    yaml_dict["audio_root"] = os.path.join(abs_path, "data")
-    yaml_dict["vocab_filename"] = f"spm_en{tgt_lang}.txt"
-    yaml_dict["bpe_tokenizer"] = {"bpe": "sentencepiece",
-                                  "sentencepiece_model": os.path.join(abs_path, f"data/spm_en{tgt_lang}.model")}
-    with open("data/config.yaml", "w") as f:
-        yaml.dump(yaml_dict, f)
-
-
-def get_model(language):
-    # download models to checkpoint/xxx
-    return os.path.join(huggingface_model_dir, f"models/const_en{language_id_lookup[language]}.pt")
-
-
-def generate(model_path):
-    os.system(f"python3 fairseq/fairseq_cli/generate.py data/ --gen-subset test_case --task speech_to_text --prefix-size 1 \
-              --max-source-positions 4000000 \
-              --config-yaml config.yaml --path {model_path} | tee temp.txt")
-    print("No problem with 1st line")
-    output = os.popen("grep ^D temp.txt | sort -n -k 2 -t '-' | cut -f 3")
-    return output.read().strip()
-
-
-def post_processing(raw_sentence):
-    output_sentence = raw_sentence
-    if ":" in raw_sentence:
-        splited_sent = raw_sentence.split(":")
-        if len(splited_sent) == 2:
-            prefix = splited_sent[0].strip()
-            if len(prefix) <= 3:
-                output_sentence = splited_sent[1].strip()
-            elif ("(" in prefix) and (")" in prefix):
-                bgm = re.findall(r"\(.*?\)", prefix)[0]
-                if len(prefix.replace(bgm, "").strip()) <= 3:
-                    output_sentence = splited_sent[1].strip()
-            elif len(splited_sent[1].strip()) > 8:
-                output_sentence = splited_sent[1].strip()
-
-    elif ("(" in raw_sentence) and (")" in raw_sentence):
-        bgm_list = re.findall(r"\(.*?\)", raw_sentence)
-        for bgm in bgm_list:
-            if len(raw_sentence.replace(bgm, "").strip()) > 5:
-                output_sentence = output_sentence.replace(bgm, "").strip()
-        if len(output_sentence) <= 5:
-            output_sentence = raw_sentence
-    return output_sentence
-
-
-def remove_temp_files(audio_file):
-    os.remove("temp.txt")
-    os.remove("data/test_case.tsv")
-    os.remove(f"data/{audio_file}")
-
-
-
-def error_output(language):
-    return f"Fail to translate the audio into {language}, you may use the examples I provide."
-
-# Predicting the translation with ConST
-def predictWithConST(audio_file, language):
-    try:
-        converted_audio_file, n_frame = convert_audio_to_16k_wav(audio_file)
-        prepare_tsv(converted_audio_file, n_frame, language)
-        get_vocab_and_yaml(language)
-        model_path = get_model(language)
-        print("This is the model path: " + model_path)
-        generate_model_path = generate(model_path)
-        print("No problem generating model path")
-        generated_output = post_processing(generate_model_path)
-        print("No problem generating output")
-        remove_temp_files(converted_audio_file)
-        print("No problem removing_temp")
-        return generated_output
-    except:
-        traceback.print_exc()
-        return error_output(language)
-
-
 title = "Demo for Speech Translation (Whisper+mRASP2 and ConST)"

 description = """
@@ -381,7 +250,7 @@ with demo:
     gr.Markdown("###" + description)
     with gr.Row():
         with gr.Column():
-            model_type = gr.Dropdown(['Whisper+mRASP2', 'ConST'], type = "value", value = 'Whisper+mRASP2', label = "Select the model you want to use.")
+            model_type = gr.Dropdown(['Whisper+mRASP2'], type = "value", value = 'Whisper+mRASP2', label = "Select the model you want to use.")
             audio_file = gr.Audio(label="Upload Speech", source="upload", type="filepath")
             src_language = gr.Dropdown(['Arabic',
                                         'Chinese',
@@ -417,4 +286,4 @@ with demo:
     submit_button.click(fn = predict, inputs=[audio_file, src_language, tgt_language_mRASP, tgt_language_ConST, model_type, mic_audio], outputs=[transcript, translate, translated_speech])
     switch_lang_button.click(switchLang, [src_language, tgt_language_mRASP], [src_language, tgt_language_mRASP])

-demo.launch(share= True)
+demo.launch()