# Hugging Face Space page status at capture time: build error.
# -*- coding: utf-8 -*-
"""
@Author : Rong Ye
@Time : May 2022
@Contact : yerong@bytedance
@Description: Gradio live demo for ConST, an end-to-end speech translator
              from English speech to eight European languages.
"""
import os
import shlex
import shutil

import gradio as gr
import torchaudio
import yaml
from huggingface_hub import snapshot_download
# Display name offered in the UI dropdown -> two-letter target-language code
# that is spliced into vocabulary and checkpoint file names.
LANGUAGE_CODES = dict(
    German="de",
    Spanish="es",
    French="fr",
    Italian="it",
    Netherlands="nl",
    Portuguese="pt",
    Romanian="ro",
    Russian="ru",
)
# Per-target-language generation settings, keyed by language code. Every
# language uses beam=10 and lenpen=0.7 except Russian, which uses lenpen=0.1.
# The comprehension creates a separate dict object for each language.
LANG_GEN_SETUPS = {
    code: {"beam": 10, "lenpen": 0.1 if code == "ru" else 0.7}
    for code in ("de", "es", "fr", "it", "nl", "pt", "ro", "ru")
}
# Environment setup executed at import time, in this exact order: clone the
# ConST sources, move them into the working directory, install and build the
# package, then create the working folders.
for _setup_cmd in (
    "git clone https://github.com/ReneeYe/ConST",
    'mv ConST/* ./',
    "python3 setup.py install",
    "python3 setup.py build_ext --inplace",
    "mkdir -p data checkpoint",
):
    os.system(_setup_cmd)

# Fetch the model repository snapshot and remember where it landed locally;
# the helper functions below read vocabularies and checkpoints from this path.
huggingface_model_dir = snapshot_download(repo_id="ReneeYe/ConST_en2x_models")
print(huggingface_model_dir)
def convert_audio_to_16k_wav(audio_input):
    """Stage the uploaded audio under ``data/`` and report its frame count.

    Despite the name, no resampling or format conversion happens here: the
    file is copied unchanged.

    Args:
        audio_input: object whose ``name`` attribute is the path of the
            audio file.

    Returns:
        A ``(path, num_frames)`` tuple: the ``data/<file name>`` copy and the
        frame count reported by ``torchaudio.info`` for the source file.
    """
    source_path = audio_input.name
    frame_count = torchaudio.info(source_path).num_frames
    # Keep only the last "/"-separated component as the staged file name.
    staged_path = f'data/{source_path.split("/")[-1]}'
    shutil.copy(source_path, staged_path)
    return staged_path, frame_count
def prepare_tsv(file_name, n_frame, language, task="ST"):
    """Write the one-row manifest ``data/test_case.tsv`` for a single clip.

    The target and source text columns are filled with placeholder sentences.

    Args:
        file_name: value written to the ``audio`` column.
        n_frame: value written to the ``n_frames`` column.
        language: key of ``LANGUAGE_CODES`` selecting the target language.
        task: accepted but not used by this function.
    """
    tgt_lang = LANGUAGE_CODES[language]
    manifest_lines = (
        "id\taudio\tn_frames\ttgt_text\tspeaker\tsrc_lang\ttgt_lang\tsrc_text\n",
        f"sample\t{file_name}\t{n_frame}\tThis is in {tgt_lang}.\tspk.1\ten\t{tgt_lang}\tThis is English.\n",
    )
    with open("data/test_case.tsv", "w") as manifest:
        manifest.writelines(manifest_lines)
def get_vocab_and_yaml(language):
    """Stage the SentencePiece vocabulary and write ``data/config.yaml``.

    Copies ``spm_en<code>.model`` and ``spm_en<code>.txt`` from the downloaded
    snapshot into ``./data``, then dumps a YAML config that starts from the
    language's entry in ``LANG_GEN_SETUPS`` and adds the audio and tokenizer
    settings.

    Args:
        language: key of ``LANGUAGE_CODES`` (for example ``"German"``).

    Raises:
        KeyError: if *language* is not a key of ``LANGUAGE_CODES``.
    """
    tgt_lang = LANGUAGE_CODES[language]

    # Copy spm_en<code>.model and spm_en<code>.txt into data/.
    for extension in ("model", "txt"):
        shutil.copy(
            os.path.join(huggingface_model_dir, f"vocabulary/spm_en{tgt_lang}.{extension}"),
            "./data",
        )

    # Write the yaml file.
    abs_path = os.getcwd()
    # Look up by the language *code* held in tgt_lang (the literal string
    # "tgt_lang" is not a key and raised KeyError), and work on a copy so the
    # module-level defaults are not polluted by the entries added below.
    yaml_dict = dict(LANG_GEN_SETUPS[tgt_lang])
    yaml_dict.update(
        {
            "input_channels": 1,
            "use_audio_input": True,
            "prepend_tgt_lang_tag": True,
            "prepend_src_lang_tag": True,
            "audio_root": os.path.join(abs_path, "data"),
            "vocab_filename": f"spm_en{tgt_lang}.txt",
            "bpe_tokenizer": {
                "bpe": "sentencepiece",
                "sentencepiece_model": os.path.join(abs_path, f"data/spm_en{tgt_lang}.model"),
            },
        }
    )
    with open("data/config.yaml", "w") as f:
        yaml.dump(yaml_dict, f)
def get_model(language):
    """Return the checkpoint path for translating English into *language*.

    The path points at ``models/const_en<code>.pt`` inside the downloaded
    snapshot directory; nothing is downloaded by this function.
    """
    checkpoint_name = f"models/const_en{LANGUAGE_CODES[language]}.pt"
    return os.path.join(huggingface_model_dir, checkpoint_name)
def generate(model_path):
    """Run ``fairseq-generate`` on the ``test_case`` subset and extract its output.

    The command's output is tee'd into ``temp.txt``. Lines of that file that
    start with ``D`` are then sorted numerically on the second ``-``-separated
    field, and their third tab-separated column is returned.

    Args:
        model_path: checkpoint path handed to ``--path``.

    Returns:
        The extracted text with leading and trailing whitespace stripped.
    """
    command = (
        "fairseq-generate data/ --gen-subset test_case --task speech_to_text --prefix-size 1 "
        "--max-tokens 4000000 --max-source-positions 4000000 "
        # Quote the path so spaces or shell metacharacters in it cannot split
        # or alter the command line.
        f"--config-yaml config.yaml --path {shlex.quote(str(model_path))} | tee temp.txt"
    )
    os.system(command)
    # Use the pipe as a context manager so it is closed (and the child
    # process waited for) on every call instead of leaking one per request.
    with os.popen("grep ^D temp.txt | sort -n -k 2 -t '-' | cut -f 3") as pipe:
        return pipe.read().strip()
def remove_temp_files():
    """Delete ``temp.txt`` and ``data/test_case.tsv``.

    Raises:
        FileNotFoundError: from ``os.remove`` if a file is already missing.
    """
    for leftover in ("temp.txt", "data/test_case.tsv"):
        os.remove(leftover)
def run(audio_file, language):
    """Translate one recorded clip into *language*; the Gradio handler.

    Steps, in order: stage the audio, write the manifest, stage the
    vocabulary and config, resolve the checkpoint, run generation, and
    remove the temporary files.

    Args:
        audio_file: uploaded file object (its ``name`` attribute is the path).
        language: key of ``LANGUAGE_CODES`` chosen in the dropdown.

    Returns:
        The text produced by ``generate``.
    """
    staged_audio, frame_count = convert_audio_to_16k_wav(audio_file)
    prepare_tsv(staged_audio, frame_count, language)
    get_vocab_and_yaml(language)
    translation = generate(get_model(language))
    remove_temp_files()
    return translation
def greet(audio_file, language):
    """Print the uploaded file's path and return a greeting for *language*.

    Not wired into the interface defined in this file.
    """
    print(audio_file.name)
    greeting = f"Hello {language}!!"
    return greeting
# Input widgets: a microphone recording handed to the handler as a file
# object, and the target-language selector.
inputs = [
    gr.inputs.Audio(
        source="microphone",
        type="file",
        label="Record something (in English)...",
    ),
    gr.inputs.Dropdown(
        list(LANGUAGE_CODES),
        default="German",
        label="From English to Languages X...",
    ),
]

# Bundled example clips, each paired with a target language.
_examples = [['case1.wav', "German"], ['case2.wav', "German"], ['case3.wav', "German"]]

# Footer text shown with the demo (HTML is passed through as written).
_article = (
    "ConST is an end-to-end speech translation model (see paper <a href='https://arxiv.org/abs/2205.02444', target='_blank'>here</a>). "
    "Its motivation is to use contrastive learning method to learn similar representations for semantically similar speech and text."
)

# Flagging and analytics options are not passed, so the library defaults apply.
iface = gr.Interface(
    fn=run,
    inputs=inputs,
    outputs=[gr.outputs.Textbox(label="The translation")],
    examples=_examples,
    title="ConST: an end-to-end speech translator",
    description="End-to-end Speech Translation Live Demo for English to eight European languages.",
    article=_article,
    theme="seafoam",
    layout='vertical',
    enable_queue=True,
)

iface.launch(inline=False)