Spaces:
Running
Running
import os | |
import sys | |
import glob | |
from tqdm import tqdm | |
from google.cloud import translate | |
# Expects a json file containing the API credentials.
# The file is looked up as "api_key.json" in the directory of this script and
# its path is exported through the GOOGLE_APPLICATION_CREDENTIALS variable.
os.environ.update(
    GOOGLE_APPLICATION_CREDENTIALS=os.path.join(
        os.path.dirname(__file__), r"api_key.json"
    )
)
# Lookup table from the "<lang>_<Script>" tags used in directory/file names to
# the language codes sent to the translation API. The "mni_Mtei" value keeps
# its underscore here; translate_text rewrites it to "mni-Mtei" before the
# request is made.
flores_to_iso = dict(
    asm_Beng="as",
    ben_Beng="bn",
    doi_Deva="doi",
    eng_Latn="en",
    gom_Deva="gom",
    guj_Gujr="gu",
    hin_Deva="hi",
    kan_Knda="kn",
    mai_Deva="mai",
    mal_Mlym="ml",
    mar_Deva="mr",
    mni_Mtei="mni_Mtei",
    npi_Deva="ne",
    ory_Orya="or",
    pan_Guru="pa",
    san_Deva="sa",
    sat_Olck="sat",
    snd_Arab="sd",
    tam_Taml="ta",
    tel_Telu="te",
    urd_Arab="ur",
)
# Lazily created Cloud Translation client shared by all translate_text calls,
# so that a new client is not constructed for every single sentence.
_translation_client = None


def _get_translation_client():
    """Return the shared TranslationServiceClient, creating it on first use."""
    global _translation_client
    if _translation_client is None:
        _translation_client = translate.TranslationServiceClient()
    return _translation_client


# Copy the project id from the json file containing API credentials
def translate_text(text, src_lang, tgt_lang, project_id="project_id"):
    """Translate a single piece of plain text with the Cloud Translation API.

    Args:
        text: The text to translate; sent as the only element of "contents".
        src_lang: Source language tag; must be a key of ``flores_to_iso``.
        tgt_lang: Target language tag; must be a key of ``flores_to_iso``.
        project_id: Google Cloud project id used to build the request parent.
            The default is a placeholder and has to be replaced by the id
            found in the credentials json file.

    Returns:
        The ``translated_text`` of every translation in the response,
        concatenated in order into one string.

    Raises:
        KeyError: If ``src_lang`` or ``tgt_lang`` is not in ``flores_to_iso``.
    """
    src_code = flores_to_iso[src_lang]
    tgt_code = flores_to_iso[tgt_lang]

    # The table stores this code with an underscore, while the request is
    # made with the hyphenated form.
    if src_code == "mni_Mtei":
        src_code = "mni-Mtei"
    if tgt_code == "mni_Mtei":
        tgt_code = "mni-Mtei"

    client = _get_translation_client()
    location = "global"
    parent = f"projects/{project_id}/locations/{location}"

    response = client.translate_text(
        request={
            "parent": parent,
            "contents": [text],
            "mime_type": "text/plain",  # mime types: text/plain, text/html
            "source_language_code": src_code,
            "target_language_code": tgt_code,
        }
    )

    return "".join(
        translation.translated_text for translation in response.translations
    )
# Codes (values of flores_to_iso) for which translations are produced. A set is
# used so that membership is an exact match rather than a substring test; for
# every value currently in flores_to_iso both tests give the same answer.
_TRANSLATED_CODES = frozenset(
    "as bn doi gom gu hi kn mai ml mni_Mtei mr ne or pa sa sd ta te ur".split()
)


def _read_sentences(path):
    """Return the lines of the UTF-8 text file at *path*, each one stripped.

    Completely empty lines are dropped; a line holding only whitespace is kept
    and becomes an empty string.
    """
    with open(path, "r", encoding="utf-8") as infile:
        return [line.strip() for line in infile.read().split("\n") if line]


def _translate_file(in_path, out_path, src_lang, tgt_lang):
    """Translate *in_path* sentence by sentence and write the result to *out_path*.

    Nothing is done when the input file is missing or the output file already
    exists, so existing predictions are never recomputed or overwritten.
    """
    if not os.path.exists(in_path) or os.path.exists(out_path):
        return
    sentences = _read_sentences(in_path)
    translations = [
        translate_text(sentence, src_lang, tgt_lang).strip()
        for sentence in tqdm(sentences)
    ]
    # The file is only written once every sentence has been translated.
    with open(out_path, "w", encoding="utf-8") as outfile:
        outfile.write("\n".join(translations))


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} ROOT_DIR")
    root_dir = sys.argv[1]

    for pair in sorted(glob.glob(os.path.join(root_dir, "*"))):
        print(pair)

        # Entries are expected to be named "<src>-<tgt>"; anything else that
        # happens to live in root_dir is skipped instead of crashing the run.
        parts = os.path.basename(pair).split("-")
        if len(parts) != 2:
            continue
        src_lang, tgt_lang = parts
        if src_lang not in flores_to_iso or tgt_lang not in flores_to_iso:
            continue

        # The pair is filtered on its non-English side (the target when the
        # source is English, otherwise the source).
        lang = flores_to_iso[tgt_lang if src_lang == "eng_Latn" else src_lang]
        if lang not in _TRANSLATED_CODES:
            continue

        print(f"{src_lang} - {tgt_lang}")

        # source to target translations
        _translate_file(
            os.path.join(pair, f"test.{src_lang}"),
            os.path.join(pair, f"test.{tgt_lang}.pred.google"),
            src_lang,
            tgt_lang,
        )

        # target to source translations
        _translate_file(
            os.path.join(pair, f"test.{tgt_lang}"),
            os.path.join(pair, f"test.{src_lang}.pred.google"),
            tgt_lang,
            src_lang,
        )