INDIC_NLP_LIB_HOME = "indic_nlp_library"
INDIC_NLP_RESOURCES = "indic_nlp_resources"

import sys

# Make the Indic NLP library importable and point it at its resource files
# before loading it.
sys.path.append(r"{}".format(INDIC_NLP_LIB_HOME))
from indicnlp import common

common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader

loader.load()

from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer
from sacremoses import MosesDetokenizer
from collections import defaultdict

import indicnlp
from indicnlp.tokenize import indic_tokenize
from indicnlp.tokenize import indic_detokenize
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate

from flores_codes_map_indic import flores_codes

import sentencepiece as spm
import re

en_detok = MosesDetokenizer(lang="en")


def postprocess(
    infname: str,
    outfname: str,
    input_size: int,
    lang: str,
    transliterate: bool = False,
    spm_model_path: str = None,
):
    """
    Postprocess the output of a machine translation model in the following order:
        - parse fairseq-interactive output
        - decode the sentencepiece segmentation
        - convert the script back to the native Indic script (for Indic languages)
        - detokenize

    Args:
        infname (str): path to the input file containing the machine translation output.
        outfname (str): path to the output file where the postprocessed output will be written.
        input_size (int): number of sentences in the input file.
        lang (str): language code of the output language.
        transliterate (bool, optional): whether to transliterate the output text from
            Devanagari back to the native script of `lang` (default: False).
        spm_model_path (str): path to the sentencepiece model used for decoding.
    """
    if spm_model_path is None:
        raise Exception("Please provide sentence piece model path for decoding")

    sp = spm.SentencePieceProcessor(model_file=spm_model_path)
    iso_lang = flores_codes[lang]

    # Pre-fill one (sentence_id, score, hypothesis) slot per input sentence so that
    # any sentence missing from the fairseq log still produces an (empty) output line.
    consolidated_testoutput = [(x, 0.0, "") for x in range(input_size)]

    with open(infname, "r", encoding="utf-8") as infile:
        # fairseq-interactive hypothesis lines look like "H-<id>\t<score>\t<tokens>".
        temp_testoutput = list(
            map(
                lambda x: x.strip().split("\t"),
                filter(lambda x: x.startswith("H-"), infile),
            )
        )
    temp_testoutput = list(
        map(lambda x: (int(x[0].split("-")[1]), float(x[1]), x[2]), temp_testoutput)
    )

    # Place each hypothesis at its original sentence index, keep only the text,
    # and undo the sentencepiece segmentation.
    for sid, score, hyp in temp_testoutput:
        consolidated_testoutput[sid] = (sid, score, hyp)
    consolidated_testoutput = [x[2] for x in consolidated_testoutput]
    consolidated_testoutput = [sp.decode(x.split(" ")) for x in consolidated_testoutput]

    if iso_lang == "en":
        # English output only needs Moses detokenization.
        with open(outfname, "w", encoding="utf-8") as outfile:
            for sent in consolidated_testoutput:
                outfile.write(en_detok.detokenize(sent.split(" ")) + "\n")
    else:
        # Indic output: optionally transliterate from Devanagari ("hi") back to the
        # target script, then detokenize with the Indic NLP detokenizer.
        xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        with open(outfname, "w", encoding="utf-8") as outfile:
            for sent in consolidated_testoutput:
                if transliterate:
                    outstr = indic_detokenize.trivial_detokenize(
                        xliterator.transliterate(sent, "hi", iso_lang), iso_lang
                    )
                else:
                    outstr = indic_detokenize.trivial_detokenize(sent, iso_lang)
                outfile.write(outstr + "\n")


if __name__ == "__main__":
    infname = sys.argv[1]
    outfname = sys.argv[2]
    input_size = int(sys.argv[3])
    lang = sys.argv[4]
    # argv values are strings; interpret the transliteration flag explicitly so that
    # passing "false" does not silently enable transliteration.
    transliterate = sys.argv[5].lower() in ("true", "1", "yes")
    spm_model_path = sys.argv[6]

    postprocess(infname, outfname, input_size, lang, transliterate, spm_model_path)
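
# Example invocation (a minimal sketch): the script takes six positional arguments --
# fairseq-interactive log, output file, sentence count, target language code,
# transliteration flag, and sentencepiece model path. The script name, file names,
# and paths below are hypothetical placeholders, not part of the original pipeline:
#
#   python postprocess_translate.py test.log test.hi 1000 hin_Deva true vocab/model.TGT.model
#
# The third argument must match the number of source sentences fed to
# fairseq-interactive, and the fifth toggles transliteration from Devanagari back
# to the target language's native script.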