ml-en-stt-model

Running

File size: 3,747 Bytes

d44849f

# Locations (relative to the current working directory) of the Indic NLP
# library checkout and of its resources directory.
INDIC_NLP_LIB_HOME = "indic_nlp_library"
INDIC_NLP_RESOURCES = "indic_nlp_resources"
import sys

# NOTE(review): this import runs before INDIC_NLP_LIB_HOME is appended to
# sys.path below, so it only succeeds if `indicnlp` is already importable
# some other way (e.g. installed as a package) -- confirm this is intended.
# The name `transliterate` is also shadowed by the `transliterate` parameter
# inside postprocess().
from indicnlp import transliterate

# Make the library checkout importable.
sys.path.append(r"{}".format(INDIC_NLP_LIB_HOME))
from indicnlp import common

# The resources path is set before loader.load() is called; keep this order.
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader

loader.load()
# Only MosesDetokenizer is referenced in the visible part of this file; the
# other sacremoses names and defaultdict appear unused here.
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer
from sacremoses import MosesDetokenizer
from collections import defaultdict

# postprocess() uses indic_detokenize and unicode_transliterate; the
# remaining indicnlp imports appear unused in the visible part of this file.
import indicnlp
from indicnlp.tokenize import indic_tokenize
from indicnlp.tokenize import indic_detokenize
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate

# flores_codes maps the `lang` argument of postprocess() to the language code
# that is compared against "en" and passed to the indicnlp helpers.
from flores_codes_map_indic import flores_codes
import sentencepiece as spm

import re

# Shared English detokenizer, built once at import time and reused by
# postprocess() for every English output sentence.
en_detok = MosesDetokenizer(lang="en")


def postprocess(
    infname: str,
    outfname: str,
    input_size: int,
    lang: str,
    transliterate: bool = False,
    spm_model_path: str = None,
):
    """
    Postprocess the output of a machine translation model in the following order:
        - parse fairseq interactive output (lines starting with "H-")
        - decode the sentence piece tokens of each hypothesis
        - convert script back to native Indic script (in case of Indic languages)
        - detokenize

    One line is written to ``outfname`` per input sentence, in sentence-id
    order. Sentence ids that have no "H-" line in ``infname`` are processed
    as an empty hypothesis.

    Args:
        infname (str): path to the input file containing the machine translation output.
        outfname (str): path to the output file where the postprocessed output will be written.
        input_size (int): number of sentences in the input file.
        lang (str): language code of the output language (a key of ``flores_codes``).
        transliterate (bool, optional): for non-English output, whether to run the
            hypothesis through the Indic transliterator with "hi" as the source
            language and the output language as the target (default: False).
        spm_model_path (str): path of the sentence piece model.

    Raises:
        ValueError: if ``spm_model_path`` is None, or an "H-" line has no
            tab-separated score field.
        IndexError: if an "H-" line carries a sentence id outside
            ``range(input_size)``.
        KeyError: if ``lang`` is not a key of ``flores_codes``.
    """
    if spm_model_path is None:
        raise ValueError("Please provide sentence piece model path for decoding")

    sp = spm.SentencePieceProcessor(model_file=spm_model_path)

    iso_lang = flores_codes[lang]

    # One slot per input sentence; slots without a hypothesis line stay empty.
    hypotheses = [""] * input_size

    with open(infname, "r", encoding="utf-8") as infile:
        for line_no, line in enumerate(infile, start=1):
            if not line.startswith("H-"):
                continue

            # Expected layout: "H-<sid>\t<score>\t<hypothesis>".
            fields = line.strip().split("\t")
            if len(fields) < 2:
                raise ValueError(
                    "Malformed hypothesis line {} in {}: {!r}".format(
                        line_no, infname, line
                    )
                )

            sid = int(fields[0].split("-")[1])
            if not 0 <= sid < input_size:
                raise IndexError(
                    "Sentence id {} on line {} of {} is outside the expected "
                    "range [0, {})".format(sid, line_no, infname, input_size)
                )

            # An empty hypothesis loses its trailing tab to strip(), leaving
            # only two fields; treat that as an empty string instead of
            # failing on a missing third field.
            hypotheses[sid] = fields[2] if len(fields) > 2 else ""

    decoded = [sp.decode(hyp.split(" ")) for hyp in hypotheses]

    if iso_lang == "en":
        with open(outfname, "w", encoding="utf-8") as outfile:
            for sent in decoded:
                outfile.write(en_detok.detokenize(sent.split(" ")) + "\n")
    else:
        xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        with open(outfname, "w", encoding="utf-8") as outfile:
            for sent in decoded:
                if transliterate:
                    # "hi" is the source language, iso_lang the target.
                    sent = xliterator.transliterate(sent, "hi", iso_lang)
                outstr = indic_detokenize.trivial_detokenize(sent, iso_lang)
                outfile.write(outstr + "\n")


if __name__ == "__main__":
    # Command-line entry point:
    #   <script> infname outfname input_size lang transliterate spm_model_path
    if len(sys.argv) < 7:
        sys.exit(
            "Usage: {} <infname> <outfname> <input_size> <lang> "
            "<transliterate> <spm_model_path>".format(sys.argv[0])
        )

    infname = sys.argv[1]
    outfname = sys.argv[2]
    input_size = int(sys.argv[3])
    lang = sys.argv[4]
    # argv values are strings, and any non-empty string (including "false")
    # is truthy. Map the explicit negative spellings to False; every other
    # value keeps enabling transliteration, as it did before.
    transliterate = sys.argv[5].strip().lower() not in (
        "",
        "false",
        "0",
        "no",
        "n",
        "off",
    )
    spm_model_path = sys.argv[6]

    postprocess(infname, outfname, input_size, lang, transliterate, spm_model_path)