|
import sys

# Directory added to the import path for `indicnlp`, and the directory handed
# to the library as its resources path.
INDIC_NLP_LIB_HOME = "indic_nlp_library"
INDIC_NLP_RESOURCES = "indic_nlp_resources"

# Extend the import path first: the `indicnlp` imports below come after this
# call on purpose.
sys.path.append(INDIC_NLP_LIB_HOME)

from indicnlp import common

# Register the resources directory, then run the library loader.
common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader

loader.load()
|
from sacremoses import MosesPunctNormalizer |
|
from sacremoses import MosesTokenizer |
|
from sacremoses import MosesDetokenizer |
|
from collections import defaultdict |
|
|
|
from tqdm import tqdm |
|
from joblib import Parallel, delayed |
|
|
|
from indicnlp.tokenize import indic_tokenize |
|
from indicnlp.tokenize import indic_detokenize |
|
from indicnlp.normalize import indic_normalize |
|
from indicnlp.transliterate import unicode_transliterate |
|
|
|
import re |
|
from typing import Union |
|
from flores_codes_map_indic import flores_codes |
|
|
|
# Module-level English helpers, built once at import time; preprocess_line
# uses both of them for lines whose language maps to "en".
en_normalizer = MosesPunctNormalizer()
en_tok = MosesTokenizer(lang="en")
|
|
|
|
|
def preprocess_line(
    line: str,
    normalizer: Union[MosesPunctNormalizer, indic_normalize.IndicNormalizerFactory],
    lang: str,
    transliterate: bool = False,
    remove_tag: bool = True
) -> str:
    """
    Preprocess a line of text by normalizing, tokenizing, and possibly transliterating it.

    Text wrapped in do-not-translate tags (`<dnt>...</dnt>`) is put back in its
    raw, unprocessed form after tokenization. Only the text between the tags is
    restored; identical text elsewhere in the line is left as processed.

    Args:
        line (str): the line of text to preprocess.
        normalizer (Union[MosesPunctNormalizer, indic_normalize.IndicNormalizerFactory]): an object that
            performs normalization on the text. It is not used when `lang` maps to English
            (the module-level Moses normalizer is used instead), so it may be None in that case.
        lang (str): the language of the line of text; must be a key of `flores_codes`.
        transliterate (bool, optional): whether to transliterate the line of text to devanagari
            (default: False). Has no effect for English.
        remove_tag (bool, optional): whether to remove the do not translate tags (`<dnt>` and `</dnt>`)
            from the line of text (default: True).

    Returns:
        str: preprocessed line of text.

    Raises:
        KeyError: if `lang` is not present in `flores_codes`.
    """
    iso_lang = flores_codes[lang]

    # Capture the protected spans before any processing touches them.
    pattern = r"<dnt>(.*?)</dnt>"
    raw_matches = re.findall(pattern, line)

    if iso_lang == "en":
        processed_line = " ".join(
            en_tok.tokenize(en_normalizer.normalize(line.strip()), escape=False)
        )
    else:
        tokenized = " ".join(
            indic_tokenize.trivial_tokenize(normalizer.normalize(line.strip()), iso_lang)
        )
        if transliterate:
            # Convert the script to the "hi" target and re-attach any virama
            # that ended up as a separate token.
            processed_line = unicode_transliterate.UnicodeIndicTransliterator.transliterate(
                tokenized,
                iso_lang,
                "hi",
            ).replace(" ् ", "्")
        else:
            processed_line = tokenized

    # Re-join tag markers that come out of tokenization with spaces inside them.
    processed_line = processed_line.replace("< dnt >", "<dnt>")
    processed_line = processed_line.replace("< / dnt >", "</dnt>")

    # Restore the raw spans positionally: the i-th tagged span of the processed
    # line receives the i-th raw span. A global str.replace would also rewrite
    # identical text outside the tags, and would strip every space from the
    # line when an empty tag pair tokenizes to a single-space span.
    remaining_raw = iter(raw_matches)
    exhausted = object()

    def _restore_raw(match):
        raw_text = next(remaining_raw, exhausted)
        if raw_text is exhausted:
            # More tagged spans than raw captures: keep the processed text.
            return match.group(0)
        return "<dnt>" + raw_text + "</dnt>"

    processed_line = re.sub(pattern, _restore_raw, processed_line)

    if remove_tag:
        # Drop both markers, then collapse the whitespace they leave behind.
        processed_line = processed_line.replace("<dnt>", " ").replace("</dnt>", " ")
        processed_line = re.sub(r"\s+", " ", processed_line).strip()

    return processed_line
|
|
|
|
|
def preprocess(
    infname: str,
    outfname: str,
    lang: str,
    transliterate: bool = False,
    remove_tag: bool = True
) -> int:
    """
    Preprocess the text in the input file by normalizing, tokenizing and
    script conversion and write the output to a new file.

    Lines are processed in parallel with `preprocess_line`. The output file is
    opened only after every line has been processed, so a failure during
    processing does not truncate an existing output file, and `outfname` may
    be the same path as `infname`.

    Args:
        infname (str): path of the input file (read as UTF-8).
        outfname (str): path of the output file (written as UTF-8).
        lang (str): language of the text in the input file; must be a key of `flores_codes`.
        transliterate (bool, optional): whether to transliterate the text in input file to devanagari (default: False).
        remove_tag (bool, optional): whether to remove the do not translate tags (`<dnt>` and `</dnt>`) from the text in input file (default: True).

    Returns:
        int: number of sentences in the input file

    Raises:
        KeyError: if `lang` is not present in `flores_codes`.
        OSError: if the input file cannot be read or the output file cannot be written.
    """
    iso_lang = flores_codes[lang]

    # Count the lines up front so the progress bar has a total. The file is
    # read as UTF-8, matching the processing pass below, and closed afterwards.
    with open(infname, "r", encoding="utf-8") as infile:
        num_lines = sum(1 for _ in infile)

    # English lines use the module-level Moses helpers inside preprocess_line,
    # so no language-specific normalizer is built for them.
    if iso_lang == "en":
        normalizer = None
    else:
        normfactory = indic_normalize.IndicNormalizerFactory()
        normalizer = normfactory.get_normalizer(iso_lang)

    with open(infname, "r", encoding="utf-8") as infile:
        out_lines = Parallel(n_jobs=-1, backend="multiprocessing")(
            delayed(preprocess_line)(line, normalizer, lang, transliterate, remove_tag)
            for line in tqdm(infile, total=num_lines)
        )

    with open(outfname, "w", encoding="utf-8") as outfile:
        outfile.writelines(out_line + "\n" for out_line in out_lines)

    return len(out_lines)
|
|
|
|
|
if __name__ == "__main__":
    # Positional arguments: input path, output path, language code,
    # transliterate flag, remove-tag flag.
    infname = sys.argv[1]
    outfname = sys.argv[2]
    lang = sys.argv[3]

    # A flag is on only when it is spelled "true" (case-insensitive);
    # any other value turns it off.
    transliterate = sys.argv[4].lower() == "true"
    remove_tag = sys.argv[5].lower() == "true"

    # Report the number of processed lines on stdout.
    print(preprocess(infname, outfname, lang, transliterate, remove_tag))
|
|