from typing import Tuple, List
import regex as re
import sys
from tqdm import tqdm
from joblib import Parallel, delayed
from indic_num_map import INDIC_NUM_MAP


# NOTE(review): part of this module is missing from the reviewed copy. The text
# that originally sat between the fragment
#
#     URL_PATTERN = r'\b(?
#
# and the return annotation of ``wrap_with_dnt_tag`` was lost (it looks like a
# markup stripper removed everything from the first "<" up to the "->" arrow).
# The names below are used further down but their definitions are NOT visible
# and must be restored from version control before this module can run:
#
#     URL_PATTERN, EMAIL_PATTERN, NUMERAL_PATTERN, OTHER_PATTERN,
#     normalize_indic_numerals
#
# They are intentionally not re-invented here.


def wrap_with_dnt_tag(src: str, tgt: str, pattern: str) -> Tuple[str, str]:
    """
    Wraps all occurrences of a given pattern match that are present in both the
    source and the target sentence with do-not-translate tags
    (`<dnt> {input string} </dnt>`). This is useful when some span of the input
    needs to be forwarded as it is and not translated.

    Only matches found in BOTH sentences are wrapped. Afterwards every run of
    whitespace in both sentences is collapsed to a single space, whether or not
    anything was wrapped.

    Args:
        src (str): source sentence.
        tgt (str): target sentence.
        pattern (str): pattern to search for in the source and target sentence.

    Returns:
        Tuple[str, str]: the source and target sentences, each with the common
            matches wrapped in `<dnt>` and `</dnt>` tags.
    """
    # NOTE(review): the signature above was rebuilt from the surviving return
    # annotation, the docstring's Args section and the call in `normalize`.

    # find matches in src and tgt sentence
    src_matches = set(re.findall(pattern, src))
    tgt_matches = set(re.findall(pattern, tgt))

    # find matches that are present in both src and tgt
    common_matches = src_matches.intersection(tgt_matches)

    # wrap common match with <dnt> and </dnt> tag
    # NOTE(review): the tag literals were stripped from the reviewed copy (the
    # f-strings read f' {match} '); `<dnt>` / `</dnt>` is inferred from the
    # function name and docstring -- confirm the exact spelling.
    for match in common_matches:
        src = src.replace(match, f' <dnt> {match} </dnt> ')
        tgt = tgt.replace(match, f' <dnt> {match} </dnt> ')

    # raw string: "\s" in a normal string literal is an invalid escape sequence
    src = re.sub(r"\s+", " ", src)
    tgt = re.sub(r"\s+", " ", tgt)

    return src, tgt


def normalize(src_line: str, tgt_line: str, patterns: List[str]) -> Tuple[str, str]:
    """
    Normalizes a sentence pair and wraps the spans of text that are present in
    both source and target sentence with `<dnt>` and `</dnt>` tags.

    Each line first has its leading/trailing newline characters removed and is
    passed through `normalize_indic_numerals` (defined outside the visible part
    of this file; described by the original docstring as converting Indic
    numerals to Roman script). The results are then run through
    `wrap_with_dnt_tag` once per pattern, in the order given.

    Args:
        src_line (str): source sentence.
        tgt_line (str): target sentence.
        patterns (List[str]): list of patterns to search for in both sentences.

    Returns:
        Tuple[str, str]: the normalized source and target sentences with the
            common pattern matches wrapped in `<dnt>` and `</dnt>` tags.
    """
    src_line = normalize_indic_numerals(src_line.strip("\n"))
    tgt_line = normalize_indic_numerals(tgt_line.strip("\n"))

    for pattern in patterns:
        src_line, tgt_line = wrap_with_dnt_tag(src_line, tgt_line, pattern)

    return src_line, tgt_line


if __name__ == "__main__":
    # fail with a usage message instead of a bare IndexError
    if len(sys.argv) < 5:
        sys.exit(
            f"usage: {sys.argv[0]} <src_infname> <tgt_infname> <src_outfname> <tgt_outfname>"
        )

    src_infname = sys.argv[1]
    tgt_infname = sys.argv[2]
    src_outfname = sys.argv[3]
    tgt_outfname = sys.argv[4]

    # count lines for the progress bar; use the same encoding as the real read
    # below and close the handle when done
    with open(src_infname, "r", encoding="utf-8") as count_file:
        num_lines = sum(1 for _ in count_file)

    patterns = [EMAIL_PATTERN, URL_PATTERN, NUMERAL_PATTERN, OTHER_PATTERN]

    with open(src_infname, "r", encoding="utf-8") as src_infile, \
        open(tgt_infname, "r", encoding="utf-8") as tgt_infile, \
        open(src_outfname, "w", encoding="utf-8") as src_outfile, \
        open(tgt_outfname, "w", encoding="utf-8") as tgt_outfile:

        # zip() pairs the files line by line and stops at the shorter one
        out_lines = Parallel(n_jobs=-1, backend="multiprocessing")(
            delayed(normalize)(src_line, tgt_line, patterns)
            for src_line, tgt_line in tqdm(zip(src_infile, tgt_infile), total=num_lines)
        )

        for src_line, tgt_line in tqdm(out_lines):
            src_outfile.write(src_line + "\n")
            tgt_outfile.write(tgt_line + "\n")