import re
import sys
import typing as tp
import unicodedata

import torch
from sacremoses import MosesPunctNormalizer
from sentence_splitter import SentenceSplitter
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer

L1 = "spa_Latn"
L2 = "agr_Latn"
LANGUAGES = {
    "Spanish | spa": L1,
    "Awajun | agr": L2,
}
def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
    # Translation table mapping every non-printable Unicode character
    # (control, format, surrogate, private-use and unassigned code points)
    # to the replacement string.
    non_printable_map = {
        ord(c): replace_by
        for c in (chr(i) for i in range(sys.maxunicode + 1))
        if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
    }

    def replace_non_printing_char(line: str) -> str:
        return line.translate(non_printable_map)

    return replace_non_printing_char
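# Usage sketch (the sample string below is made up for illustration):
#
#   replace_nonprint = get_non_printing_char_replacer(" ")
#   replace_nonprint("soft\u00adhyphen")  # the invisible soft hyphen becomes a plain space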
class TextPreprocessor:
    """
    Mimic the text preprocessing done for the NLLB model.
    This code is adapted from the Stopes repo of the NLLB team:
    https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/monolingual_line_processor.py#L214
    """

    def __init__(self, lang="en"):
        self.mpn = MosesPunctNormalizer(lang=lang)
        # sacremoses stores its substitution patterns as strings; pre-compile them once
        self.mpn.substitutions = [
            (re.compile(r), sub) for r, sub in self.mpn.substitutions
        ]
        self.replace_nonprint = get_non_printing_char_replacer(" ")

    def __call__(self, text: str) -> str:
        clean = self.mpn.normalize(text)
        clean = self.replace_nonprint(clean)
        # normalize unicode, e.g. map stylized letters to their plain equivalents
        clean = unicodedata.normalize("NFKC", clean)
        return clean
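# Usage sketch (the sample sentence is made up for illustration):
#
#   preproc = TextPreprocessor(lang="es")
#   preproc("«Hola»,  ¿cómo estás?")  # punctuation and whitespace normalized,
#                                     # non-printing characters replaced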
def fix_tokenizer(tokenizer, new_lang=L2):
    """Add a new language token to the tokenizer vocabulary
    (this has to be repeated every time the tokenizer is re-initialized).
    """
    old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
    tokenizer.lang_code_to_id[new_lang] = old_len - 1
    tokenizer.id_to_lang_code[old_len - 1] = new_lang
    # always keep "<mask>" in the last position
    tokenizer.fairseq_tokens_to_ids["<mask>"] = (
        len(tokenizer.sp_model)
        + len(tokenizer.lang_code_to_id)
        + tokenizer.fairseq_offset
    )

    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    tokenizer.fairseq_ids_to_tokens = {
        v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()
    }
    if new_lang not in tokenizer._additional_special_tokens:
        tokenizer._additional_special_tokens.append(new_lang)
    # clear the added-token caches; otherwise the new token may end up there by mistake
    tokenizer.added_tokens_encoder = {}
    tokenizer.added_tokens_decoder = {}
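# Usage sketch: the checkpoint name below is only an example, and this assumes a
# transformers version where NllbTokenizer still exposes lang_code_to_id:
#
#   tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
#   fix_tokenizer(tokenizer)  # "agr_Latn" now resolves to its own token id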
def sentenize_with_fillers(text, splitter, fix_double_space=True, ignore_errors=False):
    """Apply a sentence splitter and return the sentences and all separators before and after them."""
    if fix_double_space:
        text = re.sub(" +", " ", text)
    sentences = splitter.split(text)
    fillers = []
    i = 0
    for sentence in sentences:
        start_idx = text.find(sentence, i)
        if ignore_errors and start_idx == -1:
            # the sentence could not be located verbatim in the text;
            # skip one character and continue instead of failing
            start_idx = i + 1
        assert start_idx != -1, f"sent not found after {i}: `{sentence}`"
        fillers.append(text[i:start_idx])
        i = start_idx + len(sentence)
    fillers.append(text[i:])
    return sentences, fillers
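# Usage sketch (the Spanish sample text is made up for illustration):
#
#   splitter = SentenceSplitter(language="es")
#   sents, fillers = sentenize_with_fillers("Hola. ¿Cómo estás?", splitter)
#   # zip(fillers, sents) pairs each sentence with the separator that precedes it;
#   # fillers[-1] is whatever trails the last sentence.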