File size: 1,435 Bytes
c4adc54 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
import re
import unicodedata
from typing import Iterable, Optional

import regex
class BelarusianTextNormalizer:
    """
    Based on transformers.models.whisper.english_normalizer.BasicTextNormalizer
    but with support not to remove certain characters.
    e.g. apostrophe (') - a symbol from Belarusian alphabet - was removed using BasicTextNormalizer.

    Calling an instance lowercases the text, drops bracketed/parenthesized
    segments, replaces marks/symbols/punctuation (except the allowed symbols)
    with spaces and collapses runs of whitespace into a single space.
    """

    def __init__(self, split_letters: bool = False):
        """
        :param split_letters: when True, the normalized text is split into
            extended grapheme clusters (``\\X`` of the ``regex`` module) that
            are joined back with single spaces.
        """
        self.split_letters = split_letters
        # Characters that survive cleaning although they are punctuation.
        self.allowed_symbols = ("'",)

    @staticmethod
    def clean(s: str, allowed_symbols: Optional[Iterable[str]] = None):
        """
        Replace any other markers, symbols, punctuations with a space.

        The input is NFKC-normalized first, so a base letter followed by a
        combining mark is composed into a single letter where a precomposed
        form exists (such letters keep their diacritics). Every remaining
        character whose Unicode general category starts with ``M``, ``S`` or
        ``P`` is replaced with a space unless it is listed in
        ``allowed_symbols``.

        :param s: text to clean.
        :param allowed_symbols: single characters to keep even though they
            are marks, symbols or punctuation. Any iterable is accepted,
            including one-shot iterators; ``None`` means "keep none".
        :return: the cleaned text (whitespace is not collapsed here).
        """
        # Materialize once: testing `c in allowed_symbols` per character would
        # exhaust a generator/iterator after the first lookup and silently
        # stop protecting the allowed characters.
        keep = frozenset(allowed_symbols) if allowed_symbols is not None else frozenset()
        return "".join(
            " " if unicodedata.category(c)[0] in "MSP" and c not in keep else c
            for c in unicodedata.normalize("NFKC", s)
        )

    def __call__(self, s: str):
        """
        Normalize ``s`` and return the result.

        Leading/trailing whitespace is not stripped; it is only collapsed to
        a single space like any other whitespace run.
        """
        s = s.lower()
        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        # Lowercase again: NFKC normalization inside clean() may turn
        # compatibility characters into letters that still have an uppercase form.
        s = self.clean(s, allowed_symbols=self.allowed_symbols).lower()

        if self.split_letters:
            s = " ".join(regex.findall(r"\X", s, regex.U))

        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space
        return s
|