|
import re |
|
import regex |
|
import unicodedata |
|
|
|
from typing import Iterable |
|
|
|
|
|
class BelarusianTextNormalizer:
    """
    Text normalizer for Belarusian transcripts.

    Based on transformers.models.whisper.english_normalizer.BasicTextNormalizer,
    but able to keep selected symbols instead of removing them.
    E.g. the apostrophe (') - a symbol from the Belarusian alphabet - was
    removed by BasicTextNormalizer; here it is preserved.
    """

    def __init__(self, split_letters: bool = False):
        """
        Args:
            split_letters: when True, ``__call__`` joins the extended
                grapheme clusters (regex ``\\X``) of the cleaned text with
                single spaces.
        """
        self.split_letters = split_letters
        # Symbols that must survive cleaning although they are punctuation.
        self.allowed_symbols = ("'",)

    @staticmethod
    def clean(s: str, allowed_symbols: Iterable[str] = None) -> str:
        """
        Replace markers, symbols and punctuation with a space.

        The text is NFKC-normalized first. Every resulting character whose
        Unicode general category starts with ``M``, ``S`` or ``P`` is then
        replaced by a single space, unless it is listed in
        ``allowed_symbols``.

        Args:
            s: text to clean.
            allowed_symbols: characters to keep even if they fall into one
                of the replaced categories. ``None`` means keep none.

        Returns:
            The NFKC-normalized text with the unwanted characters replaced
            by spaces (whitespace is not collapsed here).
        """
        # Materialize once: a one-shot iterable (iterator/generator) would be
        # exhausted by the first membership test, and a set makes every
        # per-character lookup O(1).
        keep = frozenset(allowed_symbols) if allowed_symbols is not None else frozenset()
        return "".join(
            " " if unicodedata.category(c)[0] in "MSP" and c not in keep else c
            for c in unicodedata.normalize("NFKC", s)
        )

    def __call__(self, s: str) -> str:
        """
        Normalize ``s``.

        Steps, in order: lowercase; drop spans opened by ``<`` or ``[`` and
        closed by ``>`` or ``]``; drop non-empty parenthesized spans; replace
        remaining marks/symbols/punctuation (except ``self.allowed_symbols``)
        with spaces; optionally split into grapheme clusters; collapse every
        whitespace run into one space.

        Leading/trailing whitespace is collapsed but not stripped.
        """
        s = s.lower()
        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parentheses
        # Lowercase again: NFKC normalization inside clean() may introduce
        # characters that are not lowercase.
        s = self.clean(s, allowed_symbols=self.allowed_symbols).lower()

        if self.split_letters:
            s = " ".join(regex.findall(r"\X", s, regex.U))

        s = re.sub(r"\s+", " ", s)  # collapse successive whitespace

        return s
|
|