File size: 1,435 Bytes

c4adc54

import re
import regex
import unicodedata

from typing import Iterable


class BelarusianTextNormalizer:
    """
    Lowercase text, drop bracketed/parenthesised spans and replace marks,
    symbols and punctuation with spaces, except for explicitly allowed symbols.

    Based on transformers.models.whisper.english_normalizer.BasicTextNormalizer
    but with support not to remove certain characters.
    e.g. apostrophe (') - a symbol from Belarusian alphabet - was removed using BasicTextNormalizer.
    """

    # Compiled once at class creation instead of being looked up on every call.
    _BRACKETED = re.compile(r"[<\[][^>\]]*[>\]]")  # "<...>" and "[...]" spans
    _PARENTHESIZED = re.compile(r"\(([^)]+?)\)")  # non-empty "(...)" spans
    _WHITESPACE_RUN = re.compile(r"\s+")

    def __init__(self, split_letters: bool = False):
        """
        Args:
            split_letters: when true, ``__call__`` separates every extended
                grapheme cluster of the cleaned text with a space.
        """
        self.split_letters = split_letters
        # Symbols that must survive cleaning; the apostrophe is part of
        # Belarusian spelling.
        self.allowed_symbols = ("'",)

    @staticmethod
    def clean(s: str, allowed_symbols: Iterable[str] = None) -> str:
        """
        Replace any other markers, symbols, punctuations with a space, keeping diacritics

        The input is NFKC-normalised first; then every character whose Unicode
        general category starts with M, S or P is replaced by a single space,
        unless it is listed in ``allowed_symbols``.

        Args:
            s: text to clean.
            allowed_symbols: single-character strings to keep as-is. ``None``
                (the default) keeps nothing. Any iterable is accepted,
                including one-shot iterators.

        Returns:
            The cleaned text; whitespace is not collapsed or stripped here.
        """
        # Materialise once: a membership test against a one-shot iterator
        # consumes it, so only the first occurrence of an allowed symbol would
        # be kept. A frozenset also makes each lookup O(1).
        kept = frozenset(allowed_symbols) if allowed_symbols is not None else frozenset()
        return "".join(
            " " if unicodedata.category(c)[0] in "MSP" and c not in kept else c
            for c in unicodedata.normalize("NFKC", s)
        )

    def __call__(self, s: str) -> str:
        """
        Normalise ``s`` and return the result.

        Steps, in order: lowercase; remove "<...>" / "[...]" spans; remove
        "(...)" spans; clean symbols via ``clean`` (keeping
        ``self.allowed_symbols``); optionally split into grapheme clusters;
        collapse every whitespace run into one space. Leading/trailing
        whitespace is collapsed but not stripped.
        """
        s = s.lower()
        s = self._BRACKETED.sub("", s)  # remove words between brackets
        s = self._PARENTHESIZED.sub("", s)  # remove words between parenthesis
        # Lowercase again: NFKC inside clean() can expand compatibility
        # characters into uppercase letters (e.g. U+2103 becomes "°C").
        s = self.clean(s, allowed_symbols=self.allowed_symbols).lower()

        if self.split_letters:
            # \X matches one extended grapheme cluster (third-party `regex` module).
            s = " ".join(regex.findall(r"\X", s, regex.U))

        # replace any successive whitespace characters with a space
        return self._WHITESPACE_RUN.sub(" ", s)