Spaces:
Running
Running
"""Set of default text cleaners""" | |
# TODO: pick the cleaner for languages dynamically | |
import re | |
from anyascii import anyascii | |
from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text | |
from .english.abbreviations import abbreviations_en | |
from .english.number_norm import normalize_numbers as en_normalize_numbers | |
from .english.time_norm import expand_time_english | |
from .french.abbreviations import abbreviations_fr | |
# Regular expression matching one or more consecutive whitespace characters
# (used by collapse_whitespace to squeeze each run into a single space):
_whitespace_re = re.compile(r"\s+")
def expand_abbreviations(text, lang="en"):
    """Expand known abbreviations in the text for the given language.

    Args:
        text:
            Input text.
        lang:
            Language identifier. "en" and "fr" have an abbreviation table;
            for any other value the text is returned unchanged.

    Returns:
        The text after applying every (regex, replacement) pair of the
        selected abbreviation table, in table order.
    """
    if lang == "en":
        _abbreviations = abbreviations_en
    elif lang == "fr":
        _abbreviations = abbreviations_fr
    else:
        # No abbreviation table for this language. Without this branch the
        # loop below would raise UnboundLocalError on `_abbreviations`.
        return text
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text
def lowercase(text):
    """Return a lowercased copy of the text."""
    lowered = text.lower()
    return lowered
def collapse_whitespace(text):
    """Squeeze every whitespace run into one space and trim both ends."""
    squeezed = _whitespace_re.sub(" ", text)
    return squeezed.strip()
def convert_to_ascii(text):
    """Return the text as transliterated by ``anyascii``."""
    ascii_text = anyascii(text)
    return ascii_text
def remove_aux_symbols(text):
    """Strip angle brackets, parentheses, square brackets and double quotes."""
    return re.sub(r"[\<\>\(\)\[\]\"]+", "", text)
def replace_symbols(text, lang="en"):
    """Replace symbols based on the language tag.

    Semicolons and colons become commas for every language. Hyphens become
    a space, except for Catalan ("ca") where they are removed. The ampersand
    is spelled out for "en", "fr", "pt" and "ca" and left untouched for any
    other tag. Apostrophes are removed for Catalan only.

    Args:
        text:
            Input text.
        lang:
            Language identifier. ex: "en", "fr", "pt", "ca".

    Returns:
        The modified text

    example:
        input args:
            text: "si l'avi cau, diguem-ho"
            lang: "ca"
        Output:
            text: "si lavi cau, diguemho"
    """
    is_catalan = lang == "ca"
    # Spoken form of "&" per supported language tag.
    ampersand_words = {"en": " and ", "fr": " et ", "pt": " e ", "ca": " i "}

    text = text.replace(";", ",")
    # Catalan joins hyphenated words; every other language splits them.
    text = text.replace("-", "" if is_catalan else " ")
    text = text.replace(":", ",")
    if lang in ampersand_words:
        text = text.replace("&", ampersand_words[lang])
    if is_catalan:
        text = text.replace("'", "")
    return text
def basic_cleaners(text):
    """Minimal pipeline: lowercase, then collapse whitespace. No transliteration."""
    return collapse_whitespace(lowercase(text))
def transliteration_cleaners(text):
    """Pipeline for non-English text: lowercase, then collapse whitespace.

    The ASCII transliteration step (``convert_to_ascii``) is disabled in
    this pipeline, so non-ASCII characters are passed through.
    """
    # text = convert_to_ascii(text)
    lowered = lowercase(text)
    return collapse_whitespace(lowered)
def basic_german_cleaners(text):
    """Pipeline for German text: lowercase, then collapse whitespace."""
    return collapse_whitespace(lowercase(text))
# TODO: elaborate it | |
def basic_turkish_cleaners(text):
    """Pipeline for Turkish text.

    Applies the Turkish casing of the dotted/dotless "i" pair, then
    lowercases the rest and collapses whitespace.
    """
    # Turkish casing: "I" -> "ı" and "İ" (U+0130) -> "i". Both are mapped
    # before the generic lowercasing, because str.lower() would turn "I"
    # into "i" and "İ" into "i" followed by a combining dot above (U+0307).
    text = text.replace("İ", "i").replace("I", "ı")
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text
def english_cleaners(text):
    """Pipeline for English text, including number and abbreviation expansion.

    Steps run in order: lowercase, time expansion, number normalization,
    abbreviation expansion, symbol replacement, auxiliary symbol removal
    and whitespace collapsing. ``convert_to_ascii`` is not applied.
    """
    # text = convert_to_ascii(text)
    pipeline = (
        lowercase,
        expand_time_english,
        en_normalize_numbers,
        expand_abbreviations,
        replace_symbols,
        remove_aux_symbols,
        collapse_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return text
def phoneme_cleaners(text):
    """Pipeline for phonemes mode, including number and abbreviation expansion.

    Unlike the English pipeline, the text is not lowercased and times are
    not expanded.
    """
    pipeline = (
        en_normalize_numbers,
        expand_abbreviations,
        replace_symbols,
        remove_aux_symbols,
        collapse_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return text
def french_cleaners(text):
    """Pipeline for French text. There is no need to expand numbers, phonemizer already does that"""
    # Abbreviations are expanded before lowercasing, as in the original order.
    expanded = expand_abbreviations(text, lang="fr")
    lowered = lowercase(expanded)
    replaced = replace_symbols(lowered, lang="fr")
    stripped = remove_aux_symbols(replaced)
    return collapse_whitespace(stripped)
def portuguese_cleaners(text):
    """Basic pipeline for Portuguese text. There is no need to expand abbreviation and
    numbers, phonemizer already does that"""
    lowered = lowercase(text)
    replaced = replace_symbols(lowered, lang="pt")
    stripped = remove_aux_symbols(replaced)
    return collapse_whitespace(stripped)
def chinese_mandarin_cleaners(text: str) -> str:
    """Basic pipeline for Chinese: pass the text through the number replacer."""
    return replace_numbers_to_characters_in_text(text)
def multilingual_cleaners(text):
    """Pipeline for multilingual text.

    Symbols are replaced with ``lang=None``, so only the language-neutral
    substitutions of ``replace_symbols`` apply.
    """
    lowered = lowercase(text)
    replaced = replace_symbols(lowered, lang=None)
    stripped = remove_aux_symbols(replaced)
    return collapse_whitespace(stripped)
def no_cleaners(text):
    """Return the text with newline characters removed and nothing else changed."""
    # Splitting on "\n" and re-joining drops every newline character.
    return "".join(text.split("\n"))