""" from https://github.com/keithito/tacotron """

'''
Cleaners are transformations that run over the input text at both training and eval time.
'''

import re
from unidecode import unidecode
from phonemizer import phonemize


# Matches any run of one or more whitespace characters.
_whitespace_re = re.compile(r'\s+')

# Deletion table for remove_special_characters: every character listed here
# is dropped from the text in a single pass.
_special_characters_table = str.maketrans('', '', '«»–[]{}|')


def lowercase(text: str) -> str:
  '''Return `text` converted to lowercase.'''
  return text.lower()


def collapse_whitespace(text: str) -> str:
  '''Replace every run of whitespace in `text` with a single space.'''
  return _whitespace_re.sub(' ', text)


def replace_quote(text: str) -> str:
  '''Replace the typographic apostrophe (’) with a plain ASCII apostrophe.'''
  return text.replace('’', "'")


def remove_special_characters(text: str) -> str:
  '''Delete guillemets, the dash '–', square/curly brackets and '|' from `text`.

  The characters are removed without substitution, so whitespace that
  surrounded them is left untouched.
  '''
  return text.translate(_special_characters_table)


def remove_hyphen_at_start(text: str) -> str:
  '''Drop a single leading '-' and any whitespace that directly follows it.

  Text that does not start with '-' is returned unchanged.
  '''
  if text.startswith('-'):
    return text[1:].lstrip()
  return text


def basic_cleaners(text: str) -> str:
  '''Basic pipeline that lowercases and collapses whitespace without transliteration.

  Steps, in order: lowercase; normalise a few accented/quote characters;
  delete special characters; collapse whitespace; remove a leading hyphen.

  Args:
    text: raw input text.

  Returns:
    The cleaned text.
  '''
  text = lowercase(text)
  # Fold "a + combining ring above" into the single precomposed character.
  # NOTE(review): the two literals in the original replace() call rendered
  # identically; this assumes the intent was decomposed -> precomposed.
  # Written with escapes so the distinction cannot be lost in an editor.
  text = text.replace('a\u030a', '\u00e5')
  text = text.replace('´', "'")
  # Strip the grave accent from 'à' in both its precomposed and decomposed
  # spellings, so the result does not depend on the input's normal form.
  text = text.replace('\u00e0', 'a').replace('a\u0300', 'a')
  text = replace_quote(text)
  text = remove_special_characters(text)
  # Collapse only after the deletions above: removing a character that sat
  # between two spaces would otherwise leave a double space behind.
  text = collapse_whitespace(text)
  text = remove_hyphen_at_start(text)
  return text