# Source: text/cleaners.py, uploaded by Pipe1213
# Commit 4e392aa (verified): "Update text/cleaners.py"
""" from https://github.com/keithito/tacotron """
'''
Cleaners are transformations that run over the input text at both training and eval time.
'''
import re
from unidecode import unidecode
from phonemizer import phonemize
# Matches any run of one or more whitespace characters; used by collapse_whitespace.
_whitespace_re = re.compile(r'\s+')
def lowercase(text):
    '''Return a copy of `text` with every cased character lowercased.

    Args:
        text: input string.

    Returns:
        The lowercased string.
    '''
    return text.lower()
def collapse_whitespace(text):
    '''Replace every run of whitespace in `text` with a single space.

    Leading and trailing whitespace is collapsed to one space as well, not
    stripped.

    Args:
        text: input string.

    Returns:
        The string with each whitespace run replaced by ' '.
    '''
    return _whitespace_re.sub(' ', text)
def replace_quote(text):
    '''Replace each typographic apostrophe (’) in `text` with an ASCII "'".

    Args:
        text: input string.

    Returns:
        The string with every ’ replaced by '.
    '''
    return text.replace('’', "'")
def remove_special_characters(text):
    '''Delete guillemets, dashes (–), brackets, braces and pipes from `text`.

    The characters removed are: « » – [ ] { } |
    Nothing is inserted in their place, so surrounding whitespace is left
    exactly as it was.

    Args:
        text: input string.

    Returns:
        The string with every listed character removed.
    '''
    # A deletion-only translation table drops all listed characters in one
    # pass instead of rescanning the string once per character.
    deletions = str.maketrans('', '', '«»–[]{}|')
    return text.translate(deletions)
def remove_hyphen_at_start(text):
    '''Drop one leading '-' and any whitespace that immediately follows it.

    Only the first character is examined: text that starts with whitespace
    before the hyphen is returned unchanged, and of several leading hyphens
    only the first is removed.

    Args:
        text: input string.

    Returns:
        `text` without its leading hyphen, or `text` unchanged when it does
        not start with '-'.
    '''
    if not text.startswith('-'):
        return text
    # Strip the hyphen, then the whitespace that separated it from the text.
    return text[1:].lstrip()
def basic_cleaners(text):
    '''Lowercase `text`, apply a fixed list of spelling substitutions, then tidy it.

    Steps, in order:
      1. lowercase the input;
      2. apply the substring substitutions listed in `substitutions`,
         one after another;
      3. collapse whitespace runs, replace typographic apostrophes, remove
         the characters handled by remove_special_characters, and strip a
         single leading hyphen.

    The substitutions are order-sensitive (for example 'à' -> 'a' runs before
    'au' -> 'å', so 'àu' ends up as 'å'), which is why they are applied
    sequentially instead of in a single pass.

    Args:
        text: input string.

    Returns:
        The cleaned string.
    '''
    text = lowercase(text)

    # Ordered (old, new) pairs. Replacements for 'Qu' and 'GNGN' are not
    # listed: the text is already lowercase here, so they could never match.
    substitutions = (
        # NOTE(review): the two literals of this pair render identically in
        # the source; assumed to fold decomposed 'a' + U+030A into precomposed
        # U+00E5. Escapes keep the two forms distinguishable; confirm.
        ('a\u030a', '\u00e5'),
        ('´', "'"),
        ('à', 'a'),
        # Emits an uppercase 'K' into otherwise lowercase text; presumably a
        # dedicated symbol for downstream processing (verify against callers).
        ('qu', 'K'),
        ('gngn', 'djn'),
        ('djdj', 'dj'),
        ('qw', 'kw'),
        ('emb', 'anb'),
        ('emp', 'anp'),
        ('eû', 'eu'),
        ('au', '\u00e5'),  # å
        ('t′', 'te'),
        ('s′', 'sse'),
        ('aî', 'ai'),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    text = collapse_whitespace(text)
    text = replace_quote(text)
    text = remove_special_characters(text)
    text = remove_hyphen_at_start(text)
    return text