File size: 1,317 Bytes
c137b2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
""" from https://github.com/keithito/tacotron """

'''
Cleaners are transformations that run over the input text at both training and eval time.

'''

import re
from unidecode import unidecode
from phonemizer import phonemize

# Pre-compiled pattern matching any run of one or more whitespace characters.
_whitespace_re = re.compile(r'\s+')

def lowercase(text):
  '''Return a lowercased copy of the text.'''
  lowered = text.lower()
  return lowered

def collapse_whitespace(text):
  '''Replace every run of whitespace in the text with a single space.'''
  collapsed = _whitespace_re.sub(' ', text)
  return collapsed

def replace_quote(text):
    '''Swap every typographic apostrophe in the text for a plain ASCII one.'''
    quote_table = str.maketrans('’', "'")
    return text.translate(quote_table)

def remove_special_characters(text):
    '''Strip guillemets, dashes of the '–' kind, brackets, braces and pipes from the text.'''
    unwanted = ('«', '»', '–', '[', ']', '{', '}', '|')
    # Mapping a code point to None makes str.translate drop it in a single pass.
    deletions = {ord(symbol): None for symbol in unwanted}
    return text.translate(deletions)

def remove_hyphen_at_start(text):
    '''Drop one leading '-' together with any whitespace that directly follows it.'''
    if not text.startswith('-'):
        return text
    # Only the first hyphen goes; lstrip() then trims the whitespace after it.
    return text[1:].lstrip()

def basic_cleaners(text):
  '''Basic pipeline: lowercase, normalize a few characters, collapse whitespace, strip symbols.

  Steps, in order:
    1. lowercase the text;
    2. fold a decomposed a-with-ring (a + U+030A) into the precomposed U+00E5;
    3. turn the acute accent character into an ASCII apostrophe;
    4. map a-with-grave (precomposed or decomposed) to a plain a;
    5. collapse whitespace runs, replace typographic apostrophes, remove the
       special characters and drop a leading hyphen via the module helpers.

  Returns the cleaned string.
  '''
  text = lowercase(text)
  # The precomposed and decomposed spellings of a-with-ring render identically,
  # so they are written as escapes to keep this replacement from silently
  # degrading into a no-op when the file passes through a normalizing tool.
  text = text.replace('a\u030a', '\u00e5')
  text = text.replace('´', "'")
  # Same concern for a-with-grave: handle both Unicode forms explicitly.
  text = text.replace('\u00e0', 'a').replace('a\u0300', 'a')
  # NOTE(review): whitespace is collapsed before characters are removed, so a
  # removed symbol surrounded by spaces leaves a double space behind. The order
  # is kept as-is because cleaners run at both training and eval time, and
  # changing the output could mismatch existing models - confirm before reordering.
  text = collapse_whitespace(text)
  text = replace_quote(text)
  text = remove_special_characters(text)
  text = remove_hyphen_at_start(text)
  return text