Mahiruoshi committed on
Commit
cc912a2
·
1 Parent(s): 06b8201

Update text/cleaners.py

Browse files
Files changed (1) hide show
  1. text/cleaners.py +25 -10
text/cleaners.py CHANGED
@@ -64,15 +64,30 @@ def cjks_cleaners(text):
64
 
65
 
def cjke_cleaners(text):
    """Rewrite language-tagged spans of *text* as IPA.

    This is the implementation shown on the removed side of the diff.
    Spans written as ``[ZH]...[ZH]``, ``[JA]...[JA]``, ``[KO]...[KO]`` and
    ``[EN]...[EN]`` are passed (without their tags) to
    ``chinese_to_lazy_ipa``, ``japanese_to_ipa``, ``korean_to_ipa`` and
    ``english_to_ipa2`` respectively.  Each converted span gets a few symbol
    substitutions and one trailing space.  Finally, trailing whitespace is
    removed and a ``.`` is appended unless the text already ends with one of
    ``. , ! ? - … ~``.

    Args:
        text: Input string, possibly containing tagged spans.

    Returns:
        The converted string.
    """
    def _substitute(ipa, pairs):
        # Apply the symbol substitutions strictly in the listed order.
        for old, new in pairs:
            ipa = ipa.replace(old, new)
        return ipa

    def _chinese(match):
        ipa = chinese_to_lazy_ipa(match.group(1))
        pairs = (('ʧ', 'tʃ'), ('ʦ', 'ts'), ('ɥan', 'ɥæn'))
        return _substitute(ipa, pairs) + ' '

    def _japanese(match):
        ipa = japanese_to_ipa(match.group(1))
        pairs = (('ʧ', 'tʃ'), ('ʦ', 'ts'), ('ɥan', 'ɥæn'), ('ʥ', 'dz'))
        return _substitute(ipa, pairs) + ' '

    def _korean(match):
        return korean_to_ipa(match.group(1)) + ' '

    def _english(match):
        ipa = english_to_ipa2(match.group(1))
        pairs = (('ɑ', 'a'), ('ɔ', 'o'), ('ɛ', 'e'), ('ɪ', 'i'), ('ʊ', 'u'))
        return _substitute(ipa, pairs) + ' '

    # The passes run one after another over the progressively rewritten text,
    # in this fixed order.
    passes = (
        (r'\[ZH\](.*?)\[ZH\]', _chinese),
        (r'\[JA\](.*?)\[JA\]', _japanese),
        (r'\[KO\](.*?)\[KO\]', _korean),
        (r'\[EN\](.*?)\[EN\]', _english),
    )
    for pattern, handler in passes:
        text = re.sub(pattern, handler, text)

    # Drop trailing whitespace, then make sure the text ends in punctuation.
    text = re.sub(r'\s+$', '', text)
    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
    return text
78
 
 
64
 
65
 
def cjke_cleaners(text):
    """Convert ``[ZH]``/``[JA]``/``[KO]``/``[EN]``-tagged spans of *text* to IPA.

    Every span of the form ``[XX]...[XX]`` is located up front in the
    original text.  Its payload (the characters between the two 4-character
    tags) is passed to the matching converter, a few symbol substitutions are
    applied, and the first remaining occurrence of the tagged span is
    replaced by the result followed by one separating space.  Trailing
    whitespace is then removed and a ``.`` is appended unless the text
    already ends with one of ``. , ! ? - … ~``.

    Args:
        text: Input string, possibly containing tagged spans.

    Returns:
        The converted string.  Empty or whitespace-only input yields ``""``.
    """
    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
    korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
    english_texts = re.findall(r'\[EN\].*?\[EN\]', text)

    for chinese_text in chinese_texts:
        # [4:-4] strips the opening and closing '[ZH]' tags.
        cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
        # 'ʧ' is expanded to 'tʃ', as in the Japanese branch; mapping it to
        # '' would silently delete the sound from the output.
        cleaned_text = cleaned_text.replace(
            'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')
        text = text.replace(chinese_text, cleaned_text + ' ', 1)

    for japanese_text in japanese_texts:
        cleaned_text = japanese_to_ipa(japanese_text[4:-4])
        cleaned_text = cleaned_text.replace('ʧ', 'tʃ').replace(
            'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')
        text = text.replace(japanese_text, cleaned_text + ' ', 1)

    for korean_text in korean_texts:
        cleaned_text = korean_to_ipa(korean_text[4:-4])
        text = text.replace(korean_text, cleaned_text + ' ', 1)

    for english_text in english_texts:
        cleaned_text = english_to_ipa2(english_text[4:-4])
        cleaned_text = cleaned_text.replace('ɑ', 'a').replace(
            'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')
        text = text.replace(english_text, cleaned_text + ' ', 1)

    # Strip trailing whitespace instead of blindly chopping one character:
    # the text does not necessarily end with the space added after a span.
    text = text.rstrip()
    # Guard against empty text before looking at the last character.
    if text and text[-1] not in '.,!?-…~':
        text += '.'
    return text
93