File size: 1,697 Bytes
cbf648c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import re
from text.japanese import japanese_to_romaji_with_accent
from text.mandarin import chinese_to_romaji
from text.english import english_to_ipa2
from text.german import german_to_ipa
from text.croatia_to_ipa import croatian_to_ipa

def cjehd_cleaners(text: str) -> str:
    """Convert language-tagged segments of *text* via per-language converters.

    Segments are delimited by a pair of identical tags on one line:
    ``[ZH]...[ZH]``, ``[JA]...[JA]``, ``[EN]...[EN]``, ``[CR]...[CR]`` and
    ``[DE]...[DE]``. The text between the tags is passed to
    ``chinese_to_romaji``, ``japanese_to_romaji_with_accent``,
    ``english_to_ipa2``, ``croatian_to_ipa`` and ``german_to_ipa``
    respectively, and the whole tagged segment is replaced by the converter
    output followed by a single space. Text outside any tagged segment is
    left unchanged.

    Args:
        text: Input string, optionally containing tagged segments.

    Returns:
        The converted string with trailing whitespace removed and a ``.``
        appended when it does not already end in one of ``. , ! ? - … ~``.
        An input that is empty (or becomes empty after stripping) yields
        an empty string.
    """
    # Collect every segment up front, against the untouched input, so that
    # converter output can never be mistaken for a tagged segment.
    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
    croatian_texts = re.findall(r'\[CR\].*?\[CR\]', text)
    english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
    german_texts = re.findall(r'\[DE\].*?\[DE\]', text)

    # Each tag is 4 characters long, hence the [4:-4] slice to get the
    # inner text. Replacement count is 1 so repeated identical segments are
    # consumed one at a time, in order.
    for chinese_text in chinese_texts:
        cleaned_text = chinese_to_romaji(chinese_text[4:-4])
        text = text.replace(chinese_text, cleaned_text + ' ', 1)

    for japanese_text in japanese_texts:
        cleaned_text = japanese_to_romaji_with_accent(japanese_text[4:-4])
        cleaned_text = (
            cleaned_text.replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')
        )
        text = text.replace(japanese_text, cleaned_text + ' ', 1)

    for english_text in english_texts:
        cleaned_text = english_to_ipa2(english_text[4:-4])
        text = text.replace(english_text, cleaned_text + ' ', 1)

    for croatian_text in croatian_texts:
        cleaned_text = croatian_to_ipa(croatian_text[4:-4]).replace('ḱ', 'k')
        text = text.replace(croatian_text, cleaned_text + ' ', 1)

    for german_text in german_texts:
        # Strip ellipses and hyphens from the inner text only. The original
        # tagged segment must stay intact because it is the search key for
        # the replacement below; mutating it made the replacement a no-op
        # whenever the segment contained '...' or '-'.
        german_inner = german_text[4:-4].replace('...', '').replace('-', '')
        cleaned_text = german_to_ipa(german_inner)
        text = text.replace(german_text, cleaned_text + ' ', 1)

    # Remove the space appended after the final segment (and any other
    # trailing whitespace) without eating a real character when the input
    # does not end in a tagged segment.
    text = text.rstrip()
    if not text:
        return text

    # Make sure the utterance ends with some terminal punctuation.
    if text[-1] not in '.,!?-…~':
        text += '.'
    return text