from text.symbols import symbols, DOUBLING_TOKEN, EOS_TOKEN, SEPARATOR_TOKEN
from text.phonetise_buckwalter import (
    arabic_to_buckwalter,
    buckwalter_to_arabic,
    process_utterance,
)

# Vowel phonemes as they appear in the phonetiser output (long forms first).
vowels = [
    'aa', 'AA', 'uu0', 'uu1', 'UU0', 'UU1',
    'ii0', 'ii1', 'II0', 'II1',
    'a', 'A', 'u0', 'u1', 'U0', 'U1',
    'i0', 'i1', 'I0', 'I1',
]

# Collapses every vowel variant onto a reduced symbol (aa/uu/ii/a/u/i).
# Insertion order matters for `simplify_phonemes`: the long variants are
# listed before the short ones they contain as substrings.
vowel_map = {
    'aa': 'aa', 'AA': 'aa',
    'uu0': 'uu', 'uu1': 'uu', 'UU0': 'uu', 'UU1': 'uu',
    'ii0': 'ii', 'ii1': 'ii', 'II0': 'ii', 'II1': 'ii',
    'a': 'a', 'A': 'a',
    'u0': 'u', 'u1': 'u', 'U0': 'u', 'U1': 'u',
    'i0': 'i', 'i1': 'i', 'I0': 'i', 'I1': 'i',
}

# Default symbol -> index table, derived from the position in `symbols`.
phon_to_id_ = {phon: i for i, phon in enumerate(symbols)}


def tokens_to_ids(phonemes, phon_to_id=None):
    """Map each token in *phonemes* to its integer id.

    Args:
        phonemes: Iterable of token strings.
        phon_to_id: Optional token -> id mapping. When ``None`` the
            module-level table built from ``symbols`` is used.

    Returns:
        A list with one id per input token.

    Raises:
        KeyError: If a token is missing from the mapping in use.
    """
    mapping = phon_to_id_ if phon_to_id is None else phon_to_id
    return [mapping[phon] for phon in phonemes]


def ids_to_tokens(ids):
    """Return the entries of ``symbols`` at the given indices, in order."""
    return [symbols[idx] for idx in ids]


def arabic_to_phonemes(arabic):
    """Run *arabic* through ``arabic_to_buckwalter`` then ``process_utterance``."""
    buckw = arabic_to_buckwalter(arabic)
    return process_utterance(buckw)


def buckwalter_to_phonemes(buckw):
    """Return ``process_utterance(buckw)`` unchanged."""
    return process_utterance(buckw)


def phonemes_to_tokens(phonemes: str, append_space=True):
    """Convert a whitespace-separated phoneme string into a token list.

    Steps, in order:
      1. Every occurrence of the substring ``"sil"`` is removed and every
         ``"+"`` is rewritten to ``"_+_"``; the result is split on whitespace.
      2. A two-character, non-vowel phoneme whose characters are equal is
         emitted as the single character followed by ``DOUBLING_TOKEN``.
      3. Vowel phonemes are replaced by their ``vowel_map`` value.
      4. ``SEPARATOR_TOKEN`` is appended when *append_space* is true, and
         ``EOS_TOKEN`` is always appended last.

    Args:
        phonemes: Phoneme string to tokenise.
        append_space: Whether to add ``SEPARATOR_TOKEN`` before ``EOS_TOKEN``.

    Returns:
        A new list of token strings.
    """
    raw_phonemes = phonemes.replace("sil", "").replace("+", "_+_").split()

    # Build a fresh list rather than inserting into the list being iterated.
    # NOTE(review): DOUBLING_TOKEN is emitted verbatim here; this assumes the
    # token itself is neither a vowel nor a doubled two-character symbol.
    tokens = []
    for phon in raw_phonemes:
        # `vowel_map` has exactly the same keys as `vowels`.
        is_doubled = (
            len(phon) == 2
            and phon not in vowel_map
            and phon[0] == phon[1]
        )
        base = phon[0] if is_doubled else phon
        tokens.append(vowel_map.get(base, base))
        if is_doubled:
            tokens.append(DOUBLING_TOKEN)

    if append_space:
        tokens.append(SEPARATOR_TOKEN)
    tokens.append(EOS_TOKEN)
    return tokens


def buckwalter_to_tokens(buckw, append_space=True):
    """Phonetise *buckw* and tokenise the result with `phonemes_to_tokens`."""
    phonemes = buckwalter_to_phonemes(buckw)
    return phonemes_to_tokens(phonemes, append_space=append_space)


def arabic_to_tokens(arabic, append_space=True):
    """Convert *arabic* via ``arabic_to_buckwalter`` and tokenise the result."""
    buckw = arabic_to_buckwalter(arabic)
    return buckwalter_to_tokens(buckw, append_space=append_space)


def simplify_phonemes(phonemes):
    """Replace every vowel variant in the string *phonemes* by its reduced form.

    Replacement is substring-based (``str.replace``) and applied in
    ``vowel_map`` order, so longer variants are rewritten before the
    shorter ones they contain.
    """
    for variant, reduced in vowel_map.items():
        phonemes = phonemes.replace(variant, reduced)
    return phonemes