# Spaces: wasmdashai / wasm-ara (Sleeping)
# File size: 2,174 Bytes
from text.symbols import symbols, DOUBLING_TOKEN, EOS_TOKEN, SEPARATOR_TOKEN
from text.phonetise_buckwalter import (
    arabic_to_buckwalter,
    buckwalter_to_arabic,
    process_utterance
)

# Maps every vowel variant (upper/lower case, optional trailing 0/1 digit)
# onto a single simplified lower-case symbol without the digit.
vowel_map = {
    # doubled-letter vowels
    'aa': 'aa', 'AA': 'aa',
    'uu0': 'uu', 'uu1': 'uu', 'UU0': 'uu', 'UU1': 'uu',
    'ii0': 'ii', 'ii1': 'ii', 'II0': 'ii', 'II1': 'ii',
    # single-letter vowels
    'a': 'a', 'A': 'a',
    'u0': 'u', 'u1': 'u', 'U0': 'u', 'U1': 'u',
    'i0': 'i', 'i1': 'i', 'I0': 'i', 'I1': 'i',
}

# Every vowel variant known to `vowel_map`, in declaration order.
vowels = list(vowel_map)

# Default lookup table: each entry of `symbols` mapped to its position.
phon_to_id_ = {symbol: index for index, symbol in enumerate(symbols)}


def tokens_to_ids(phonemes, phon_to_id=None):
    """Translate a sequence of phoneme tokens into their integer ids.

    Args:
        phonemes: Iterable of tokens to look up.
        phon_to_id: Optional mapping from token to id. When ``None``, the
            module-level ``phon_to_id_`` table is used instead.

    Returns:
        A list with one looked-up id per input token, in input order.
    """
    # The fallback table is only consulted when no mapping was supplied.
    lookup = phon_to_id_ if phon_to_id is None else phon_to_id
    return [lookup[token] for token in phonemes]


def ids_to_tokens(ids):
    """Return the entries of ``symbols`` found at the given indices.

    Args:
        ids: Iterable of indices into ``symbols``.

    Returns:
        A list of the corresponding symbols, in input order.
    """
    return [symbols[index] for index in ids]


def arabic_to_phonemes(arabic):
    """Phonetise Arabic-script text.

    The text is first transliterated with ``arabic_to_buckwalter`` and the
    result is handed to ``process_utterance``, whose output is returned
    unchanged.
    """
    return process_utterance(arabic_to_buckwalter(arabic))


def buckwalter_to_phonemes(buckw):
    """Phonetise text that is already in Buckwalter transliteration.

    Thin wrapper that returns whatever ``process_utterance`` produces for
    ``buckw``.
    """
    phonemes = process_utterance(buckw)
    return phonemes


def phonemes_to_tokens(phonemes: str, append_space=True):
    """Convert a whitespace-separated phoneme string into a token list.

    Processing steps:
      1. Every occurrence of the substring ``"sil"`` is removed and every
         ``"+"`` is rewritten to ``"_+_"`` before splitting on whitespace.
      2. A two-character phoneme made of the same character twice, that is
         not a known vowel, is emitted as the single character followed by
         ``DOUBLING_TOKEN``.
      3. Vowel variants are collapsed through ``vowel_map``.
      4. ``SEPARATOR_TOKEN`` is appended when ``append_space`` is true, and
         ``EOS_TOKEN`` is always appended last.

    Args:
        phonemes: Phoneme string with tokens separated by whitespace.
        append_space: Whether to add ``SEPARATOR_TOKEN`` before the final
            ``EOS_TOKEN``.

    Returns:
        A new list of token strings.
    """
    raw_phonemes = phonemes \
        .replace("sil", "") \
        .replace("+", "_+_") \
        .split()

    # Build a fresh list instead of inserting into the list being iterated:
    # that avoids re-visiting inserted DOUBLING_TOKEN entries and the O(n)
    # cost of each list.insert.
    tokens = []
    for phon in raw_phonemes:
        # The keys of vowel_map are exactly the entries of `vowels`, so the
        # dict gives the same membership answer with an O(1) lookup.
        is_doubled = (len(phon) == 2
                      and phon[0] == phon[1]
                      and phon not in vowel_map)
        if is_doubled:
            phon = phon[0]
        # Vowel variants are simplified; anything else passes through as is.
        tokens.append(vowel_map.get(phon, phon))
        if is_doubled:
            tokens.append(DOUBLING_TOKEN)

    if append_space:
        tokens.append(SEPARATOR_TOKEN)

    tokens.append(EOS_TOKEN)

    return tokens


def buckwalter_to_tokens(buckw, append_space=True):
    """Turn Buckwalter-transliterated text into a token list.

    Runs ``buckwalter_to_phonemes`` on ``buckw`` and passes the result to
    ``phonemes_to_tokens``; ``append_space`` is forwarded to the latter.
    """
    return phonemes_to_tokens(
        buckwalter_to_phonemes(buckw),
        append_space=append_space,
    )


def arabic_to_tokens(arabic, append_space=True):
    """Turn Arabic-script text into a token list.

    The text is transliterated with ``arabic_to_buckwalter`` and then
    tokenised by ``buckwalter_to_tokens``; ``append_space`` is forwarded.
    """
    return buckwalter_to_tokens(
        arabic_to_buckwalter(arabic),
        append_space=append_space,
    )


def simplify_phonemes(phonemes):
    """Collapse vowel variants in a phoneme string to their simple form.

    Each key of ``vowel_map`` is substituted by its mapped value using plain
    substring replacement, applied one key after another in the dictionary's
    iteration order.

    Args:
        phonemes: Phoneme string to simplify.

    Returns:
        The string after all replacements have been applied.
    """
    simplified = phonemes
    for variant in vowel_map:
        simplified = simplified.replace(variant, vowel_map[variant])
    return simplified