Spaces:

CarlDennis
/

HYTTS

Build error

File size: 12,346 Bytes

cbf648c

# -*- coding: utf-8 -*-
import re
from os.path import join, abspath, dirname
from collections import defaultdict
import epitran

epi = epitran.Epitran("deu-Latn-nar")


def mode_type(mode_in):
    """In the case of "sql", this will return an sqlite cursor."""
    if mode_in.lower() == "sql":
        import sqlite3
        conn = sqlite3.connect(join(abspath(dirname(__file__)), "./Resources/de.db"))
        return conn.cursor()


#TESTS
#NUMBERS ARE TOO HARD!



def preprocess(words):
    """Returns a string of words stripped of punctuation"""
    punct_str = '!"#$%&\'()*+,-./:;<=>/?@[\\]^_`{|}~«» '
    return ' '.join([w.strip(punct_str).lower() for w in words.split()])


def preserve_punc(words):
    """converts words to IPA and finds punctuation before and after the word."""
    words_preserved = []
    for w in words.split():
        punct_list = ["", preprocess(w), ""]
        before = re.search("^([^A-Za-z0-9]+)[A-Za-z]", w)
        after = re.search("[A-Za-z]([^A-Za-z0-9]+)$", w)
        if before:
            punct_list[0] = str(before.group(1))
        if after:
            punct_list[2] = str(after.group(1))
        words_preserved.append(punct_list)
    return words_preserved



def apply_punct(triple, as_str=False):
    """places surrounding punctuation back on center on a list of preserve_punc triples"""
    if type(triple[0]) == list:
        for i, t in enumerate(triple):
            triple[i] = str(''.join(triple[i]))
        if as_str:
            return ' '.join(triple)
        return triple
    if as_str:
        return str(''.join(t for t in triple))
    return [''.join(t for t in triple)]


def _punct_replace_word(original, transcription):
    """Get the IPA transcription of word with the original punctuation marks"""
    for i, trans_list in enumerate(transcription):
        for j, item in enumerate(trans_list):
            triple = [original[i][0]] + [item] + [original[i][2]]
            transcription[i][j] = apply_punct(triple, as_str=True)
    return transcription


def fetch_words(words_in, db_type="sql"):
    """fetches a list of words from the database"""
    asset = mode_type(db_type)
    f_result = []
    if db_type.lower() == "sql":
        for word in words_in:
            asset.execute("SELECT Words, phonemes FROM De_words WHERE Words IN (?)", (word,))
            result = asset.fetchall()
            flag = True
            try:
                f_result.append(result.pop())
                flag = False
            except IndexError:
                pass
            if result == [] and flag is True:
                result = epi.transliterate(word)
                f_result.append((word, result))
        f_result = list(filter(None,f_result))
        f_set = set(f_result)
        d = defaultdict(list)
        for k, v in f_set:
            d[k].append(v)
        return list(d.items())

def get_deu(tokens_in, db_type="sql"):
    """query the SQL database for the words and return the phonemes in the order of user_in"""
    result = fetch_words(tokens_in, db_type)
    ordered = []
    for word in tokens_in:
        this_word = [[i[1] for i in result if i[0] == word]][0]
        if this_word:
            ordered.append(this_word[0])
        else:
            ordered.append(["__IGNORE__" + word])
    return ordered


def deu_to_ipa(deu_list, mark=True):
    """converts the deu word lists into IPA transcriptions"""
    symbols = {}
    ipa_list = []  # the final list of IPA tokens to be returned
    for word_list in deu_list:
        ipa_word_list = []  # the word list for each word
        for word in word_list:
            if re.sub("\d*", "", word.replace("__IGNORE__", "")) == "":
                    pass  # do not delete token if it's all numbers
            else:
                    word = re.sub("[0-9]", "", word)
            ipa_form = ''
            if word.startswith("__IGNORE__"):
                ipa_form = word.replace("__IGNORE__", "")
                # mark words we couldn't transliterate with an asterisk:

                if mark:
                    if not re.sub("\d*", "", ipa_form) == "":
                        ipa_form += "*"
            else:
                for piece in word.split(" "):
                    marked = False
                    unmarked = piece
                    if piece[0] in ["ˈ", "ˌ"] or piece[0] is None:
                        marked = True
                        mark = piece
                        unmarked = piece[1:]

                    if unmarked in symbols:
                        if marked:
                            ipa_form += mark + symbols[unmarked]
                        else:
                            ipa_form += symbols[unmarked]

                    else:
                        ipa_form += piece
            swap_list = [["ˈər", "əˈr"], ["ˈie", "iˈe"]]
            for sym in swap_list:
                if not ipa_form.startswith(sym[0]):
                    ipa_form = ipa_form.replace(sym[0], sym[1])
            ipa_word_list.append(ipa_form)
        ipa_list.append(sorted(list(set(ipa_word_list))))
    return ipa_list


def get_top(ipa_list):
    """Returns only the one result for a query. If multiple entries for words are found, only the first is used."""
    return ' '.join([word_list[-1] for word_list in ipa_list])


def get_all(ipa_list):
    """utilizes an algorithm to discover and return all possible combinations of IPA transcriptions"""
    final_size = 1
    for word_list in ipa_list:
        final_size *= len(word_list)
    list_all = ["" for s in range(final_size)]
    for i in range(len(ipa_list)):
        if i == 0:
            swtich_rate = final_size / len(ipa_list[i])
        else:
            swtich_rate /= len(ipa_list[i])
        k = 0
        for j in range(final_size):
            if (j+1) % int(swtich_rate) == 0:
                k += 1
            if k == len(ipa_list[i]):
                k = 0
            list_all[j] = list_all[j] + ipa_list[i][k] + " "
    return sorted([sent[:-1] for sent in list_all])


def ipa_list(words_in, keep_punct=True, db_type="sql"):
    """Returns a list of all the discovered IPA transcriptions for each word."""
    if type(words_in) == str:
        words = [preserve_punc(w.lower())[0] for w in words_in.split()]
    else:
        words = [preserve_punc(w.lower())[0] for w in words_in]
    deu = get_deu([w[1] for w in words], db_type=db_type)
    ipa = deu_to_ipa(deu)
    if keep_punct:
        ipa = _punct_replace_word(words, ipa)
    return ipa


def isin_deu(word, db_type="sql"):
    """checks if a word is in the deu dictionary. Doesn't strip punctuation.
    If given more than one word, returns True only if all words are present."""
    if type(word) == str:
        word = [preprocess(w) for w in word.split()]
    results = fetch_words(word, db_type)
    as_set = list(set(t[0] for t in results))
    return len(as_set) == len(set(word))

def replace_number(text):
    text = text.replace("1","eins ")
    text = text.replace("2","zwei ")
    text = text.replace("3","drei ")
    text = text.replace("4","vier ")
    text = text.replace("5","fünf ")
    text = text.replace("6","sechs ")
    text = text.replace("7","sieben ")
    text = text.replace("8","acht ")
    text = text.replace("9","neun ")
    text = text.replace("0","null ")
    return text



def convert(text, retrieve_all=False, keep_punct=True, mode="sql"):
    """takes either a string or list of German words and converts them to IPA"""
    text = replace_number(text)
    ipa = ipa_list(
                   words_in=text,
                   keep_punct=keep_punct,
                   db_type=mode)
    if retrieve_all:
        return get_all(ipa)
    return get_top(ipa)



_decimal_number_re = re.compile(r'\d+\,\d+')
_euros_pre = re.compile(r'€([0-9\,]*[0-9]+)')
_euros_re = re.compile(r'([0-9\,]*[0-9]+)€')
_ordinal_re = re.compile(r'(der |die |das )([0-9]+)\.')
_clock_re=re.compile(r'\d{1,2}\:\d{2}')
_number_re = re.compile(r'[0-9]+')

def base(text):
    text = text.replace("1", "eins ")
    text = text.replace("2", "zwei ")
    text = text.replace("3", "drei ")
    text = text.replace("4", "vier ")
    text = text.replace("5", "fünf ")
    text = text.replace("6", "sechs ")
    text = text.replace("7", "sieben ")
    text = text.replace("8", "acht ")
    text = text.replace("9", "neun ")
    text = text.replace("0", "null ")
    return text

def tens_to_word(num):
    tens = num[0]
    ones = num[1]
    ones_word = base(ones)

    if num =="10":
        return "zehn"
    elif num=="11":
        return "elf"
    elif num=="12":
        return "zwölf"

    if tens == "1":
        if ones == "6":
            ones_word = ones_word[:-1]
        elif ones == "7":
            ones_word = ones_word[:-2]
        return ones_word + "zehn"
    else:
        tens_word = base(tens)
        if ones == "1":
            ones_word = ones_word[:-1]
        if tens == "2":
            tens_word = "zwan"
        elif tens == "6":
            tens_word = tens_word[:-1]
        elif tens == "7":
            tens_word = tens_word[:-2]
        if tens == "3":
            tens_word += "ßig"
        else:
            tens_word += "zig"
        if ones == "0":
            return tens_word
        else:
            return ones_word + " und " + tens_word

def huns_to_word(num):
    huns = num[0]
    tens = num[1]

    if huns == "1":
        huns_word= "hundert"
    else:
        huns_word = base(huns)+" hundert"

    remain = num_to_word(num[1:])
    if remain != "":
        remain = " " + remain
    return huns_word + remain

def thos_to_word(num):
    thos = num[0]
    if thos == "1":
        thos_word= "tausend"
    else:
        thos_word = base(thos)+" tausend"
    remain=num_to_word(num[1:])
    if remain!="":
        remain=" "+remain
    return thos_word+remain

def num_to_word(num):
    num=num.lstrip("0")
    if num=="":
        return("")
    digit=len(num)
    if digit==1:
        return base(num)
    elif digit==2:
        return tens_to_word(num)
    elif digit == 3:
        return huns_to_word(num)
    elif digit == 4:
        return thos_to_word(num)
    else:
        return base(num)

def number_to_words(m):
    m=m.group(0).lstrip("0")
    if m=="":
        return"null"
    return num_to_word(m)

def _expand_ordinal(m):

    pre=m.group(1)
    m = m.group(2).lstrip("0")

    if m=="":
        return"NULL"
    num=int(m)
    if num<=19 & num>=1:
        if num ==1:
            return "erste"
        elif num==3:
            return "dritte"
        elif num==7:
            return "siebte"
        elif num==8:
            return "achte"
        else:
            return pre + num_to_word(m) + "te"
    else:
        return pre + num_to_word(m) + "ste"

def _expand_decimal(m):
    match=m.group(0)
    parts = match.split(',')
    if int(parts[0])==0:
        return '%s komma %s' % ("null", base(parts[1]))
    return '%s komma %s' % (num_to_word(parts[0]),base(parts[1]))

def _expand_euros(m):
    match = m.group(1)
    parts = match.split(',')
    if len(parts) > 2:
        return match + ' euro'  # Unexpected format
    euros = int(parts[0]) if parts[0] else 0
    cents = int(parts[1])*10 if len(parts) > 1 and parts[1] else 0
    if euros and cents:
        return '%s euro %s' % (euros, cents)
    elif euros:
        return '%s euro' % (euros)
    elif cents:
        return '%s cent' % (cents)
    else:
        return 'null euro'

def _expand_clock(m):
    match = m.group(0)
    parts = match.split(':')
    if int(parts[0]) == 0:
        return '%s Uhr %s' % ("null",num_to_word(parts[1]))
    elif int(parts[0]) == 1:
        return '%s Uhr %s' % ("ein", num_to_word(parts[1]))
    return '%s Uhr %s' % (num_to_word(parts[0]),num_to_word(parts[1]))

def normalize_numbers(text):
    text = re.sub(_euros_pre, _expand_euros, text)
    text = re.sub(_euros_re, _expand_euros, text)
    text = re.sub(_clock_re, _expand_clock, text)
    text = re.sub(_decimal_number_re, _expand_decimal, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, number_to_words, text)
    text=text.replace("  "," ")
    return text

def collapse_whitespace(text):
    return re.sub(r'\s+', ' ', text)

def mark_dark_l(text):
    return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text)