# -*- coding: utf-8 -*- import re from os.path import join, abspath, dirname from collections import defaultdict import epitran epi = epitran.Epitran("deu-Latn-nar") def mode_type(mode_in): """In the case of "sql", this will return an sqlite cursor.""" if mode_in.lower() == "sql": import sqlite3 conn = sqlite3.connect(join(abspath(dirname(__file__)), "./Resources/de.db")) return conn.cursor() #TESTS #NUMBERS ARE TOO HARD! def preprocess(words): """Returns a string of words stripped of punctuation""" punct_str = '!"#$%&\'()*+,-./:;<=>/?@[\\]^_`{|}~«» ' return ' '.join([w.strip(punct_str).lower() for w in words.split()]) def preserve_punc(words): """converts words to IPA and finds punctuation before and after the word.""" words_preserved = [] for w in words.split(): punct_list = ["", preprocess(w), ""] before = re.search("^([^A-Za-z0-9]+)[A-Za-z]", w) after = re.search("[A-Za-z]([^A-Za-z0-9]+)$", w) if before: punct_list[0] = str(before.group(1)) if after: punct_list[2] = str(after.group(1)) words_preserved.append(punct_list) return words_preserved def apply_punct(triple, as_str=False): """places surrounding punctuation back on center on a list of preserve_punc triples""" if type(triple[0]) == list: for i, t in enumerate(triple): triple[i] = str(''.join(triple[i])) if as_str: return ' '.join(triple) return triple if as_str: return str(''.join(t for t in triple)) return [''.join(t for t in triple)] def _punct_replace_word(original, transcription): """Get the IPA transcription of word with the original punctuation marks""" for i, trans_list in enumerate(transcription): for j, item in enumerate(trans_list): triple = [original[i][0]] + [item] + [original[i][2]] transcription[i][j] = apply_punct(triple, as_str=True) return transcription def fetch_words(words_in, db_type="sql"): """fetches a list of words from the database""" asset = mode_type(db_type) f_result = [] if db_type.lower() == "sql": for word in words_in: asset.execute("SELECT Words, phonemes FROM De_words WHERE Words IN (?)", (word,)) result = asset.fetchall() flag = True try: f_result.append(result.pop()) flag = False except IndexError: pass if result == [] and flag is True: result = epi.transliterate(word) f_result.append((word, result)) f_result = list(filter(None,f_result)) f_set = set(f_result) d = defaultdict(list) for k, v in f_set: d[k].append(v) return list(d.items()) def get_deu(tokens_in, db_type="sql"): """query the SQL database for the words and return the phonemes in the order of user_in""" result = fetch_words(tokens_in, db_type) ordered = [] for word in tokens_in: this_word = [[i[1] for i in result if i[0] == word]][0] if this_word: ordered.append(this_word[0]) else: ordered.append(["__IGNORE__" + word]) return ordered def deu_to_ipa(deu_list, mark=True): """converts the deu word lists into IPA transcriptions""" symbols = {} ipa_list = [] # the final list of IPA tokens to be returned for word_list in deu_list: ipa_word_list = [] # the word list for each word for word in word_list: if re.sub("\d*", "", word.replace("__IGNORE__", "")) == "": pass # do not delete token if it's all numbers else: word = re.sub("[0-9]", "", word) ipa_form = '' if word.startswith("__IGNORE__"): ipa_form = word.replace("__IGNORE__", "") # mark words we couldn't transliterate with an asterisk: if mark: if not re.sub("\d*", "", ipa_form) == "": ipa_form += "*" else: for piece in word.split(" "): marked = False unmarked = piece if piece[0] in ["ˈ", "ˌ"] or piece[0] is None: marked = True mark = piece unmarked = piece[1:] if unmarked in symbols: if marked: ipa_form += mark + symbols[unmarked] else: ipa_form += symbols[unmarked] else: ipa_form += piece swap_list = [["ˈər", "əˈr"], ["ˈie", "iˈe"]] for sym in swap_list: if not ipa_form.startswith(sym[0]): ipa_form = ipa_form.replace(sym[0], sym[1]) ipa_word_list.append(ipa_form) ipa_list.append(sorted(list(set(ipa_word_list)))) return ipa_list def get_top(ipa_list): """Returns only the one result for a query. If multiple entries for words are found, only the first is used.""" return ' '.join([word_list[-1] for word_list in ipa_list]) def get_all(ipa_list): """utilizes an algorithm to discover and return all possible combinations of IPA transcriptions""" final_size = 1 for word_list in ipa_list: final_size *= len(word_list) list_all = ["" for s in range(final_size)] for i in range(len(ipa_list)): if i == 0: swtich_rate = final_size / len(ipa_list[i]) else: swtich_rate /= len(ipa_list[i]) k = 0 for j in range(final_size): if (j+1) % int(swtich_rate) == 0: k += 1 if k == len(ipa_list[i]): k = 0 list_all[j] = list_all[j] + ipa_list[i][k] + " " return sorted([sent[:-1] for sent in list_all]) def ipa_list(words_in, keep_punct=True, db_type="sql"): """Returns a list of all the discovered IPA transcriptions for each word.""" if type(words_in) == str: words = [preserve_punc(w.lower())[0] for w in words_in.split()] else: words = [preserve_punc(w.lower())[0] for w in words_in] deu = get_deu([w[1] for w in words], db_type=db_type) ipa = deu_to_ipa(deu) if keep_punct: ipa = _punct_replace_word(words, ipa) return ipa def isin_deu(word, db_type="sql"): """checks if a word is in the deu dictionary. Doesn't strip punctuation. If given more than one word, returns True only if all words are present.""" if type(word) == str: word = [preprocess(w) for w in word.split()] results = fetch_words(word, db_type) as_set = list(set(t[0] for t in results)) return len(as_set) == len(set(word)) def replace_number(text): text = text.replace("1","eins ") text = text.replace("2","zwei ") text = text.replace("3","drei ") text = text.replace("4","vier ") text = text.replace("5","fünf ") text = text.replace("6","sechs ") text = text.replace("7","sieben ") text = text.replace("8","acht ") text = text.replace("9","neun ") text = text.replace("0","null ") return text def convert(text, retrieve_all=False, keep_punct=True, mode="sql"): """takes either a string or list of German words and converts them to IPA""" text = replace_number(text) ipa = ipa_list( words_in=text, keep_punct=keep_punct, db_type=mode) if retrieve_all: return get_all(ipa) return get_top(ipa) _decimal_number_re = re.compile(r'\d+\,\d+') _euros_pre = re.compile(r'€([0-9\,]*[0-9]+)') _euros_re = re.compile(r'([0-9\,]*[0-9]+)€') _ordinal_re = re.compile(r'(der |die |das )([0-9]+)\.') _clock_re=re.compile(r'\d{1,2}\:\d{2}') _number_re = re.compile(r'[0-9]+') def base(text): text = text.replace("1", "eins ") text = text.replace("2", "zwei ") text = text.replace("3", "drei ") text = text.replace("4", "vier ") text = text.replace("5", "fünf ") text = text.replace("6", "sechs ") text = text.replace("7", "sieben ") text = text.replace("8", "acht ") text = text.replace("9", "neun ") text = text.replace("0", "null ") return text def tens_to_word(num): tens = num[0] ones = num[1] ones_word = base(ones) if num =="10": return "zehn" elif num=="11": return "elf" elif num=="12": return "zwölf" if tens == "1": if ones == "6": ones_word = ones_word[:-1] elif ones == "7": ones_word = ones_word[:-2] return ones_word + "zehn" else: tens_word = base(tens) if ones == "1": ones_word = ones_word[:-1] if tens == "2": tens_word = "zwan" elif tens == "6": tens_word = tens_word[:-1] elif tens == "7": tens_word = tens_word[:-2] if tens == "3": tens_word += "ßig" else: tens_word += "zig" if ones == "0": return tens_word else: return ones_word + " und " + tens_word def huns_to_word(num): huns = num[0] tens = num[1] if huns == "1": huns_word= "hundert" else: huns_word = base(huns)+" hundert" remain = num_to_word(num[1:]) if remain != "": remain = " " + remain return huns_word + remain def thos_to_word(num): thos = num[0] if thos == "1": thos_word= "tausend" else: thos_word = base(thos)+" tausend" remain=num_to_word(num[1:]) if remain!="": remain=" "+remain return thos_word+remain def num_to_word(num): num=num.lstrip("0") if num=="": return("") digit=len(num) if digit==1: return base(num) elif digit==2: return tens_to_word(num) elif digit == 3: return huns_to_word(num) elif digit == 4: return thos_to_word(num) else: return base(num) def number_to_words(m): m=m.group(0).lstrip("0") if m=="": return"null" return num_to_word(m) def _expand_ordinal(m): pre=m.group(1) m = m.group(2).lstrip("0") if m=="": return"NULL" num=int(m) if num<=19 & num>=1: if num ==1: return "erste" elif num==3: return "dritte" elif num==7: return "siebte" elif num==8: return "achte" else: return pre + num_to_word(m) + "te" else: return pre + num_to_word(m) + "ste" def _expand_decimal(m): match=m.group(0) parts = match.split(',') if int(parts[0])==0: return '%s komma %s' % ("null", base(parts[1])) return '%s komma %s' % (num_to_word(parts[0]),base(parts[1])) def _expand_euros(m): match = m.group(1) parts = match.split(',') if len(parts) > 2: return match + ' euro' # Unexpected format euros = int(parts[0]) if parts[0] else 0 cents = int(parts[1])*10 if len(parts) > 1 and parts[1] else 0 if euros and cents: return '%s euro %s' % (euros, cents) elif euros: return '%s euro' % (euros) elif cents: return '%s cent' % (cents) else: return 'null euro' def _expand_clock(m): match = m.group(0) parts = match.split(':') if int(parts[0]) == 0: return '%s Uhr %s' % ("null",num_to_word(parts[1])) elif int(parts[0]) == 1: return '%s Uhr %s' % ("ein", num_to_word(parts[1])) return '%s Uhr %s' % (num_to_word(parts[0]),num_to_word(parts[1])) def normalize_numbers(text): text = re.sub(_euros_pre, _expand_euros, text) text = re.sub(_euros_re, _expand_euros, text) text = re.sub(_clock_re, _expand_clock, text) text = re.sub(_decimal_number_re, _expand_decimal, text) text = re.sub(_ordinal_re, _expand_ordinal, text) text = re.sub(_number_re, number_to_words, text) text=text.replace(" "," ") return text def collapse_whitespace(text): return re.sub(r'\s+', ' ', text) def mark_dark_l(text): return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text)