Spaces:
Build error
Build error
# -*- coding: utf-8 -*- | |
import re | |
from os.path import join, abspath, dirname | |
from collections import defaultdict | |
import epitran | |
# Module-wide Epitran instance created with the "deu-Latn-nar" code.
# fetch_words() calls epi.transliterate() for words its database query
# returns no row for.
epi = epitran.Epitran("deu-Latn-nar")
def mode_type(mode_in):
    """Return the lookup asset for the requested mode.

    For the mode "sql" (compared case-insensitively) a connection to
    ``Resources/de.db`` next to this file is opened and a cursor on that
    connection is returned.  Every other mode yields ``None``.
    """
    if mode_in.lower() != "sql":
        return None
    # Imported here so sqlite3 is only loaded when the SQL mode is used.
    import sqlite3
    db_path = join(abspath(dirname(__file__)), "./Resources/de.db")
    connection = sqlite3.connect(db_path)
    return connection.cursor()
#TESTS | |
#NUMBERS ARE TOO HARD! | |
def preprocess(words):
    """Lower-case each whitespace-separated token and trim its outer punctuation.

    Only characters at the start and end of a token are removed; punctuation
    inside a token is kept.  The cleaned tokens are joined with single spaces.
    """
    strip_chars = '!"#$%&\'()*+,-./:;<=>/?@[\\]^_`{|}~«» '
    cleaned = []
    for token in words.split():
        cleaned.append(token.strip(strip_chars).lower())
    return ' '.join(cleaned)
def preserve_punc(words):
    """Split *words* on whitespace and record the punctuation around each token.

    Returns a list with one ``[before, word, after]`` triple per token:

    * ``before`` - the run of non-alphanumeric characters at the start of the
      token, if that run is directly followed by a letter (else ``""``);
    * ``word``   - the token as cleaned by ``preprocess``;
    * ``after``  - the run of non-alphanumeric characters at the end of the
      token, if that run directly follows a letter (else ``""``).

    "Letter" and "alphanumeric" are Unicode-aware, so German letters such as
    ä, ö, ü and ß count as part of the word rather than as punctuation.
    """
    # [\W_]   : anything that is not a Unicode letter or digit
    # [^\W\d_]: a Unicode letter (word character minus digits and underscore)
    # For pure-ASCII tokens these match exactly what [^A-Za-z0-9] / [A-Za-z] do.
    leading_re = r"^([\W_]+)[^\W\d_]"
    trailing_re = r"[^\W\d_]([\W_]+)$"
    words_preserved = []
    for w in words.split():
        punct_list = ["", preprocess(w), ""]
        before = re.search(leading_re, w)
        after = re.search(trailing_re, w)
        if before:
            punct_list[0] = str(before.group(1))
        if after:
            punct_list[2] = str(after.group(1))
        words_preserved.append(punct_list)
    return words_preserved
def apply_punct(triple, as_str=False):
    """Glue the parts of preserve_punc-style triples back together.

    ``triple`` is either a single ``[before, word, after]`` list of strings or
    a list of such lists (detected by the type of its first element).

    * List of triples: every element is replaced in place by its joined
      string; the same list is returned, or its space-joined string when
      ``as_str`` is true.
    * Single triple: the joined string is returned when ``as_str`` is true,
      otherwise a one-element list containing it.
    """
    is_nested = type(triple[0]) == list
    if not is_nested:
        glued = ''.join(triple)
        return glued if as_str else [glued]
    for idx in range(len(triple)):
        triple[idx] = ''.join(triple[idx])
    return ' '.join(triple) if as_str else triple
def _punct_replace_word(original, transcription):
    """Wrap every transcription in the punctuation of its source word.

    ``original[i]`` is the ``[before, word, after]`` triple belonging to the
    candidate list ``transcription[i]``.  Each candidate is replaced in place
    by ``before + candidate + after`` (built via ``apply_punct``) and the
    mutated ``transcription`` is returned.
    """
    for row, candidates in enumerate(transcription):
        for col, ipa in enumerate(candidates):
            surround = original[row]
            pieces = [surround[0], ipa, surround[2]]
            candidates[col] = apply_punct(pieces, as_str=True)
    return transcription
def fetch_words(words_in, db_type="sql"):
    """Look up the phonemes for each word, transliterating words with no row.

    Every distinct word in ``words_in`` is queried in the ``De_words`` table.
    When rows come back, the last one is used; when none come back, the word
    is transliterated with the module-level Epitran instance ``epi`` instead.

    Args:
        words_in: iterable of word strings.
        db_type: lookup backend; only "sql" (case-insensitive) is implemented.

    Returns:
        A list of ``(word, [phonemes, ...])`` tuples in first-seen order.
        For any other ``db_type`` an empty list is returned, because callers
        iterate over the result.
    """
    if db_type.lower() != "sql":
        return []
    cursor = mode_type(db_type)
    phonemes_by_word = {}
    seen = set()
    try:
        for word in words_in:
            if word in seen:
                # A repeated word would only reproduce the same entry.
                continue
            seen.add(word)
            cursor.execute(
                "SELECT Words, phonemes FROM De_words WHERE Words IN (?)",
                (word,))
            rows = cursor.fetchall()
            if rows:
                key, phonemes = rows[-1]
            else:
                key, phonemes = word, epi.transliterate(word)
            variants = phonemes_by_word.setdefault(key, [])
            if phonemes not in variants:
                variants.append(phonemes)
    finally:
        # mode_type() opens a fresh connection on every call and hands back
        # only the cursor, so this is the one place that can release it.
        cursor.connection.close()
    return list(phonemes_by_word.items())
def get_deu(tokens_in, db_type="sql"):
    """Return the phoneme lists for ``tokens_in`` in the order of the tokens.

    The tokens are looked up with ``fetch_words``.  For each token the
    phoneme list of the first matching result entry is used; a token without
    any entry is represented as ``["__IGNORE__" + token]``.
    """
    fetched = fetch_words(tokens_in, db_type)
    ordered = []
    for token in tokens_in:
        matches = [entry[1] for entry in fetched if entry[0] == token]
        ordered.append(matches[0] if matches else ["__IGNORE__" + token])
    return ordered
def deu_to_ipa(deu_list, mark=True):
    """Convert per-word phoneme candidate lists into IPA candidate lists.

    Args:
        deu_list: list with one list of candidate strings per word.  A
            candidate starting with ``"__IGNORE__"`` is a word that could not
            be looked up; the prefix is removed and the word passed through.
        mark: when true, passed-through words that are not purely numeric get
            a trailing ``"*"``.

    Returns:
        One sorted, de-duplicated list of IPA strings per input word list.
    """
    symbols = {}  # phoneme -> IPA overrides; empty, so pieces pass through as-is
    stress_marks = ("ˈ", "ˌ")
    swap_list = [["ˈər", "əˈr"], ["ˈie", "iˈe"]]
    results = []  # the final list of IPA tokens to be returned
    for word_list in deu_list:
        ipa_word_list = []  # the candidates for this word
        for word in word_list:
            # Strip digits, except when the token consists of digits only.
            if re.sub(r"\d*", "", word.replace("__IGNORE__", "")) != "":
                word = re.sub("[0-9]", "", word)
            ipa_form = ''
            if word.startswith("__IGNORE__"):
                ipa_form = word.replace("__IGNORE__", "")
                # mark words we couldn't transliterate with an asterisk:
                if mark and re.sub(r"\d*", "", ipa_form) != "":
                    ipa_form += "*"
            else:
                for piece in word.split(" "):
                    if not piece:
                        # Empty pieces come from empty words or repeated
                        # spaces and have no first character to inspect.
                        continue
                    # Kept in its own local so the ``mark`` argument is not
                    # clobbered for the words that follow.
                    stress = ""
                    unmarked = piece
                    if piece[0] in stress_marks:
                        stress = piece[0]
                        unmarked = piece[1:]
                    if unmarked in symbols:
                        ipa_form += stress + symbols[unmarked]
                    else:
                        ipa_form += piece
            # Move the stress mark behind the vowel, unless the sequence
            # opens the word.
            for old, new in swap_list:
                if not ipa_form.startswith(old):
                    ipa_form = ipa_form.replace(old, new)
            ipa_word_list.append(ipa_form)
        results.append(sorted(set(ipa_word_list)))
    return results
def get_top(ipa_list):
    """Return one transcription per word, joined by single spaces.

    When a word has several candidates, the last entry of its list is used.
    """
    chosen = []
    for candidates in ipa_list:
        chosen.append(candidates[-1])
    return ' '.join(chosen)
def get_all(ipa_list):
    """Return every combination of the per-word transcriptions.

    Args:
        ipa_list: list with one list of candidate strings per word.

    Returns:
        A sorted list of sentences, each built by picking one candidate per
        word and joining the picks with single spaces.  An empty ``ipa_list``
        gives ``[""]``; if any word has no candidates there is no complete
        sentence and the result is ``[]``.
    """
    from itertools import product

    return sorted(' '.join(combo) for combo in product(*ipa_list))
def ipa_list(words_in, keep_punct=True, db_type="sql"):
    """Return the list of IPA candidates found for each word.

    ``words_in`` is either a string, which is split on whitespace, or an
    iterable of word strings.  Each word is lower-cased and run through
    ``preserve_punc`` (only its first triple is kept), looked up with
    ``get_deu`` and converted with ``deu_to_ipa``.  When ``keep_punct`` is
    true the surrounding punctuation is put back on every candidate.
    """
    tokens = words_in.split() if type(words_in) == str else words_in
    words = []
    for token in tokens:
        words.append(preserve_punc(token.lower())[0])
    bare_words = [triple[1] for triple in words]
    ipa = deu_to_ipa(get_deu(bare_words, db_type=db_type))
    if not keep_punct:
        return ipa
    return _punct_replace_word(words, ipa)
def isin_deu(word, db_type="sql"):
    """Report whether ``fetch_words`` returns an entry for every given word.

    A string is split on whitespace and each piece is cleaned with
    ``preprocess``; any other value is passed to ``fetch_words`` unchanged.
    The result is True when the number of distinct words returned equals the
    number of distinct words asked for.

    NOTE(review): fetch_words in this file also returns entries for words it
    had to transliterate, so True may not mean "present in the database" -
    confirm the intended meaning.
    """
    if type(word) == str:
        word = [preprocess(piece) for piece in word.split()]
    results = fetch_words(word, db_type)
    found = {entry[0] for entry in results}
    return len(found) == len(set(word))
def replace_number(text):
    """Replace each digit in *text* with its German word followed by a space.

    Digits are handled one at a time, so "20" becomes "zwei null ".
    """
    digit_words = (
        ("1", "eins "),
        ("2", "zwei "),
        ("3", "drei "),
        ("4", "vier "),
        ("5", "fünf "),
        ("6", "sechs "),
        ("7", "sieben "),
        ("8", "acht "),
        ("9", "neun "),
        ("0", "null "),
    )
    for digit, spoken in digit_words:
        text = text.replace(digit, spoken)
    return text
def convert(text, retrieve_all=False, keep_punct=True, mode="sql"):
    """Convert German text to IPA.

    Args:
        text: a string, or an iterable of word strings (joined with spaces
            before processing).
        retrieve_all: when true, return every combination of candidate
            transcriptions (see ``get_all``) instead of a single string.
        keep_punct: when true, punctuation around words is kept.
        mode: lookup backend handed to ``ipa_list``.

    Returns:
        A single transcription string, or a list of strings when
        ``retrieve_all`` is true.  Digits are spelled out one by one with
        ``replace_number`` before the lookup.
    """
    if not isinstance(text, str):
        # replace_number() needs a string; a word list is equivalent to the
        # same words separated by spaces.
        text = " ".join(text)
    text = replace_number(text)
    ipa = ipa_list(
        words_in=text,
        keep_punct=keep_punct,
        db_type=mode)
    if retrieve_all:
        return get_all(ipa)
    return get_top(ipa)
# Digits, a comma, digits - e.g. "3,14".
_decimal_number_re = re.compile(r'\d+\,\d+')
# Euro sign in front of an amount ("€12,50"); group 1 is the amount.
_euros_pre = re.compile(r'€([0-9\,]*[0-9]+)')
# Euro sign behind an amount ("12,50€"); group 1 is the amount.
_euros_re = re.compile(r'([0-9\,]*[0-9]+)€')
# "der ", "die " or "das " followed by digits and a full stop;
# group 1 is the article (with its space), group 2 the digits.
_ordinal_re = re.compile(r'(der |die |das )([0-9]+)\.')
# One or two digits, a colon, two digits - e.g. "7:30".
_clock_re=re.compile(r'\d{1,2}\:\d{2}')
# Any run of ASCII digits.
_number_re = re.compile(r'[0-9]+')
def base(text):
    """Spell out every digit of *text* as a German word plus a trailing space.

    Each digit is translated on its own ("10" -> "eins null "); all other
    characters are left untouched.
    """
    digit_table = str.maketrans({
        "1": "eins ",
        "2": "zwei ",
        "3": "drei ",
        "4": "vier ",
        "5": "fünf ",
        "6": "sechs ",
        "7": "sieben ",
        "8": "acht ",
        "9": "neun ",
        "0": "null ",
    })
    return text.translate(digit_table)
def tens_to_word(num):
    """Spell out a two-digit number string in German.

    Args:
        num: string of exactly two digits whose first digit is not "0"
            (``num_to_word`` strips leading zeros before dispatching here).

    Returns:
        The number in words, e.g. "16" -> "sechzehn", "30" -> "dreißig",
        "21" -> "ein und zwanzig".
    """
    irregular = {"10": "zehn", "11": "elf", "12": "zwölf"}
    if num in irregular:
        return irregular[num]
    tens, ones = num[0], num[1]
    # base() appends a space to every digit word.  Strip it so the stem
    # slices below cut letters instead of that space.
    ones_word = base(ones).strip()
    if tens == "1":
        if ones == "6":
            ones_word = ones_word[:-1]  # sechs -> sech(zehn)
        elif ones == "7":
            ones_word = ones_word[:-2]  # sieben -> sieb(zehn)
        return ones_word + "zehn"
    tens_word = base(tens).strip()
    if ones == "1":
        ones_word = ones_word[:-1]  # eins -> ein (und ...)
    if tens == "2":
        tens_word = "zwan"
    elif tens == "6":
        tens_word = tens_word[:-1]  # sechs -> sech(zig)
    elif tens == "7":
        tens_word = tens_word[:-2]  # sieben -> sieb(zig)
    tens_word += "ßig" if tens == "3" else "zig"
    if ones == "0":
        return tens_word
    return ones_word + " und " + tens_word
def huns_to_word(num):
    """Spell out a three-digit number string in German.

    The hundreds digit becomes "hundert" (for 1) or "<digit> hundert"; the
    remaining two digits are spelled by ``num_to_word`` and appended after a
    space when they are not all zeros.
    """
    huns = num[0]
    if huns == "1":
        huns_word = "hundert"
    else:
        # base() ends its word with a space; strip it to avoid a double space.
        huns_word = base(huns).strip() + " hundert"
    remain = num_to_word(num[1:])
    if remain != "":
        remain = " " + remain
    return huns_word + remain
def thos_to_word(num):
    """Spell out a four-digit number string in German.

    The thousands digit becomes "tausend" (for 1) or "<digit> tausend"; the
    remaining three digits are spelled by ``num_to_word`` and appended after
    a space when they are not all zeros.
    """
    thos = num[0]
    if thos == "1":
        thos_word = "tausend"
    else:
        # base() ends its word with a space; strip it to avoid a double space.
        thos_word = base(thos).strip() + " tausend"
    remain = num_to_word(num[1:])
    if remain != "":
        remain = " " + remain
    return thos_word + remain
def num_to_word(num):
    """Spell out a string of digits in German.

    Leading zeros are dropped first; if nothing is left the result is "".
    Two-, three- and four-digit numbers go to ``tens_to_word``,
    ``huns_to_word`` and ``thos_to_word``; every other length is spelled
    digit by digit with ``base``.
    """
    num = num.lstrip("0")
    if not num:
        return ""
    length = len(num)
    if length == 2:
        speller = tens_to_word
    elif length == 3:
        speller = huns_to_word
    elif length == 4:
        speller = thos_to_word
    else:
        speller = base
    return speller(num)
def number_to_words(m):
    """Regex callback: spell out the digits matched by *m*.

    A match made up only of zeros gives "null"; otherwise the digits, without
    leading zeros, are handed to ``num_to_word``.
    """
    digits = m.group(0).lstrip("0")
    return num_to_word(digits) if digits else "null"
def _expand_ordinal(m):
    """Regex callback: spell out an ordinal such as "der 3." in German.

    Args:
        m: match whose group 1 is the article (including its trailing space)
            and whose group 2 is the digits; the matched full stop is dropped.

    Returns:
        The article followed by the ordinal word.  1, 3, 7 and 8 use the
        irregular forms "erste", "dritte", "siebte", "achte"; other numbers
        up to 19 take the suffix "te" and larger ones "ste".  Zero gives
        "nullte".
    """
    pre = m.group(1)
    digits = m.group(2).lstrip("0")
    if digits == "":
        return pre + "nullte"
    irregular = {1: "erste", 3: "dritte", 7: "siebte", 8: "achte"}
    num = int(digits)
    if num in irregular:
        return pre + irregular[num]
    # A chained comparison is used on purpose: "num<=19 & num>=1" would bind
    # "&" first and send most numbers to the wrong branch.
    suffix = "te" if 1 <= num <= 19 else "ste"
    # num_to_word() may end with a space; strip it so the suffix attaches.
    return pre + num_to_word(digits).strip() + suffix
def _expand_decimal(m):
    """Regex callback: spell out a comma decimal as "<whole> komma <digits>".

    The part before the comma is "null" when it equals zero and otherwise
    comes from ``num_to_word``; the part after the comma is spelled digit by
    digit with ``base``.
    """
    parts = m.group(0).split(',')
    if int(parts[0]) == 0:
        whole_words = "null"
    else:
        whole_words = num_to_word(parts[0])
    return '%s komma %s' % (whole_words, base(parts[1]))
def _expand_euros(m):
    """Regex callback: rewrite a euro amount as "<euros> euro <cents>".

    Args:
        m: match whose group 1 is the amount, with an optional comma before
            the cent digits.

    Returns:
        "<e> euro <c>", "<e> euro", "<c> cent" or "null euro", depending on
        which parts are non-zero.  The numbers stay as digits; in
        ``normalize_numbers`` the later ``_number_re`` pass spells them out.
        An amount with more than one comma is returned unchanged with
        " euro" appended.
    """
    match = m.group(1)
    parts = match.split(',')
    if len(parts) > 2:
        return match + ' euro'  # Unexpected format
    euros = int(parts[0]) if parts[0] else 0
    cents = 0
    if len(parts) > 1 and parts[1]:
        # The first decimal digit counts tens of cents and the second single
        # cents, so "3,5" and "3,50" both mean 50 cents.  Multiplying the
        # whole fraction by ten would turn "3,50" into 500.
        cents = int(parts[1][:2].ljust(2, "0"))
    if euros and cents:
        return '%s euro %s' % (euros, cents)
    elif euros:
        return '%s euro' % (euros)
    elif cents:
        return '%s cent' % (cents)
    else:
        return 'null euro'
def _expand_clock(m):
    """Regex callback: spell out a clock time as "<hour> Uhr <minutes>".

    Hour 0 is written "null" and hour 1 "ein"; every other hour and the
    minutes are spelled with ``num_to_word``.
    """
    parts = m.group(0).split(':')
    hour = int(parts[0])
    if hour == 0:
        hour_word = "null"
    elif hour == 1:
        hour_word = "ein"
    else:
        hour_word = num_to_word(parts[0])
    return '%s Uhr %s' % (hour_word, num_to_word(parts[1]))
def normalize_numbers(text):
    """Spell out the numeric expressions in *text* as German words.

    The passes run from most to least specific so that a later, more general
    pattern does not consume part of an earlier one: euro amounts, clock
    times, comma decimals, ordinals after an article, then any remaining run
    of digits.  Finally, runs of two or more spaces are collapsed to one.
    """
    text = re.sub(_euros_pre, _expand_euros, text)
    text = re.sub(_euros_re, _expand_euros, text)
    text = re.sub(_clock_re, _expand_clock, text)
    text = re.sub(_decimal_number_re, _expand_decimal, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, number_to_words, text)
    # The number helpers can emit doubled spaces; replacing a single space
    # with a single space would leave them in place.
    text = re.sub(r' {2,}', ' ', text)
    return text
def collapse_whitespace(text):
    """Replace every run of whitespace in *text* with a single space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)
def mark_dark_l(text):
    """Replace "l" with "ɫ" where no listed vowel follows it within the word.

    An "l" is rewritten when everything between it and the next space (or
    the end of the text) is free of the vowels in the pattern's character
    class; the characters after the "l" are kept as they are.
    """
    pattern = re.compile(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))')
    return pattern.sub('ɫ\\g<1>', text)