Spaces:

CarlDennis
/

HYTTS

Build error

HYTTS / text /ger_to_ipa.py

云淡风轻

init

cbf648c over 2 years ago

12.3 kB

	# -*- coding: utf-8 -*-
	import re
	from os.path import join, abspath, dirname
	from collections import defaultdict
	import epitran

	# Shared Epitran transliterator for the "deu-Latn-nar" mapping; fetch_words
	# falls back to it for words that have no row in the database.
	epi = epitran.Epitran("deu-Latn-nar")


	def mode_type(mode_in):
	    """Return the lookup handle for the requested backend.

	    For "sql" (compared case-insensitively) this opens Resources/de.db,
	    located relative to this module, and returns a cursor on it. Any other
	    value yields None.
	    """
	    if mode_in.lower() != "sql":
	        return None
	    # Imported lazily so sqlite3 is only loaded when the SQL backend is used.
	    import sqlite3
	    db_path = join(abspath(dirname(__file__)), "./Resources/de.db")
	    connection = sqlite3.connect(db_path)
	    return connection.cursor()


	#TESTS
	#NUMBERS ARE TOO HARD!



	def preprocess(words):
	    """Return `words` lower-cased with surrounding punctuation stripped.

	    The input is split on whitespace; each token has the characters in
	    `punct_str` removed from both ends (interior characters are kept) and
	    the tokens are re-joined with single spaces.
	    """
	    # "|" is written without a backslash: "\|" is an invalid escape sequence
	    # in a normal string literal and only worked by accident (the backslash
	    # itself is already covered by "\\").
	    punct_str = '!"#$%&\'()*+,-./:;<=>/?@[\\]^_`{|}~«» '
	    return ' '.join(w.strip(punct_str).lower() for w in words.split())


	def preserve_punc(words):
	    """Split every token into [leading punctuation, bare word, trailing punctuation].

	    `words` is split on whitespace. For each token the bare word comes from
	    preprocess(); the leading/trailing entries hold the punctuation run that
	    directly precedes the first letter / follows the last letter, or "" when
	    there is none.

	    Returns:
	        A list with one three-element list per token.
	    """
	    triples = []
	    for token in words.split():
	        # Unicode-aware classes: [^\W\d_] is "any letter", [\W_] is "anything
	        # that is neither letter nor digit". The previous ASCII-only
	        # [A-Za-z] classes treated ä, ö, ü and ß as punctuation, so "über"
	        # gained a spurious leading "ü" and "Menü" a trailing one.
	        leading = re.search(r"^([\W_]+)[^\W\d_]", token)
	        trailing = re.search(r"[^\W\d_]([\W_]+)$", token)
	        triples.append([
	            leading.group(1) if leading else "",
	            preprocess(token),
	            trailing.group(1) if trailing else "",
	        ])
	    return triples



	def apply_punct(triple, as_str=False):
	    """Glue punctuation back onto words.

	    Args:
	        triple: either one [before, word, after] triple of strings, or a
	            list of such triples.
	        as_str: return a single string instead of a list.

	    Returns:
	        For a list of triples: the joined words as a list, or as one
	        space-separated string when `as_str` is true. For a single triple:
	        a one-element list, or the bare string when `as_str` is true.
	        The argument is never modified.
	    """
	    # `triple and ...` keeps an empty input on the flat path instead of
	    # raising IndexError on triple[0].
	    if triple and isinstance(triple[0], list):
	        # Build a new list; the previous version overwrote the caller's
	        # triples in place.
	        joined = [''.join(parts) for parts in triple]
	        return ' '.join(joined) if as_str else joined
	    flat = ''.join(triple)
	    return flat if as_str else [flat]


	def _punct_replace_word(original, transcription):
	    """Wrap every transcription candidate in its word's original punctuation.

	    `original[i]` supplies the leading (index 0) and trailing (index 2)
	    punctuation for all candidates in `transcription[i]`. The candidate
	    lists are updated in place and `transcription` itself is returned.
	    """
	    for word_idx, candidates in enumerate(transcription):
	        for cand_idx, ipa in enumerate(candidates):
	            surrounded = [original[word_idx][0], ipa, original[word_idx][2]]
	            candidates[cand_idx] = apply_punct(surrounded, as_str=True)
	    return transcription


	def fetch_words(words_in, db_type="sql"):
	    """Look up the phonemes for each word, falling back to Epitran.

	    For the "sql" backend every word is queried in the De_words table. When
	    rows come back, the last row is used; when none do, the word is
	    transliterated with the module-level `epi` instance instead. For any
	    other backend nothing is looked up.

	    Returns:
	        A list of (word, [phonemes, ...]) pairs, one per distinct word, in
	        first-seen order.
	    """
	    asset = mode_type(db_type)
	    found = []
	    if db_type.lower() == "sql":
	        try:
	            for word in words_in:
	                asset.execute("SELECT Words, phonemes FROM De_words WHERE Words IN (?)", (word,))
	                rows = asset.fetchall()
	                if rows:
	                    found.append(rows[-1])
	                else:
	                    found.append((word, epi.transliterate(word)))
	        finally:
	            # mode_type opens a fresh connection per call; close it here so
	            # repeated lookups do not leak database handles.
	            asset.connection.close()
	    grouped = defaultdict(list)
	    # dict.fromkeys de-duplicates like set() did but keeps a stable order.
	    for key, phonemes in dict.fromkeys(found):
	        grouped[key].append(phonemes)
	    return list(grouped.items())

	def get_deu(tokens_in, db_type="sql"):
	    """Return the phoneme candidates for each token, in the order of `tokens_in`.

	    The tokens are resolved through fetch_words(). A token with no entry in
	    that result is represented as ["__IGNORE__" + token] so later stages can
	    recognise it as untranscribed.
	    """
	    # Index the results once instead of rescanning them for every token;
	    # setdefault keeps the first entry if a key should ever repeat.
	    lookup = {}
	    for entry in fetch_words(tokens_in, db_type):
	        lookup.setdefault(entry[0], entry[1])
	    ordered = []
	    for word in tokens_in:
	        if word in lookup:
	            ordered.append(lookup[word])
	        else:
	            ordered.append(["__IGNORE__" + word])
	    return ordered


	def deu_to_ipa(deu_list, mark=True):
	    """Turn per-word phoneme candidates into cleaned IPA strings.

	    Args:
	        deu_list: one list of candidate strings per word. A candidate that
	            starts with "__IGNORE__" is a word that could not be looked up.
	        mark: when true, append "*" to every "__IGNORE__" word that is not
	            purely numeric.

	    Returns:
	        One sorted, de-duplicated list of IPA strings per input word.
	    """
	    symbols = {}  # phoneme -> IPA overrides; none are defined at present
	    stress_marks = ("ˈ", "ˌ")
	    swap_list = [["ˈər", "əˈr"], ["ˈie", "iˈe"]]
	    result = []  # the final list of IPA tokens to be returned
	    for word_list in deu_list:
	        ipa_word_list = []  # the candidates for this word
	        for word in word_list:
	            # Strip digits, but leave a token alone if it is all digits.
	            if re.sub(r"\d*", "", word.replace("__IGNORE__", "")) != "":
	                word = re.sub(r"[0-9]", "", word)
	            if word.startswith("__IGNORE__"):
	                ipa_form = word.replace("__IGNORE__", "")
	                # mark words we couldn't transliterate with an asterisk
	                if mark and re.sub(r"\d*", "", ipa_form) != "":
	                    ipa_form += "*"
	            else:
	                ipa_form = ""
	                for piece in word.split(" "):
	                    if not piece:
	                        # Empty pieces come from empty words or doubled
	                        # spaces; indexing piece[0] used to raise here.
	                        continue
	                    # The stress mark lives in its own local so the `mark`
	                    # argument is no longer overwritten mid-loop.
	                    stress = piece[0] if piece[0] in stress_marks else ""
	                    unmarked = piece[len(stress):]
	                    if unmarked in symbols:
	                        ipa_form += stress + symbols[unmarked]
	                    else:
	                        ipa_form += piece
	            # Move the stress mark behind the vowel, except word-initially.
	            for old, new in swap_list:
	                if not ipa_form.startswith(old):
	                    ipa_form = ipa_form.replace(old, new)
	            ipa_word_list.append(ipa_form)
	        result.append(sorted(set(ipa_word_list)))
	    return result


	def get_top(ipa_list):
	    """Return a single transcription for the whole input.

	    For every word the last entry of its candidate list is chosen, and the
	    chosen entries are joined with single spaces.
	    """
	    chosen = []
	    for candidates in ipa_list:
	        chosen.append(candidates[-1])
	    return ' '.join(chosen)


	def get_all(ipa_list):
	    """Return every combination of the per-word candidates.

	    Args:
	        ipa_list: one list of candidate transcriptions per word.

	    Returns:
	        A sorted list of space-joined sentences, one per element of the
	        Cartesian product of the candidate lists. An empty `ipa_list` gives
	        [""]; if any word has no candidates the result is [] (the previous
	        hand-rolled counter raised ZeroDivisionError in that case).
	    """
	    # Local import keeps this block self-contained.
	    from itertools import product
	    return sorted(' '.join(combo) for combo in product(*ipa_list))


	def ipa_list(words_in, keep_punct=True, db_type="sql"):
	    """Return the IPA candidates for each word.

	    Args:
	        words_in: a string (split on whitespace) or an iterable of words.
	        keep_punct: re-attach each word's surrounding punctuation to its
	            candidates.
	        db_type: backend name passed through to get_deu.

	    Returns:
	        One list of candidate transcriptions per word.
	    """
	    # isinstance also accepts str subclasses, which `type(x) == str`
	    # would have iterated character by character.
	    tokens = words_in.split() if isinstance(words_in, str) else words_in
	    # preserve_punc returns one triple per token; each element here is a
	    # single word, so take the first.
	    words = [preserve_punc(w.lower())[0] for w in tokens]
	    deu = get_deu([w[1] for w in words], db_type=db_type)
	    ipa = deu_to_ipa(deu)
	    if keep_punct:
	        ipa = _punct_replace_word(words, ipa)
	    return ipa


	def isin_deu(word, db_type="sql"):
	    """Report whether fetch_words() returns an entry for every given word.

	    A string is split on whitespace and each token is run through
	    preprocess() (lower-cased, surrounding punctuation stripped); any other
	    iterable is used as-is. Returns True only if the number of distinct
	    words found equals the number of distinct words asked for.

	    NOTE(review): fetch_words adds an Epitran fallback entry for words that
	    are missing from the database, so with the "sql" backend this looks
	    like it returns True for any input - confirm whether that is intended.
	    """
	    if isinstance(word, str):
	        word = [preprocess(w) for w in word.split()]
	    results = fetch_words(word, db_type)
	    found = {entry[0] for entry in results}
	    return len(found) == len(set(word))

	def replace_number(text):
	    """Spell out every digit 0-9 in `text` as its German word plus a space.

	    Digits are replaced one at a time, so "10" becomes "eins null ".
	    """
	    digit_words = str.maketrans({
	        "1": "eins ",
	        "2": "zwei ",
	        "3": "drei ",
	        "4": "vier ",
	        "5": "fünf ",
	        "6": "sechs ",
	        "7": "sieben ",
	        "8": "acht ",
	        "9": "neun ",
	        "0": "null ",
	    })
	    return text.translate(digit_words)



	def convert(text, retrieve_all=False, keep_punct=True, mode="sql"):
	    """Convert German text to IPA.

	    Args:
	        text: a string, or a list of strings.
	        retrieve_all: return every combination of candidates (get_all)
	            instead of a single transcription (get_top).
	        keep_punct: keep the punctuation surrounding each word.
	        mode: backend name, passed on as `db_type`.

	    Digits are spelled out one by one with replace_number() first.
	    """
	    if isinstance(text, str):
	        text = replace_number(text)
	    else:
	        # The list form used to fail on text.replace(). Spell out digits per
	        # item and flatten, because one item can expand to several words.
	        text = [tok for item in text for tok in replace_number(item).split()]
	    ipa = ipa_list(
	        words_in=text,
	        keep_punct=keep_punct,
	        db_type=mode)
	    if retrieve_all:
	        return get_all(ipa)
	    return get_top(ipa)



	# Patterns used by normalize_numbers.
	# Decimal number written with a comma, e.g. "3,14".
	_decimal_number_re = re.compile(r'\d+\,\d+')
	# Amount with the euro sign in front, e.g. "€5,50".
	_euros_pre = re.compile(r'€([0-9\,]*[0-9]+)')
	# Amount with the euro sign behind, e.g. "5,50€".
	_euros_re = re.compile(r'([0-9\,]*[0-9]+)€')
	# Ordinal after a definite article, e.g. "der 3.". The alternation uses a
	# plain "|"; the escaped "\|" matched a literal pipe, so it never fired.
	_ordinal_re = re.compile(r'(der |die |das )([0-9]+)\.')
	# Clock time, e.g. "9:30".
	_clock_re = re.compile(r'\d{1,2}\:\d{2}')
	# Any remaining run of digits.
	_number_re = re.compile(r'[0-9]+')

	def base(text):
	    """Replace each digit 0-9 in `text` with its German word and a trailing space.

	    Works digit by digit: "12" becomes "eins zwei ".
	    """
	    words_by_digit = {
	        "0": "null ",
	        "1": "eins ",
	        "2": "zwei ",
	        "3": "drei ",
	        "4": "vier ",
	        "5": "fünf ",
	        "6": "sechs ",
	        "7": "sieben ",
	        "8": "acht ",
	        "9": "neun ",
	    }
	    return text.translate(str.maketrans(words_by_digit))

	def tens_to_word(num):
	    """Spell out a two-digit number given as a string.

	    Compound numbers are returned as separate words ("fünf und vierzig").

	    base() returns each word with a trailing space. The earlier slicing did
	    not expect that and produced forms such as "sechszehn", "drei ßig" and
	    "eins und zwanzig"; the irregular forms are now looked up directly and
	    base() output is stripped.
	    """
	    tens, ones = num[0], num[1]
	    if tens == "0":
	        # Only reachable when called directly with a zero-padded value.
	        return "" if ones == "0" else base(ones).strip()

	    irregular = {
	        "10": "zehn",
	        "11": "elf",
	        "12": "zwölf",
	        "16": "sechzehn",
	        "17": "siebzehn",
	    }
	    if num in irregular:
	        return irregular[num]

	    ones_word = base(ones).strip()
	    if tens == "1":
	        return ones_word + "zehn"

	    irregular_tens = {
	        "2": "zwanzig",
	        "3": "dreißig",
	        "6": "sechzig",
	        "7": "siebzig",
	    }
	    tens_word = irregular_tens.get(tens) or base(tens).strip() + "zig"
	    if ones == "0":
	        return tens_word
	    if ones == "1":
	        ones_word = "ein"  # "ein und zwanzig", not "eins und zwanzig"
	    return ones_word + " und " + tens_word

	def huns_to_word(num):
	    """Spell out a three-digit number given as a string.

	    The hundreds digit becomes "hundert" (for 1) or "<digit> hundert"; the
	    remaining two digits are delegated to num_to_word().
	    """
	    huns = num[0]
	    if huns == "1":
	        huns_word = "hundert"
	    else:
	        # base() appends a space to every word; strip it so the result does
	        # not contain a double space before "hundert".
	        huns_word = base(huns).strip() + " hundert"

	    remain = num_to_word(num[1:]).strip()
	    if remain:
	        return huns_word + " " + remain
	    return huns_word

	def thos_to_word(num):
	    """Spell out a four-digit number given as a string.

	    The thousands digit becomes "tausend" (for 1) or "<digit> tausend"; the
	    remaining three digits are delegated to num_to_word().
	    """
	    thos = num[0]
	    if thos == "1":
	        thos_word = "tausend"
	    else:
	        # Strip the trailing space base() adds, to avoid "zwei  tausend".
	        thos_word = base(thos).strip() + " tausend"

	    remain = num_to_word(num[1:]).strip()
	    if remain:
	        return thos_word + " " + remain
	    return thos_word

	def num_to_word(num):
	    """Spell out a non-negative integer given as a digit string.

	    Leading zeros are ignored; an all-zero or empty string yields "".
	    Numbers of two to four digits are spelled as numbers; a single digit,
	    or anything longer than four digits, is read digit by digit via base().
	    The result never carries leading or trailing whitespace.
	    """
	    num = num.lstrip("0")
	    if not num:
	        return ""
	    digit = len(num)
	    if digit == 2:
	        word = tens_to_word(num)
	    elif digit == 3:
	        word = huns_to_word(num)
	    elif digit == 4:
	        word = thos_to_word(num)
	    else:
	        word = base(num)
	    # base() leaves a trailing space, which callers used to concatenate
	    # into results such as "vier te" or "drei  Uhr".
	    return word.strip()

	def number_to_words(m):
	    """Regex callback: spell out the digits matched by `m`.

	    Leading zeros are dropped first; a match consisting only of zeros is
	    rendered as "null".
	    """
	    digits = m.group(0).lstrip("0")
	    return num_to_word(digits) if digits != "" else "null"

	def _expand_ordinal(m):
	    """Regex callback: turn "<article> <number>." into "<article> <ordinal>".

	    Group 1 of `m` is the article including its trailing space, group 2 the
	    digits. 1 to 19 take the ending "te" (with the irregular forms erste,
	    dritte, siebte, achte); everything else takes "ste".
	    """
	    pre = m.group(1)
	    digits = m.group(2).lstrip("0")
	    if digits == "":
	        return pre + "nullte"

	    irregular = {1: "erste", 3: "dritte", 7: "siebte", 8: "achte"}
	    num = int(digits)
	    if num in irregular:
	        # The article used to be dropped for these four forms.
	        return pre + irregular[num]
	    # Chained comparison: the previous `num<=19 & num>=1` parsed as
	    # `num <= (19 & num) >= 1` and sent e.g. 4 to the "ste" branch.
	    suffix = "te" if 1 <= num <= 19 else "ste"
	    # strip() so a trailing space from num_to_word cannot split the ending.
	    return pre + num_to_word(digits).strip() + suffix

	def _expand_decimal(m):
	    """Regex callback: spell out a decimal number such as "3,14".

	    The integer part is read as a number ("null" when it is zero), the
	    fractional part digit by digit, joined by "komma".
	    """
	    parts = m.group(0).split(',')
	    # num_to_word returns "" for an all-zero integer part.
	    whole = num_to_word(parts[0]).strip() or "null"
	    # base() ends every digit word with a space; drop the final one.
	    fraction = base(parts[1]).strip()
	    return '%s komma %s' % (whole, fraction)

	def _expand_euros(m):
	    """Regex callback: rewrite a euro amount as "<euros> euro <cents>".

	    Group 1 of `m` holds the amount with an optional comma before the
	    cents. The numbers are emitted as digits. An amount with more than one
	    comma is passed through unchanged with " euro" appended.
	    """
	    match = m.group(1)
	    parts = match.split(',')
	    if len(parts) > 2:
	        return match + ' euro'  # Unexpected format
	    euros = int(parts[0]) if parts[0] else 0
	    fraction = parts[1] if len(parts) > 1 else ''
	    # Read the fraction as hundredths: "5" -> 50, "50" -> 50, "05" -> 5.
	    # The previous int(fraction) * 10 turned "5,50" into 500 cents.
	    cents = int(fraction[:2].ljust(2, '0')) if fraction else 0
	    if euros and cents:
	        return '%s euro %s' % (euros, cents)
	    elif euros:
	        return '%s euro' % (euros)
	    elif cents:
	        return '%s cent' % (cents)
	    else:
	        return 'null euro'

	def _expand_clock(m):
	    """Regex callback: spell out a clock time such as "9:30".

	    Produces "<hour> Uhr <minutes>"; hour 0 is "null" and hour 1 is "ein".
	    When the minutes are "00" only "<hour> Uhr" is returned.
	    """
	    parts = m.group(0).split(':')
	    hour = int(parts[0])
	    if hour == 0:
	        hour_word = "null"
	    elif hour == 1:
	        hour_word = "ein"
	    else:
	        hour_word = num_to_word(parts[0]).strip()
	    # num_to_word yields "" for "00"; leaving it out avoids a dangling
	    # space after "Uhr".
	    minute_word = num_to_word(parts[1]).strip()
	    if minute_word:
	        return '%s Uhr %s' % (hour_word, minute_word)
	    return '%s Uhr' % hour_word

	def normalize_numbers(text):
	    """Replace numeric expressions in `text` with German words.

	    The specific patterns (euro amounts, clock times, decimals, ordinals)
	    run before the generic digit pattern, which would otherwise consume
	    their digits.
	    """
	    text = re.sub(_euros_pre, _expand_euros, text)
	    text = re.sub(_euros_re, _expand_euros, text)
	    text = re.sub(_clock_re, _expand_clock, text)
	    text = re.sub(_decimal_number_re, _expand_decimal, text)
	    text = re.sub(_ordinal_re, _expand_ordinal, text)
	    text = re.sub(_number_re, number_to_words, text)
	    # The expansion helpers can leave doubled spaces behind (base() ends
	    # every digit word with a space). The former replace(" ", " ") changed
	    # nothing, so collapse runs of spaces explicitly.
	    text = re.sub(r' {2,}', ' ', text)
	    return text

	def collapse_whitespace(text):
	    """Replace every run of whitespace in `text` with a single space."""
	    whitespace_run = re.compile(r'\s+')
	    return whitespace_run.sub(' ', text)

	def mark_dark_l(text):
	    """Replace "l" with "ɫ" when no vowel follows it within the word.

	    An "l" is rewritten when it is followed only by characters that are
	    neither vowels nor spaces, up to the next space or the end of the text.
	    """
	    # "(?: |$)" means "space or end of text". The earlier "(?: \|$)"
	    # required a literal " |" before the end, so nothing was ever replaced.
	    return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ' + x.group(1), text)