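"""Tokenizer for informal Persian text: rejoins prefixes/postfixes that were
split off from their stems and enumerates candidate segmentations against a
vocabulary."""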
import itertools
import utils
class InformalTokenizer:
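    """Vocabulary-driven tokenizer that reattaches informal Persian prefixes and
    postfixes (joined by a zero-width non-joiner, nim-fasele) and enumerates
    candidate segmentations of a text."""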
def __init__(self, vocab, postfixes):
self.vocab = vocab
self.pres = InformalTokenizer.get_prefixs()
self.posts = postfixes
@staticmethod
def get_prefixs():
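        """Return the built-in list of informal prefixes."""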
return ['نا', 'بی', 'هر', 'می']
@staticmethod
def get_postfixs(informal_postfix_addr):
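        """Load the postfix list from a file, one postfix per line."""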
        with open(informal_postfix_addr, 'r', encoding='utf-8') as f:
ps = f.read().splitlines()
return ps
    def is_pre_post_word(self, w):
        """Check whether `w` is a valid compound of prefixes, a vocabulary stem,
        and postfixes joined by the zero-width non-joiner (nim-fasele)."""
        nim_fasele = '\u200c'  # zero-width non-joiner (nim-fasele)
        parts = w.split(nim_fasele)
        PRE, POST, WORD = 0, 1, 2
        is_pre_post = False
        state = PRE
        valid_parts = []
        for part in parts:
            if state == PRE:
                if part in self.pres:
                    valid_parts.append(part)
                    is_pre_post = True
                    continue
                elif part in self.posts:
                    valid_parts.append(part)
                    is_pre_post = True
                    state = POST
                    continue
                state = WORD
                valid_parts.append(part)
                continue
            if state == POST:
                # once in the postfix state, only further postfixes may follow
                if part in self.posts:
                    valid_parts.append(part)
                    continue
                return False
            if state == WORD:
                if part in self.posts:
                    is_pre_post = True
                    state = POST
                    valid_parts.append(part)
                    continue
                if part in self.vocab:
                    valid_parts.append(part)
                    # the compound built so far must itself be in the vocabulary
                    if nim_fasele.join(valid_parts) not in self.vocab:
                        return False
                    continue
                return False
        # the word only qualifies if at least one prefix or postfix was seen
        if not is_pre_post:
            return False
        return True
    def get_valid_word(self, words):
        """Join consecutive tokens with either nothing or a nim-fasele and keep
        every combination that is a vocabulary word or a valid compound."""
        seps = ['', '\u200c']  # empty separator or zero-width non-joiner
        all_seqs = []
        # try every assignment of separators to the gaps between the tokens
        for item in itertools.product(seps, repeat=len(words) - 1):
            seq = ''.join(word + sep for word, sep in zip(words[:-1], item))
            seq += words[-1]
            all_seqs.append(seq)
        return [w for w in all_seqs if w in self.vocab or self.is_pre_post_word(w)]
    def get_candidates(self, tokens, index=0, current_seq=' '):
        """Recursively enumerate candidate sequences, optionally merging runs of
        two or three consecutive tokens into a single valid compound.
        Note: the search is exponential in the number of tokens."""
        if index == len(tokens):
            return [current_seq]
        word = tokens[index]
        next_word, next_next_word = None, None
        if index < len(tokens) - 1:
            next_word = tokens[index + 1]
        if index < len(tokens) - 2:
            next_next_word = tokens[index + 2]
        cnds = []
        if next_word is not None:
            # try merging the current token with the next one
            for v_w in self.get_valid_word([word, next_word]):
                cnds.extend(self.get_candidates(tokens, index + 2, current_seq + ' ' + v_w))
        if next_next_word is not None:
            # try merging the current token with the next two
            for v_w in self.get_valid_word([word, next_word, next_next_word]):
                cnds.extend(self.get_candidates(tokens, index + 3, current_seq + ' ' + v_w))
        # always keep the branch where the current token stays as-is
        cnds.extend(self.get_candidates(tokens, index + 1, current_seq + ' ' + word))
        return [c.strip() for c in cnds]
    def separate_conjs(self, word, validator):
        """Split a token glued to a conjunction or pronoun and return the splits
        the validator accepts, falling back to the original word."""
        conjs = ['و', 'در', 'با', 'تا', 'که', 'از', 'تو', 'من', 'شما']
        cnds = utils.split_conj_words(word, conjs)
        valid_cnds = [c for c in cnds if validator(c)]
        if valid_cnds:
            return valid_cnds
        return [word]
    def tokenize(self, txt, validator):
        """Tokenize `txt`: expand tokens the validator rejects into candidate
        splits, then densify every combination of candidates."""
        tokens = txt.split()
        all_cnds = []
        for t in tokens:
            if not validator(t):
                ws = self.separate_conjs(t, validator)
            else:
                ws = [t]
            all_cnds.append(ws)
        # every combination of per-token candidates gives one candidate sequence
        all_cnd_tokens = itertools.product(*all_cnds)
        txts = list(map(self.get_dense_tokens, all_cnd_tokens))
        return txts
    def get_dense_tokens(self, tokens):
        """Reattach prefixes and postfixes to their neighbouring words with a
        nim-fasele, turning a flat token list into dense tokens."""
        PRE, WORD, POST = 0, 1, 2
        out_tokens = []
        nim_fasele = '\u200c'
        current_word = ''
        state = WORD
        for t in tokens:
            if state == WORD:
                if t in self.pres:
                    # a prefix opens a new compound; flush the pending word
                    if current_word:
                        out_tokens.append(current_word)
                    current_word = t
                    state = PRE
                elif t in self.posts:
                    # a postfix attaches to the pending word
                    current_word += nim_fasele + t
                    state = POST
                else:
                    if current_word:
                        out_tokens.append(current_word)
                    current_word = t
            elif state == PRE:
                if t in self.pres:
                    # chained prefixes stay inside the same compound
                    current_word += nim_fasele + t
                elif t in self.posts:
                    # a postfix directly after a prefix: flush and start over
                    out_tokens.append(current_word)
                    current_word = t
                    state = WORD
                else:
                    current_word += nim_fasele + t
                    state = WORD
            elif state == POST:
                if t in self.pres:
                    out_tokens.append(current_word)
                    current_word = t
                    state = PRE
                elif t in self.posts:
                    current_word += nim_fasele + t
                else:
                    out_tokens.append(current_word)
                    current_word = t
                    state = WORD
        # flush the last pending word (guarding against an empty buffer)
        if current_word:
            out_tokens.append(current_word)
        return out_tokens
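
# Minimal usage sketch with hypothetical data: in the real app the vocabulary
# would be loaded from a lexicon and the postfix list via
# InformalTokenizer.get_postfixs().
if __name__ == '__main__':
    vocab = {'کتاب', 'کتاب\u200cها', 'خوب', 'است'}  # hypothetical vocabulary
    postfixes = ['ها', 'تر', 'ترین']  # hypothetical postfix list
    tokenizer = InformalTokenizer(vocab, postfixes)
    # With a permissive validator every token is kept as-is and then densified,
    # so the detached postfix 'ها' is rejoined to 'کتاب' with a nim-fasele.
    print(tokenizer.tokenize('کتاب ها خوب است', lambda w: True))
    # expected: [['کتاب\u200cها', 'خوب', 'است']]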