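"""Tokenizer for informal Persian text: rejoins prefixes/postfixes that were
split off from their stems and enumerates candidate segmentations against a
vocabulary."""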
import itertools
import utils
class InformalTokenizer:
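    """Vocabulary-driven tokenizer that reattaches informal Persian prefixes and
    postfixes (joined by a zero-width non-joiner, nim-fasele) and enumerates
    candidate segmentations of a text."""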
def __init__(self, vocab, postfixes):
self.vocab = vocab
self.pres = InformalTokenizer.get_prefixs()
self.posts = postfixes
@staticmethod
def get_prefixs():
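        """Return the built-in list of informal prefixes."""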
return ['نا', 'بی', 'هر', 'می']
@staticmethod
def get_postfixs(informal_postfix_addr):
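        """Load the postfix list from a file, one postfix per line."""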
        with open(informal_postfix_addr, 'r', encoding='utf-8') as f:
ps = f.read().splitlines()
return ps
    def is_pre_post_word(self, w):
        """Check whether `w` is a valid compound of prefixes, a vocabulary stem,
        and postfixes joined by the zero-width non-joiner (nim-fasele)."""
        nim_fasele = '\u200c'  # zero-width non-joiner (nim-fasele)
        parts = w.split(nim_fasele)
        PRE, POST, WORD = 0, 1, 2
        is_pre_post = False
        state = PRE
        valid_parts = []
        for part in parts:
            if state == PRE:
                if part in self.pres:
                    valid_parts.append(part)
                    is_pre_post = True
                    continue
                elif part in self.posts:
                    valid_parts.append(part)
                    is_pre_post = True
                    state = POST
                    continue
                state = WORD
                valid_parts.append(part)
                continue
            if state == POST:
                # once in the postfix state, only further postfixes may follow
                if part in self.posts:
                    valid_parts.append(part)
                    continue
                return False
            if state == WORD:
                if part in self.posts:
                    is_pre_post = True
                    state = POST
                    valid_parts.append(part)
                    continue
                if part in self.vocab:
                    valid_parts.append(part)
                    # the compound built so far must itself be in the vocabulary
                    if nim_fasele.join(valid_parts) not in self.vocab:
                        return False
                    continue
                return False
        # the word only qualifies if at least one prefix or postfix was seen
        if not is_pre_post:
            return False
        return True
    def get_valid_word(self, words):
        """Join consecutive tokens with either nothing or a nim-fasele and keep
        every combination that is a vocabulary word or a valid compound."""
        seps = ['', '\u200c']  # empty separator or zero-width non-joiner
        all_seqs = []
        # try every assignment of separators to the gaps between the tokens
        for item in itertools.product(seps, repeat=len(words) - 1):
            seq = ''.join(word + sep for word, sep in zip(words[:-1], item))
            seq += words[-1]
            all_seqs.append(seq)
        return [w for w in all_seqs if w in self.vocab or self.is_pre_post_word(w)]
    def get_candidates(self, tokens, index=0, current_seq=' '):
        """Recursively enumerate candidate sequences, optionally merging runs of
        two or three consecutive tokens into a single valid compound.
        Note: the search is exponential in the number of tokens."""
        if index == len(tokens):
            return [current_seq]
        word = tokens[index]
        next_word, next_next_word = None, None
        if index < len(tokens) - 1:
            next_word = tokens[index + 1]
        if index < len(tokens) - 2:
            next_next_word = tokens[index + 2]
        cnds = []
        if next_word is not None:
            # try merging the current token with the next one
            for v_w in self.get_valid_word([word, next_word]):
                cnds.extend(self.get_candidates(tokens, index + 2, current_seq + ' ' + v_w))
        if next_next_word is not None:
            # try merging the current token with the next two
            for v_w in self.get_valid_word([word, next_word, next_next_word]):
                cnds.extend(self.get_candidates(tokens, index + 3, current_seq + ' ' + v_w))
        # always keep the branch where the current token stays as-is
        cnds.extend(self.get_candidates(tokens, index + 1, current_seq + ' ' + word))
        return [c.strip() for c in cnds]
    def separate_conjs(self, word, validator):
        """Split a token glued to a conjunction or pronoun and return the splits
        the validator accepts, falling back to the original word."""
        conjs = ['و', 'در', 'با', 'تا', 'که', 'از', 'تو', 'من', 'شما']
        cnds = utils.split_conj_words(word, conjs)
        valid_cnds = [c for c in cnds if validator(c)]
        if valid_cnds:
            return valid_cnds
        return [word]
    def tokenize(self, txt, validator):
        """Tokenize `txt`: expand tokens the validator rejects into candidate
        splits, then densify every combination of candidates."""
        tokens = txt.split()
        all_cnds = []
        for t in tokens:
            if not validator(t):
                ws = self.separate_conjs(t, validator)
            else:
                ws = [t]
            all_cnds.append(ws)
        # every combination of per-token candidates gives one candidate sequence
        all_cnd_tokens = itertools.product(*all_cnds)
        txts = list(map(self.get_dense_tokens, all_cnd_tokens))
        return txts
    def get_dense_tokens(self, tokens):
        """Reattach prefixes and postfixes to their neighbouring words with a
        nim-fasele, turning a flat token list into dense tokens."""
        PRE, WORD, POST = 0, 1, 2
        out_tokens = []
        nim_fasele = '\u200c'
        current_word = ''
        state = WORD
        for t in tokens:
            if state == WORD:
                if t in self.pres:
                    # a prefix opens a new compound; flush the pending word
                    if current_word:
                        out_tokens.append(current_word)
                    current_word = t
                    state = PRE
                elif t in self.posts:
                    # a postfix attaches to the pending word
                    current_word += nim_fasele + t
                    state = POST
                else:
                    if current_word:
                        out_tokens.append(current_word)
                    current_word = t
            elif state == PRE:
                if t in self.pres:
                    # chained prefixes stay inside the same compound
                    current_word += nim_fasele + t
                elif t in self.posts:
                    # a postfix directly after a prefix: flush and start over
                    out_tokens.append(current_word)
                    current_word = t
                    state = WORD
                else:
                    current_word += nim_fasele + t
                    state = WORD
            elif state == POST:
                if t in self.pres:
                    out_tokens.append(current_word)
                    current_word = t
                    state = PRE
                elif t in self.posts:
                    current_word += nim_fasele + t
                else:
                    out_tokens.append(current_word)
                    current_word = t
                    state = WORD
        # flush the last pending word (guarding against an empty buffer)
        if current_word:
            out_tokens.append(current_word)
        return out_tokens
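
# Minimal usage sketch with hypothetical data: in the real app the vocabulary
# would be loaded from a lexicon and the postfix list via
# InformalTokenizer.get_postfixs().
if __name__ == '__main__':
    vocab = {'کتاب', 'کتاب\u200cها', 'خوب', 'است'}  # hypothetical vocabulary
    postfixes = ['ها', 'تر', 'ترین']  # hypothetical postfix list
    tokenizer = InformalTokenizer(vocab, postfixes)
    # With a permissive validator every token is kept as-is and then densified,
    # so the detached postfix 'ها' is rejoined to 'کتاب' with a nim-fasele.
    print(tokenizer.tokenize('کتاب ها خوب است', lambda w: True))
    # expected: [['کتاب\u200cها', 'خوب', 'است']]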