import itertools |
import utils |
class InformalTokenizer:
    """Token-level helpers for splitting and re-joining informal Persian text.

    The tokenizer works with three word lists:

    * ``vocab``  -- container of known words (only ``in`` tests are used).
    * ``pres``   -- the fixed prefix list returned by :meth:`get_prefixs`.
    * ``posts``  -- the postfix list supplied by the caller.

    Affixes are glued to their host word with the zero-width non-joiner
    (the Persian "half-space", called ``nim_fasele`` throughout).
    """

    # Zero-width non-joiner (U+200C), written as an escape so that it is
    # visible in the source and survives editors/tools that strip invisible
    # characters.  NOTE(review): in the reviewed text the separator literals
    # appeared as empty strings; U+200C is assumed from the name
    # ``nim_fasele`` ("half-space") -- confirm against the repository.
    _NIM_FASELE = '\u200c'

    def __init__(self, vocab, postfixes):
        """Store the vocabulary and postfix list; prefixes are built in.

        Args:
            vocab: container of known words, used for membership tests.
            postfixes: container of postfix strings (see :meth:`get_postfixs`).
        """
        self.vocab = vocab
        self.pres = InformalTokenizer.get_prefixs()
        self.posts = postfixes

    @staticmethod
    def get_prefixs():
        """Return the hard-coded list of prefixes recognised by the tokenizer."""
        return ['نا', 'بی', 'هر', 'می']

    @staticmethod
    def get_postfixs(informal_postfix_addr):
        """Read the postfix list from a text file, one postfix per line.

        Args:
            informal_postfix_addr: path of the postfix file.

        Returns:
            The file's lines (without line terminators) as a list of str.
        """
        # Decode explicitly as UTF-8 instead of the locale default, which is
        # not UTF-8 on every platform.  NOTE(review): assumes the postfix
        # file is UTF-8 encoded -- confirm.
        with open(informal_postfix_addr, 'r', encoding='utf-8') as f:
            return f.read().splitlines()

    def is_pre_post_word(self, w):
        """Tell whether ``w`` is a half-space-joined word carrying an affix.

        ``w`` is split on the half-space and the parts are run through a
        small state machine:

        * ``pre``: leading parts; prefixes keep the state, a postfix moves
          to ``pos``, anything else is taken as the stem and moves to ``v``
          (the stem itself is not checked against the vocabulary).
        * ``pos``: only further postfixes are accepted.
        * ``v``: a postfix moves to ``pos``; a vocabulary word is accepted
          only if everything accumulated so far, re-joined with half-spaces,
          is itself in the vocabulary.

        Returns:
            True if every part was accepted and at least one prefix or
            postfix was seen, otherwise False.
        """
        nim_fasele = self._NIM_FASELE
        pre, pos, v = 0, 1, 2
        state = pre
        is_pre_pos = False
        # Parts accepted so far.  They are joined on demand, so the compound
        # looked up in the vocabulary has no leading separator (a leading
        # separator made that lookup fail for every ordinary vocabulary).
        accepted = []
        for part in w.split(nim_fasele):
            if state == pre:
                if part in self.pres:
                    is_pre_pos = True
                elif part in self.posts:
                    is_pre_pos = True
                    state = pos
                else:
                    state = v
                accepted.append(part)
            elif state == pos:
                if part not in self.posts:
                    return False
                accepted.append(part)
            else:  # state == v
                if part in self.posts:
                    is_pre_pos = True
                    state = pos
                    accepted.append(part)
                elif part in self.vocab:
                    accepted.append(part)
                    if nim_fasele.join(accepted) not in self.vocab:
                        return False
                else:
                    return False
        return is_pre_pos

    def get_valid_word(self, words):
        """Return the valid ways of merging ``words`` into a single word.

        Every combination of separators between consecutive words is tried;
        a merged string is kept when it is in the vocabulary or is accepted
        by :meth:`is_pre_post_word`.

        Args:
            words: sequence of str to merge.

        Returns:
            List of accepted merged strings, in generation order.  An empty
            ``words`` yields an empty list.
        """
        if not words:
            return []
        # NOTE(review): the second separator was invisible in the reviewed
        # text; plain concatenation followed by half-space joining is
        # assumed -- confirm both the value and the order.
        seps = ['', self._NIM_FASELE]
        merged_words = []
        for joiners in itertools.product(seps, repeat=len(words) - 1):
            pieces = [word + sep for word, sep in zip(words[:-1], joiners)]
            pieces.append(words[-1])
            merged_words.append(''.join(pieces))
        return [m for m in merged_words
                if m in self.vocab or self.is_pre_post_word(m)]

    def get_candidates(self, tokens, index=0, current_seq=' '):
        """Enumerate the sentences obtained by merging adjacent tokens.

        At each position the current token may be merged with the next one
        or two tokens (when :meth:`get_valid_word` accepts the merge), or
        kept on its own.

        Args:
            tokens: sequence of token strings.
            index: position to continue from (used by the recursion).
            current_seq: text built so far (used by the recursion).

        Returns:
            A list of whitespace-stripped candidate sentences.  When
            ``index == len(tokens)`` -- the recursion's base case, which is
            also hit by a top-level call with no tokens -- ``current_seq``
            itself is returned as a plain str.
        """
        if index == len(tokens):
            return current_seq

        cnds = []
        # Merge the current token with the following one, then two, tokens.
        for span in (2, 3):
            if index + span > len(tokens):
                break
            window = list(tokens[index:index + span])
            for merged in self.get_valid_word(window):
                cnds.extend(self._as_list(self.get_candidates(
                    tokens, index + span, current_seq + ' ' + merged)))

        # Keep the current token as a word of its own.
        cnds.extend(self._as_list(self.get_candidates(
            tokens, index + 1, current_seq + ' ' + tokens[index])))
        return [c.strip() for c in cnds]

    @staticmethod
    def _as_list(result):
        """Wrap the str returned by the recursion's base case in a list."""
        return [result] if isinstance(result, str) else result

    def seperate_conjs(self, word, validator):
        """Split conjunction-like words off ``word``.

        Candidates come from ``utils.split_conj_words``; only those accepted
        by ``validator`` are kept.

        Returns:
            The accepted candidates, or ``[word]`` when there are none.
        """
        conjs = ['و', 'در', 'با', 'تا', 'که', 'از', 'تو', 'من', 'شما']
        valid_cnds = [c for c in utils.split_conj_words(word, conjs)
                      if validator(c)]
        return valid_cnds if valid_cnds else [word]

    def tokenize(self, txt, validator):
        """Return every dense tokenization of ``txt``.

        ``txt`` is split on whitespace.  Tokens rejected by ``validator``
        are replaced by the alternatives from :meth:`seperate_conjs`; each
        combination of alternatives is passed to :meth:`get_dense_tokens`.

        Returns:
            A list with one token list per combination.
        """
        alternatives = [
            [t] if validator(t) else self.seperate_conjs(t, validator)
            for t in txt.split()
        ]
        return [self.get_dense_tokens(combo)
                for combo in itertools.product(*alternatives)]

    def get_dense_tokens(self, tokens):
        """Glue prefix and postfix tokens onto their neighbouring word.

        A prefix is attached to the token that follows it and a postfix to
        the token that precedes it, using the half-space as joiner.

        Args:
            tokens: iterable of token strings.

        Returns:
            The list of glued words.  NOTE(review): the initial empty
            pending word is flushed like any other, so the result starts
            with ``''`` whenever the first token is not a postfix; this is
            kept as-is because callers may rely on it.
        """
        PRE, WORD, POST = 0, 1, 2
        nim_fasele = self._NIM_FASELE
        out_tokens = []
        current_word = ''
        state = WORD
        for t in tokens:
            is_pre = t in self.pres
            is_post = t in self.posts
            is_plain = not is_pre and not is_post
            # The three checks in each state are deliberately independent
            # (not elif): a token listed as both prefix and postfix runs
            # both branches, in this order.
            if state == WORD:
                if is_pre:
                    out_tokens.append(current_word)
                    current_word = t
                    state = PRE
                if is_post:
                    current_word += nim_fasele + t
                    state = POST
                if is_plain:
                    out_tokens.append(current_word)
                    current_word = t
            elif state == PRE:
                if is_pre:
                    current_word += nim_fasele + t
                if is_post:
                    out_tokens.append(current_word)
                    current_word = t
                    state = WORD
                if is_plain:
                    current_word += nim_fasele + t
                    state = WORD
            else:  # state == POST
                if is_pre:
                    out_tokens.append(current_word)
                    current_word = t
                    state = PRE
                if is_post:
                    current_word += nim_fasele + t
                if is_plain:
                    out_tokens.append(current_word)
                    current_word = t
                    state = WORD
        # current_word is always still pending here, so it is flushed
        # unconditionally.  Comparing it with out_tokens[-1] dropped a
        # repeated final word and raised IndexError when nothing had been
        # flushed yet (empty input, or input made only of postfixes).
        out_tokens.append(current_word)
        return out_tokens