Spaces:

Dadmatech
/

persian_informal_translator

Running

File size: 30,335 Bytes
import re
import itertools
import string
import utils


class InformalWord:
    def __init__(self, lemma, prefixs=None, postfixs=None, pos=None, append_h=False):
        if prefixs is None:
            prefixs = []
        if postfixs is None:
            postfixs = []
        self.is_verb = False
        self.is_mapper = False
        self.semi_mapper = False
        self.append_h = append_h
        self.lemma = lemma
        self.prefixs = prefixs
        self.postfixs = postfixs
        self.pos = pos

class Prefix:
    def __init__(self, word, level, formal=None, ignore_poses=None, poses=None, non_connecting_chars=None, connector='nim'):
        if non_connecting_chars is None:
            non_connecting_chars = []
        self.word = word
        self.level = level
        self.ignore_poses = ignore_poses
        self.poses = poses
        self.connector = connector
        if formal is None:
            self.formal = word
        else:
            self.formal = formal
        self.non_connecting_chars = non_connecting_chars
class Postfix:
    def __init__(self, word, level, formal=None, ignore_poses=None, non_connecting_chars=None, poses=None, connector='nim'):
        if non_connecting_chars is None:
            non_connecting_chars = []
        self.word = word
        self.level = level
        self.ignore_poses = ignore_poses
        self.poses = poses
        self.connector = connector
        if formal is None:
            self.formal = word
        else:
            self.formal = formal
        self.non_connecting_chars = non_connecting_chars



class OneShotTransformer:

    NIM_FASELE = chr(8204)
    # prefixs
    HAMUN = Prefix('همون', 1, 'همان',connector='fasele',non_connecting_chars=['ه'])
    HAMIN = Prefix('همین', 1,connector='fasele')
    HAR = Prefix('هر', 1,connector='fasele')
    UN = Prefix('اون', 1, 'آن',connector='fasele',non_connecting_chars=['ه'])
    IN = Prefix('این', 1,connector='fasele',non_connecting_chars=['ه'])
    HICH = Prefix('هیچ', 1,connector='nim',non_connecting_chars=['ه', 'ا', 'آ'])
    B = Prefix('ب', 1, 'به', ignore_poses=['VERB', 'CCONJ', 'SCONJ'],connector='fasele',non_connecting_chars=['ا', 'ه', 'آ'])
    Y = Prefix('ی', 1, 'یک', ignore_poses=['VERB', 'CCONJ', 'SCONJ'],connector='fasele',non_connecting_chars=['ا', 'آ'])
    BI = Prefix('بی', 1, ignore_poses=['VERB'],connector='nim',non_connecting_chars=['ا'])
    POR = Prefix('پر', 1, ignore_poses=['VERB'],connector='nim')
    pres = [[HAMIN, HAMUN, UN, IN, HAMIN, BI, B, Y, POR, HAR]]
    #postfixs
    Y1 = Postfix('ی', 0, ignore_poses=['VERB'], connector='none',non_connecting_chars=['ی', 'ا', 'و', 'آ', 'اً'])
    TAR = Postfix('تر', 1, connector='nim')
    TARIN = Postfix('ترین', 1, connector='nim')
    HAY = Postfix('های', 2, connector='nim')
    HA = Postfix('ها', 2, connector='nim')
    A = Postfix('ا', 2, 'ها', ignore_poses=['VERB'], connector='nim',non_connecting_chars=['ا', 'و', 'آ', 'اً'])
    A1 = Postfix('ای', 2, 'های', ignore_poses=['VERB'], connector='nim',non_connecting_chars=['ا', 'و', 'آ', 'اً'])
    YY = Postfix('یی', 3, 'یی', ignore_poses=['VERB'], connector='none')
    M = Postfix('م', 3, ignore_poses=['VERB'], connector='none')
    M_MAN = Postfix('م', 3, 'من', ignore_poses=['VERB'], connector='fasele')
    T = Postfix('ت', 3, connector='none')
    T1 = Postfix('ت', 3, 'تو', connector='fasele')
    # T2 = Postfix('ت', 3, 'خود', ignore_poses=['VERB'], connector='fasele')
    SH = Postfix('ش', 3, connector='none')
    # SH1 = Postfix('ش', 3, 'خود', connector='fasele')
    # SH2 = Postfix('ش', 3, 'آن', connector='fasele')
    # SH3 = Postfix('ش', 3, 'او', connector='fasele')
    MAN = Postfix('مان', 3, connector='nim')
    MAN1 = Postfix('مان', 3, 'ما', connector='fasele')
    # MAN2 = Postfix('مان', 3, 'خود', connector='fasele')
    MUN = Postfix('مون', 3, 'مان', connector='nim')
    # MUN1 = Postfix('مون', 3, 'خود', connector='fasele')
    MUN2 = Postfix('مون', 3, 'ما', connector='fasele')
    TAN = Postfix('تان', 3, connector='nim')
    # TAN1 = Postfix('تان', 3, 'خود', connector='fasele')
    TAN2 = Postfix('تان', 3, 'شما', connector='fasele')
    TUN = Postfix('تون', 3, 'تان', connector='nim')
    # TUN1 = Postfix('تون', 3, 'خود', connector='fasele')
    TUN2 = Postfix('تون', 3, 'شما', connector='fasele')
    SHAN = Postfix('شان', 3, connector='nim')
    # SHAN1 = Postfix('شان', 3, 'خود', connector='fasele')
    SHAN2 = Postfix('شان', 3, 'آنان', connector='fasele')
    SHUN = Postfix('شون', 3, 'شان', connector='nim')
    # SHUN1 = Postfix('شون', 3, 'خود', connector='fasele')
    SHUN2 = Postfix('شون', 3, 'آنان', connector='fasele')
    N = Postfix('ن', 4, 'هستند', ignore_poses=['VERB', 'CCONJ', 'SCONJ'], connector='fasele', non_connecting_chars=['ی'])
    SHAM = Postfix('شم', 4, 'بشوم',ignore_poses=['VERB'], connector='fasele')
    SHI= Postfix('شی', 4, 'بشوی',ignore_poses=['VERB'], connector='fasele')
    SHE= Postfix('شه', 4, 'شود',ignore_poses=['VERB'], connector='fasele')
    SHIN= Postfix('شین', 4, 'شوید',ignore_poses=['VERB'], connector='fasele')
    SHID= Postfix('شید', 4, 'شوید',ignore_poses=['VERB'], connector='fasele')
    SHAAN= Postfix('شن', 4, 'شوند',ignore_poses=['VERB'], connector='fasele')
    SHAND= Postfix('شند', 4, 'شوند',ignore_poses=['VERB'], connector='fasele')
    M2 = Postfix('م', 4, 'هم',ignore_poses=['VERB'], connector='fasele')
    V = Postfix('و', 4, 'را', connector='fasele', non_connecting_chars=['ا', 'ای', 'آ', 'اً'])
    V1 = Postfix('رو', 4, 'را', connector='fasele')
    H = Postfix('ه', 4, '', ignore_poses=['VERB', 'CCONJ', 'SCONJ'], connector='none')
    # H2 = Postfix('ه', 4)
    M1 = Postfix('م', 4, 'هستم',ignore_poses=['VERB'], connector='fasele')
    Y2 = Postfix('ی', 4, 'ی', ignore_poses=['VERB'], connector='none')
    H1 = Postfix('ه', 4, 'است', ignore_poses=['VERB'], connector='fasele', non_connecting_chars=['ا', 'آ', 'اً'])
    S = Postfix('س', 4, 'است', connector='fasele')
    ST = Postfix('ست', 4, 'است', connector='fasele')
    ED = Postfix('ید', 4, 'هستید', ignore_poses=['VERB'], connector='fasele')
    EN = Postfix('ین', 4, 'هستید', ignore_poses=['VERB'], connector='fasele', non_connecting_chars=['تر'])
    EM = Postfix('یم', 4, 'هستیم', ignore_poses=['VERB'], connector='fasele')
    ND = Postfix('ند', 4, 'هستند', ignore_poses=['VERB'], connector='fasele')
    # posts = [[Y1], [TAR, TARIN], [HA, HAY, A, A1], [M, T, SH, MAN, MUN, TAN, TUN, SHAN, SHUN], [N, S, ST, M1, M2, V, V1,Y2, H, H1, ED, EN, EM, ND, SHAM, SHI, SHID, SHE, SHAND, SHIN, SHAAN]]
    # posts = [[Y1], [TAR, TARIN], [HA, HAY, A, A1], [YY, M, M_MAN, T, T1, T2, SH, MAN, MAN1, MAN2,MUN,MUN1,MUN2, TAN,TAN1,TAN2, TUN,TUN1,TUN2, SHAN,SHAN1,SHAN2, SHUN, SHUN1, SHUN2], [N, S, ST, M1, M2, V, V1,Y2, H1, ED, EN, EM, ND, SHAM, SHI, SHID, SHE, SHAND, SHIN, SHAAN]]
    posts = [[Y1], [TAR, TARIN], [HA, HAY, A, A1], [YY, M, M_MAN, T, T1,  SH, MAN, MAN1,MUN,MUN2, TAN,TAN2, TUN,TUN2, SHAN,SHAN2, SHUN, SHUN2], [N, S, ST, M1, M2, V, V1,Y2, H1, ED, EN, EM, ND, SHAM, SHI, SHID, SHE, SHAND, SHIN, SHAAN]]
    PossessiveـPronouns = [M,T,SH, MAN, MUN, TAN, TUN, SHAN, SHUN]
    cant_append_h_posts = [Y1, TAR, TARIN]
    As = [A, A1]

    def get_separator(self, w1, w2, append_h):
        connector_2_str = {'none': '', 'nim': OneShotTransformer.NIM_FASELE, 'fasele': ' '}
        not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
        # if w2 == OneShotTransformer.Y2:
        #     return ''
        # if w2 in [OneShotTransformer.M, OneShotTransformer.T, OneShotTransformer.SH] and ( type(w1) == str and w1[-1] in ['ا', 'و']):
        #     return 'ی'
        # if type(w1) != str and w1.level == 1:
        #     return ' '
        # not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
        # if w1 in [OneShotTransformer.Y, OneShotTransformer.B, OneShotTransformer.HAMIN, OneShotTransformer.IN, OneShotTransformer.HAMUN] or w2 in [OneShotTransformer.ED, OneShotTransformer.EN, OneShotTransformer.EM, OneShotTransformer.ND, OneShotTransformer.H1, OneShotTransformer.M1, OneShotTransformer.S, OneShotTransformer.ST, OneShotTransformer.V, OneShotTransformer.N, OneShotTransformer.M2]:
        #     return ' '
        #
        # if ((type(w1) == str and len(w1)> 0 and w1[-1] in ['ا', 'و']) or (type(w1) != str and  w1.formal[-1] in [ 'ا', 'و']))and w2.level == 3 :
        #     return 'ی' + '‌'
        # if (type(w1) == str and len(w1)> 0 and w1[-1] in not_connect_chars) or (type(w1) != str and w1.word[-1] in not_connect_chars):
        #     return ''
        all_pres = [p for pres in OneShotTransformer.pres for p in pres]
        all_posts = [p for posts in OneShotTransformer.posts for p in posts]
        if type(w1) == str:
            last_ch = w1[-1]
        else:
            last_ch = w1.word[-1]
        separator = ''
        extra_sep = ''
        if type(w1) == str and append_h and w2 in [OneShotTransformer.M, OneShotTransformer.T, OneShotTransformer.SH]:
            extra_sep = OneShotTransformer.NIM_FASELE + 'ا'
        if w2 in [OneShotTransformer.M, OneShotTransformer.T, OneShotTransformer.SH, OneShotTransformer.MAN, OneShotTransformer.MUN, OneShotTransformer.TAN, OneShotTransformer.TUN, OneShotTransformer.SHAN, OneShotTransformer.SHUN] and ( last_ch in ['ا', 'و']) :
            extra_sep = 'ی'
        if w1 in all_pres:
            separator = connector_2_str[w1.connector]
        if w2 in all_posts:
            separator = connector_2_str[w2.connector]

        # replace nim_fasele with '' for non connected words

        if last_ch in not_connect_chars and separator == OneShotTransformer.NIM_FASELE:
            separator = ''
        return extra_sep + separator

    def lemma_to_formals(self, iword):
        out_iwords = [iword]
        if iword.lemma in self.mapper and self.iword2str(iword) != self.mapper[iword.lemma]:
            for map_words in self.mapper[iword.lemma]:
                new_iw = InformalWord(lemma=map_words,prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos, append_h=iword.append_h)
                if not iword.prefixs and not iword.postfixs:
                    new_iw.is_mapper = True
                    new_iw.semi_mapper = True
                else:
                    new_iw.semi_mapper = True
                out_iwords.append(new_iw)
        formal_verbs = self.verb_to_formal_func(iword.lemma)
        if formal_verbs is not None:
            for f_v in formal_verbs:
                new_iw = InformalWord(lemma=f_v,prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos, append_h=iword.append_h)
                new_iw.is_verb = True
                out_iwords.append(new_iw)
        return out_iwords


    def should_ignore_by_postagg(self, iword):
        post_pres = [pre for pre in iword.prefixs] + [post for post in iword.postfixs]
        for p in post_pres:
            if (p.ignore_poses and iword.pos in p.ignore_poses) or (p.poses and iword.pos not in p.poses):
                return True
        return False

    def filtered_based_on_rules(self, iword):
        #YY
        ha_p = [OneShotTransformer.A, OneShotTransformer.HA]
        if iword.postfixs and OneShotTransformer.YY in iword.postfixs and not all(p in ha_p + [OneShotTransformer.YY] for p in iword.postfixs):
            return True
        #hasti!
        if (iword.postfixs and len(iword.postfixs) == 1 and OneShotTransformer.Y2 in iword.postfixs and iword.lemma and iword.lemma[-1] in ['و', 'ا']) or (iword.postfixs and len(iword.postfixs) == 2 and OneShotTransformer.Y2 in iword.postfixs and iword.postfixs[0] in [OneShotTransformer.A, OneShotTransformer.HA]):
            return True
        #non connecting chars
        if iword.prefixs:
            last_pre = iword.prefixs[-1]
            if last_pre.non_connecting_chars and iword.lemma and any(iword.lemma.startswith(ch) for ch in last_pre.non_connecting_chars):
                return True
        if iword.postfixs:
            first_post = iword.postfixs[0]
            if first_post.non_connecting_chars and iword.lemma and any(iword.lemma.endswith(ch) for ch in first_post.non_connecting_chars):
                return True
        #hidden H # goshnashe
        if not iword.semi_mapper and not iword.append_h and iword.lemma and iword.lemma[-1] == 'ه' and iword.postfixs  and iword.lemma not in self.non_hidden_h_words:
            return True
        # h + h
        if iword.prefixs and iword.postfixs and len(iword.lemma) < 2:
            return True
        # خونهه - خونششونه
        if iword.append_h and (OneShotTransformer.H in iword.postfixs or (len(iword.postfixs) == 1 and OneShotTransformer.H1 in iword.postfixs) ):
           return True
        if iword.prefixs and (OneShotTransformer.B in iword.prefixs or OneShotTransformer.Y in iword.prefixs) and (iword.lemma and iword.lemma[0] in ['ا', 'ی', 'و']):
            return True
        if iword.lemma in self.isolated_words and (iword.prefixs or iword.postfixs):
            return True
        # verb + postfixs ex:  برنامه
        if (iword.is_verb and iword.prefixs) or(iword.is_verb and iword.postfixs and (len(iword.postfixs) > 1 or not any(p in iword.postfixs for p in OneShotTransformer.PossessiveـPronouns +[OneShotTransformer.V]))):
            return True
        return False

    def iword2str(self, iword):
        sorted_prefixs = list(sorted(iword.prefixs, key=lambda prefix: prefix.level))
        sorted_postfixs = list(sorted(iword.postfixs, key=lambda postfix: postfix.level))
        concated_str = ''
        zipped_prefixs = [(sorted_prefixs[i], sorted_prefixs[i + 1]) if i < len(sorted_prefixs) - 1 else (
        sorted_prefixs[i], iword.lemma) for i in range(len(sorted_prefixs))]
        for prev_prefix, prefix in zipped_prefixs:
            separator = self.get_separator(prev_prefix, prefix, append_h=False)
            prefix_formal = prev_prefix.formal
            concated_str += prefix_formal
            concated_str += separator

        concated_str += iword.lemma

        zipped_postfix = [(sorted_postfixs[i - 1], sorted_postfixs[i]) if i > 0 else (iword.lemma, sorted_postfixs[i])
                          for i in range(len(sorted_postfixs))]
        for postfix, next_postfix in zipped_postfix:
            separator = self.get_separator(postfix, next_postfix, append_h=iword.append_h)
            concated_str += separator
            postfix_formal = next_postfix.formal
            concated_str += postfix_formal
        return concated_str

    def to_formals(self, iword):
        str_iwords = []
        all_iwords = self.lemma_to_formals(iword)
        for iword in all_iwords:
            # if iword.lemma == 'اون':
            #     print('')
            if len(iword.lemma) == 1 and iword.lemma != 'و':
                str_iwords.append(('', None))
                continue
            if self.filtered_based_on_rules(iword):
                str_iwords.append(('', None))
                continue
            if self.should_ignore_by_postagg(iword):
                str_iwords.append(('', None))
                continue
            if not iword.is_verb and not iword.semi_mapper and iword.lemma not in self.vocab:
                str_iwords.append(('', None))
                continue
            concated_str = self.iword2str(iword)
            str_iwords.append((concated_str, iword))
        return str_iwords

    def un_in(self, iword):
        new_lemma = iword.lemma.replace('ون', 'ان')
        if new_lemma != iword.lemma:
            return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
        else:
            return False

    def prefix_obj(self, word):
        op_separete = {'م': 'من', 'ت': 'تو', 'ش': 'آن', 'تان': 'شما', 'تون': 'شما', 'شون': 'آنان', 'شان': 'آنان',
                       'مان': 'ما', 'مون': 'ما'}
        candidates = []
        formal = ''
        m = self.pre_obj_pattern.match(word)
        if m:
            tokens = m.groups()
            if tokens[0] == 'باها':
                formal += 'با'
            else:
                formal += tokens[0]
            formal_obj = op_separete[tokens[1]]
            formal += ' '
            formal += formal_obj
            if tokens[2] is not None:
                formal += ' '
                formal += 'هم'
            alts = {'هم': 'هستم', 'آن': 'او'}
            tokens = [[w] for w in formal.split()]
            for t in tokens:
                if t[0] in alts:
                    t.append(alts[t[0]])

            candidates = itertools.product(*tokens)
            candidates = [' '.join(cnd) for cnd in candidates]

        return [(c, c) for c in candidates]



    def append_tanvin_hat(self, iword):
        if len(iword.lemma) > 1 and iword.lemma[0] == 'ا' and iword.lemma[-1] != 'ا':
            new_lemma = 'آ' + iword.lemma[1:]
            return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
        if len(iword.lemma) > 1 and iword.lemma[-1] == 'ا':
            new_lemma = iword.lemma[:-1] + 'اً'
            return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
        return False

    def append_h(self, iword):
        not_apply = self.verb_to_formal_func(iword.lemma) or (iword.lemma and iword.lemma[-1] in ['ا', 'و', 'ی'])  or len(iword.lemma) <= 1 or iword.lemma =='' or iword.lemma[-1] == 'ه' or (OneShotTransformer.H in iword.postfixs and len(iword.postfixs) == 1) or any(p in iword.postfixs for p in OneShotTransformer.As) or(OneShotTransformer.V in iword.postfixs) or (iword.postfixs and iword.postfixs[0].word[0] in ['ی', 'و','ا'])
        ######## when add h?
        new_lemma = iword.lemma + 'ه'
        ############# new_lemma in self.vocab
        if len(iword.postfixs) > 0 and not any([p in OneShotTransformer.cant_append_h_posts for p in iword.postfixs]) and not not_apply and new_lemma not in self.non_hidden_h_words:
        # if len(iword.postfixs) > 0 and not not_apply and new_lemma in self.vocab and new_lemma not in self.non_hidden_h_words:
            return InformalWord(lemma=new_lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos, append_h= True)
        return False

    def __init__(self, vocab, mapper, verb_to_formal_func, ignore_words, postfix_mapper, isolated_words, non_hidden_h_words):
        self.vocab = vocab
        self.mapper = mapper
        self.verb_to_formal_func = verb_to_formal_func
        self.ignore_words = ignore_words
        self.postfix_mapper = postfix_mapper
        self.isolated_words = isolated_words
        self.non_hidden_h_words = non_hidden_h_words
        self.operators = [self.un_in, self.append_h, self.append_tanvin_hat]
        patt = r'(از|به|باها)(مان|شون|شان|مون|م|تون|تان|ت|ش)(م)?$'
        self.pre_obj_pattern = re.compile(patt)

    def all_sequence_of_postfixs(self, word, index):
        all_seqs  =[]
        for p in OneShotTransformer.posts[index]:
            p_w = p.word
            if word.startswith(p_w):
                w = word[len(p_w):]
                if len(w) == 0:
                    all_seqs.append(p)
                else:
                    if index < len(OneShotTransformer.posts) -1 :
                        resp = self.all_sequence_of_postfixs(w, index+1)
                        if len(resp) > 0:
                            for item in resp:
                                if type(item) == list:
                                    item.append(p)
                                    sequence_with_p = item
                                else:
                                    sequence_with_p = [p, item]
                                all_seqs.append(sequence_with_p)
        if index < len(OneShotTransformer.posts) - 1:
            resp = self.all_sequence_of_postfixs(word, index + 1)
            all_seqs.extend(resp)
        else:
            return all_seqs
        return all_seqs

    def combine(self, l1, l2):
        if len(l1) == 0:
            return l2
        elif len(l2) == 0:
            return l1
        return list(itertools.product(l1, l2))


    def get_expand(self, iword):
        all_possible_words = []
        for subset_operators in utils.powerset(self.operators):
            new_iword = InformalWord(lemma=iword.lemma, prefixs=iword.prefixs, postfixs=iword.postfixs, pos=iword.pos)
            for so in subset_operators:
                so_resp = so(new_iword)
                if so_resp:
                    new_iword = so_resp
            all_possible_words.append(new_iword)
        return all_possible_words


    def match_postfixs(self, word, pos):
        possible_combinatios = []
        for i in range(len(OneShotTransformer.posts)):
            for p in OneShotTransformer.posts[i]:
                p_word = p.word
                p_indxs = [indx for indx, ch in enumerate(word) if word[indx:indx+len(p_word)] == p_word]
                for p_indx in p_indxs:
                    if p_indx != -1:
                        lemma = word[:p_indx]
                        pp = word[p_indx + len(p_word):]
                        if len(pp) ==0:
                            iw = InformalWord(lemma=lemma, postfixs=[p], pos=pos)
                            possible_combinatios.append(iw)
                            continue
                        if i < len(OneShotTransformer.posts) -1:
                            all_postfix = self.all_sequence_of_postfixs(pp, index=i+1)
                            if len(all_postfix) > 0:
                                for pfixs in all_postfix:
                                    if type(pfixs) == list:
                                        pfixs.append(p)
                                    else:
                                        pfixs = [p, pfixs]
                                    iw = InformalWord(lemma=lemma, postfixs=pfixs, pos=pos)
                                    possible_combinatios.append(iw)
                        elif len(pp) == 0:
                            iw = InformalWord(lemma=lemma, postfixs=[p], pos=pos)
                            possible_combinatios.append(iw)

        return possible_combinatios

    def match_prefixs(self, word, pos):
        possible_combinatios = []
        for i in range(len(OneShotTransformer.pres)):
            for p in OneShotTransformer.pres[i]:
                if word.startswith(p.word):
                    lemma = word[len(p.word):]
                    prefixs = [p]
                    iw = InformalWord(lemma=lemma, prefixs=prefixs, postfixs=[], pos=pos)
                    possible_combinatios.append(iw)
                    return possible_combinatios
        return []

    def parse_word(self, iword):
        parsed_resp = []
        prefixed_word = self.match_prefixs(iword.lemma,pos=iword.pos)
        prefixed_word.append(iword)
        parsed_resp.extend(prefixed_word)
        for pw in prefixed_word:
            postfixed_iwords = self.match_postfixs(pw.lemma,pos=iword.pos)
            for piw in postfixed_iwords:
                piw.prefixs = pw.prefixs
                parsed_resp.append(piw)
        return parsed_resp

    def is_seqs_of_verbs(self, txt):
        words = txt.split()
        if len(words) < 2:
            return False
        for w in words:
            formal_verb = self.verb_to_formal_func(w)
            if formal_verb is None:
                return False
        if words[-1] in ['است', 'هست']:
            return False
        return True

    def filter_results(self, word_lemmas):
        return list(filter(lambda wl: len(wl[0])>0 and wl[0][-1] != '‌' and not self.is_seqs_of_verbs(wl[0]), word_lemmas))

    def concatenate_formal_words(self, pre, next):
        """
        خانه +‌ ت -> خانه‌ات
        دیگر + ای -> دیگری
        """
        nim_fasele = '‌'
        not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
        if len(pre) < 1 :
            return next
        if pre[-1] in ['ه'] and next in ['م', 'ت', 'ش']:
            return pre + nim_fasele + 'ا' + next
        if pre[-1] == 'ا'and next.split() and next.split()[0] in ['م', 'ت', 'ش', 'مان', 'تان', 'شان']:
            return pre + nim_fasele + 'ی' + next
        if pre[-1] not in ['ه'] and next in ['ای']:
            return pre + 'ی'
        out = pre  + next
        if pre[-1] not in not_connect_chars or next.startswith('ها') or pre[-1] in ['ه'] or pre + nim_fasele + next in self.vocab:
            out = pre + nim_fasele + next
        if self.verb_to_formal_func(next):
            out = pre + ' ' + next
        return out

    def handle_nim_fasele_words(self, word, pos):
        def extract_lemma_nim_fasele_words(word, pos):
            formal_prefixs = []
            formal_postfixs = []
            prefixs = {'اون': 'آن', 'همون': 'همین'}
            postfixs = self.postfix_mapper
            tokens = word.split('‌')
            index = 0
            for i in range(len(tokens)):
                index = i
                if tokens[i] not in prefixs:
                    break
                else:
                    formal_prefixs.append(prefixs[tokens[i]])

            for i in range(len(tokens), index, -1):
                current_tok = '‌'.join(tokens[index:i])
                if current_tok in self.vocab or tokens[i - 1] not in postfixs:
                    return formal_prefixs, current_tok, formal_postfixs
                else:
                    formal_postfixs.append(postfixs[tokens[i - 1]])
            return formal_prefixs, current_tok, formal_postfixs
        nim_fasele = '‌'
        candidates = []
        formal_word = ''
        verbs = self.verb_to_formal_func(word)
        if verbs:
            return [(v, v) for v in verbs]
        all_candidates = set()
        # lemma
        formal_prefixs, lemma, formal_postfixs = extract_lemma_nim_fasele_words(word, pos)
        word_lemmas = self.transform(lemma, pos, ignore_nim_fasele=True)
        # lemma with postfix should len=1
        one_token_words = [wl for wl in word_lemmas if len(wl[0].split()) == 1]
        if formal_postfixs and one_token_words:
            all_formal_lemma_candidates = one_token_words
        else:
            all_formal_lemma_candidates = word_lemmas
        if not all_formal_lemma_candidates:
                if formal_postfixs or formal_prefixs:
                    all_formal_lemma_candidates = [(lemma, lemma)]
                else:
                    tokens = lemma.split(nim_fasele)
                    if all(self.transform(t, None, ignore_nim_fasele=True) for t in tokens):
                        w = ' '.join(tokens)
                        return [(w, w)]
                    else:
                        return []
        for cnd_lemma, formal_word_lemma in all_formal_lemma_candidates:
            formal_word = ''
            toks = formal_prefixs + [cnd_lemma] + formal_postfixs
            for index, t in enumerate(toks):
                formal_word = self.concatenate_formal_words(formal_word, t)
            all_candidates.add((formal_word, formal_word_lemma))
            #     if t in self.postfix_mapper:
            #         formal_t = self.postfix_mapper[t]
            #     else:
            #         transform_outputs = self.transform(t, pos)
            #         if not transform_outputs:
            #             formal_t = t
            #         else:
            #             one_word_outputs = [ft for ft in transform_outputs if len(ft.split()) == 1]
            #             if one_word_outputs:
            #                 if t in one_word_outputs:
            #                     formal_t = t
            #                 else:
            #                     formal_t = one_word_outputs[0]
            #             else:
            #                 formal_t = transform_outputs.pop()
        return all_candidates



    def transform(self, word, pos, ignore_nim_fasele=False):
        """ignore emoji , punctuation, numbers"""
        ignore_chars = '.1234567890!@#$%^&*()_+۱۲۳۴۵۶۷۸۹÷؟×−+?><}،,{":' + string.ascii_lowercase + string.ascii_uppercase
        if any(ic in word for ic in ignore_chars) or utils.if_emoji(word):
            return [(word, word)]
        """handle nim fasele"""
        nim_fasele = '‌'
        if not ignore_nim_fasele and nim_fasele in word:
            return self.handle_nim_fasele_words(word, pos)
        # pass ignore words and accept as correct informal word!
        if word in self.ignore_words and not word in self.mapper:
            return [(word, word)]
        formal_prefix_obj = self.prefix_obj(word)
        if formal_prefix_obj:
            return formal_prefix_obj
        iword = InformalWord(lemma=word, pos=pos)
        expanded_candidates = []
        candidates = self.parse_word(iword)
        #just verbs
        if any(c.is_verb for c in candidates):
            candidates = [c for c in candidates if c.is_verb]
        for cnd in candidates:
            expanded_candidates.extend(self.get_expand(cnd))
        word_iwords = []
        for ec in expanded_candidates:
            word_iwords.extend(self.to_formals(ec))
        if any(f[1] and (f[1].is_mapper or f[1].is_verb) for f in word_iwords if f[1] is not None):
            word_iwords = [f for f in word_iwords if f[1] and (f[1].is_mapper or f[1].is_verb)]
        # else:
        word_lemmas_set = [(w, iword.lemma) for w, iword in word_iwords if iword is not None]
        word_lemmas_set = set(word_lemmas_set)
        out = self.filter_results(word_lemmas_set)
        # if type(out) == str:
        #     out = [out]
        # out = set(out)
        return out

if __name__ == '__main__':
    transformer = OneShotTransformer(None, None, None)
    candidates =  transformer.match_postfixs('کارامم')
    print(candidates)