from functools import reduce
import itertools
import json
import re
import string

import pandas as pd
from hazm import Normalizer, WordTokenizer

normalizer = Normalizer()
tokenizer = WordTokenizer(separate_emoji=True)


def seprate_emoji_string(txt):
    """Put spaces around emoji runs so they become separate tokens."""
    try:
        # Wide UCS-4 build
        oRes = re.compile(u'(['
                          u'\U0001F300-\U0001F64F'
                          u'\U0001F680-\U0001F6FF'
                          u'\u2600-\u26FF\u2700-\u27BF]+)',
                          re.UNICODE)
    except re.error:
        # Narrow UCS-2 build
        oRes = re.compile(u'(('
                          u'\ud83c[\udf00-\udfff]|'
                          u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
                          u'[\u2600-\u26FF\u2700-\u27BF])+)',
                          re.UNICODE)
    return oRes.sub(r' \1 ', txt)


def cleanify(txt):
    """Collapse whitespace, drop RLM marks, tidy ZWNJs, then normalize and tokenize with hazm."""
    txt = txt.strip()
    txt = re.sub(r'\s+', ' ', txt)
    txt = re.sub('\u200f', '', txt)  # drop right-to-left marks
    txt = re.sub('‌+', '‌', txt)       # collapse repeated ZWNJs
    txt = re.sub('‌ ', ' ', txt)       # a ZWNJ next to a space is redundant
    txt = re.sub(' ‌', ' ', txt)
    txt = normalizer.normalize(txt)
    txt = seprate_emoji_string(txt)
    txt = ' '.join(tokenizer.tokenize(txt))
    return txt


def clean_text_for_lm(txt):
    """Drop tokens containing digits, punctuation, Latin letters, or emoji."""
    ignore_chars = ('.1234567890!@#$%^&*()_+۱۲۳۴۵۶۷۸۹÷؟×−+?><}،,{":'
                    + string.ascii_lowercase + string.ascii_uppercase)
    tokens = txt.split()
    clean_tokens = [t for t in tokens
                    if not (any(ic in t for ic in ignore_chars) or if_emoji(t))]
    return ' '.join(clean_tokens)


def add_to_mapper(mapping_list):
    """Append (informal, formal) pairs to resources/mapper.csv."""
    print(len(mapping_list))
    df = pd.read_csv('resources/mapper.csv', delimiter=',', index_col=None)
    print(df.columns)
    # DataFrame.append was removed in pandas 2.0; build the new rows once and concatenate.
    new_rows = pd.DataFrame([{'formal': item[1], 'informal': item[0]} for item in mapping_list])
    df = pd.concat([df, new_rows], ignore_index=True)
    df.to_csv('resources/mapper.csv', index=False)


def extract_non_convertable_words(corpus_addr, tokenizer, normalizer, transformer, output_addr, vocab):
    """Collect corpus words the transformer cannot convert, with their frequencies."""
    non_convertables = {}
    seen_words = set()
    nim_fasele = '‌'
    with open(corpus_addr) as f:
        for i, line in enumerate(f):
            print(i)
            # if i > 500:
            #     break
            line = normalizer.normalize(line)
            tokens = tokenizer.tokenize(line)
            for t in tokens:
                # if nim_fasele in t:
                #     print(t)
                if t in seen_words:
                    if t in non_convertables:
                        non_convertables[t] += 1
                else:
                    candidates = transformer.transform(t, None)
                    # if not candidates and any(t.startswith(pre) for pre in ['از', 'در', 'چند', 'هر', 'هیچ', 'هم', 'با', 'بی', 'تا', 'و']):
                    #     print(t)
                    if not candidates:
                        non_convertables[t] = 1
                    seen_words.add(t)
    words_count = sorted(non_convertables.items(), key=lambda item: item[1], reverse=True)
    words_count = [str(word) + ' ########### ' + str(count) for (word, count) in words_count]
    with open(output_addr, 'w+') as f:
        f.write('\n'.join(words_count))
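
# Illustrative usage sketch (not part of the original pipeline): shows how the two
# cleaning helpers above are meant to be chained. Assumes the hazm models behind the
# module-level `normalizer`/`tokenizer` are available.
def _demo_clean_text():
    raw = 'سلام   دوستان😀 123'
    cleaned = cleanify(raw)               # normalized text with emoji split into their own tokens
    lm_text = clean_text_for_lm(cleaned)  # tokens containing digits, Latin letters, or emoji removed
    print(cleaned)
    print(lm_text)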

def generate_irrgular_informal_verbs():
    """Generate formal/informal pairs for irregular informal verb forms, e.g.:
    برمیگرده میوفته برمیداره برمیگردونه درمیاره ایستادن نمیومد وامیسته اومد نیومد اومدی نیومدی میومدی نیومده یومد میومده
    """
    mapping_verbs = []
    past_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ید', 'ند', '', 'ن']
    neg = ['ن', '']
    pre = ['می', 'ب']
    pre_verbs = [('بر', 'دار'), ('در', 'یار'), ('وا', 'ست'), ('بر', 'گرد'), ('ور', 'دار'), ('بر', 'گشت')]
    extras = ['ن', 'نمی', 'می']
    mapper = {'ه': 'د', 'ن': 'ند', 'ین': 'ید', 'ور': 'بر', 'ست': 'ایست', 'وا': '', 'یار': 'آور'}
    for item in pre_verbs:
        for pe in past_ends:
            for ex in extras:
                p_end = pe
                item0 = item[0]
                item1 = item[1]
                inf = item0 + ex + item1 + p_end
                inf = inf.replace('یی', 'ی')
                if item0 in mapper:
                    item0 = mapper[item0]
                if item1 in mapper:
                    item1 = mapper[item1]
                if p_end in mapper:
                    p_end = mapper[p_end]
                formal = item0 + ex + item1 + p_end
                formal = formal.replace('می', 'می‌')
                formal = formal.replace('نآ', 'نیا')
                mapping_verbs.append([formal, inf])
    bons = ['یومد', 'یوفت']
    v_mapper = {'یومد': 'یامد', 'یوفت': 'افت'}
    verbs = itertools.product(neg, pre, bons, past_ends)
    for v in verbs:
        if (v[0] == 'ن' and v[1] == 'ب') or (v[2] == 'یومد' and v[1] == 'ب'):
            continue
        inf = v[0] + v[1] + v[2] + v[3]
        inf = inf.replace('یی', 'ی')
        pe = v[3]
        if pe in mapper:
            pe = mapper[pe]
        formal = v[0] + v[1] + '‌' + v_mapper[v[2]] + pe
        formal = formal.replace('ی‌ی', 'ی')
        formal = formal.replace('یا', 'ی‌آ')
        formal = formal.replace('دد', 'ده')
        formal = formal.replace('ب‌ا', 'بی')
        mapping_verbs.append([formal, inf])
    add_to_mapper(mapping_verbs)


def load_vocab(vocab_addr='resources/words.dat'):
    """Load the word list: word<TAB>frequency<TAB>POS-tags per line."""
    vocab = {}
    with open(vocab_addr, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                word, freq, p_tags = line.strip().split('\t')
                vocab[word] = {'freq': freq, 'tags': p_tags}
            except ValueError:
                word = line.strip()
                vocab[word] = {'freq': 1, 'tags': 'NUM'}
    return vocab


def if_connect(word1, word2):
    """True if a break between word1 and word2 is plausible: either side is empty,
    or word1 ends in a letter that does not join to the following character."""
    not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
    if any(w == '' for w in [word1, word2]) or word1[-1] in not_connect_chars:
        return True
    return False


def split_conj_words(word, conjs):
    """Propose splits of `word` around a conjunction glued to its start or end."""
    candidates = set()
    sorted_conjs = sorted(conjs, key=len, reverse=True)
    for c in sorted_conjs:
        indx = word.find(c)
        if indx != -1 and indx in [0, len(word) - 1]:
            pre_w = word[:indx]
            next_w = word[indx + len(c):]
            if if_connect(pre_w, c) and if_connect(c, next_w):
                cnd = ' '.join([pre_w, c, next_w])
                cnd = cnd.strip()
                candidates.add(cnd)
    return list(candidates)


def is_formal_prefixed(word, vocab):
    """True if `word` is a vocabulary word with an attached plural/possessive suffix."""
    not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
    nim_fasele = '‌'
    m1 = re.match('(.+)های(م|ت|ش|مان|تان|شان)?$', word)
    m2 = re.match('(.+[ا|و|ی])ی(م|ت|ش|مان|تان|شان)$', word)
    m3 = re.match('(.+[^ا^و^ی])(م|ت|ش|مان|تان|شان)$', word)
    m4 = re.match('(.+)(ها)$', word)
    m5 = re.match('(.+[ه|ی]‌)(اش|ام|ات)$', word)
    if m3 or m2:
        prefix_word = list(filter(lambda m: m is not None, [m3, m2]))[0].group(1)
        if prefix_word in vocab:
            return True
    m_fired = list(filter(lambda m: m is not None, [m1, m4, m5]))
    if len(m_fired) > 0:
        # print(word, m_fired[0].groups())
        prefix_word = m_fired[0].group(1)
        if prefix_word[-1] != nim_fasele and prefix_word[-1] not in not_connect_chars:
            return False
        if prefix_word[-1] == nim_fasele and not (prefix_word[:-1] in vocab):
            return False
        if prefix_word[-1] != nim_fasele and not (prefix_word in vocab):
            return False
        return True
    return False


def spelling_similairty(word):
    # get_possible_repeated_word is expected to be provided elsewhere in the project.
    all_possible = get_possible_repeated_word(word)
    if word in all_possible:
        all_possible.remove(word)
    return all_possible


def add_nim_alef_hat_dictionary(vocab):
    """Map spellings written without آ or ZWNJ back to their canonical vocabulary form."""
    word_with_hat = filter(lambda w: 'آ' in w, vocab)
    word_with_nim = filter(lambda w: '‌' in w, vocab)
    mapper1 = {w.replace('آ', 'ا').replace('‌', ''): w for w in word_with_hat}
    mapper2 = {w.replace('‌', ''): w for w in word_with_nim}
    mapper1.update(mapper2)
    return mapper1


def generate_spell_mapper(vocab):
    """Write spell_checker_mapper.json mapping common misspellings to possible correct words."""
    hat = 'آ'
    tanvin = 'اً'
    nim = '‌'
    hamzeh = 'أ'
    hamzeh_y = 'ئ'
    sp_mapper = {hamzeh_y: ['ی'], hat: ['ا'], tanvin: ['ن', 'ا'], nim: ['', ' '], hamzeh: ['ا', '']}
    special_chars = [hat, tanvin, nim, hamzeh]
    out = {}
    for word in vocab:
        p_words = [word.replace(sp, sp_alt) for sp in special_chars for sp_alt in sp_mapper[sp]]
        p_words = list(set(p_words) - set([word]))
        for pw in p_words:
            if pw in out:
                out[pw].add(word)
            else:
                out[pw] = {word}
    out = {w: list(out[w]) for w in out}
    with open('spell_checker_mapper.json', 'w+', encoding='utf-8') as f:
        json.dump(out, f, ensure_ascii=False, indent=1)
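
# Illustrative sketch (not part of the original module): what the variant mapper above
# produces for a toy vocabulary. Keys are spellings without آ or ZWNJ, values are the
# canonical forms.
def _demo_spell_variants():
    toy_vocab = ['آب', 'می‌روم']
    print(add_nim_alef_hat_dictionary(toy_vocab))
    # -> {'اب': 'آب', 'میروم': 'می‌روم'}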

def create_mapper_tanvin_hamze_hat_nim_fasele():
    mapper = {}
    hats_word = open('resources/spell/words_with_hat.txt').read().splitlines()
    nim_words = open('resources/spell/words_with_nim.txt').read().splitlines()
    tanvin_words = open('resources/spell/words_with_tanvin.txt').read().splitlines()
    hat_ch = 'آ'
    nim_fasele = '‌'
    for w in hats_word:
        w_without_h = w.replace(hat_ch, 'ا')
        mapper[w_without_h] = w
    for w in nim_words:
        # str has no .remove(); strip the ZWNJ with replace instead.
        w_without_nim = w.replace(nim_fasele, '')
        mapper[w_without_nim] = w
        w_space_instead_nim = w.replace(nim_fasele, ' ')
        mapper[w_space_instead_nim] = w
    # NOTE: tanvin_words is loaded but not mapped yet.
    return mapper


def extract_lemma_nim_fasele_words(word, vocab):
    """Skip known leading prefixes of a ZWNJ-joined word and return the longest chunk
    that is in the vocabulary or does not end in a known suffix."""
    prefixs = ['اون']
    postfixs = {'ست': 'است', 'هام': 'هایم', 'ام': 'ام', 'ها': 'ها',
                'هامون': 'هایمان', 'ترین': 'ترین', 'هایشان': 'هایشان'}
    tokens = word.split('‌')
    index = 0
    for i in range(len(tokens)):
        index = i
        if tokens[i] not in prefixs:
            break
    for i in range(len(tokens), 0, -1):
        current_tok = '‌'.join(tokens[index:i])
        if current_tok in vocab or tokens[i - 1] not in postfixs:
            return current_tok


def if_emoji(text):
    """Return the list of emoji runs found in `text` (empty list if none)."""
    try:
        # Wide UCS-4 build
        oRes = re.compile(u'(['
                          u'\U0001F300-\U0001F64F'
                          u'\U0001F680-\U0001F6FF'
                          u'\u2600-\u26FF\u2700-\u27BF]+)',
                          re.UNICODE)
    except re.error:
        # Narrow UCS-2 build
        oRes = re.compile(u'(('
                          u'\ud83c[\udf00-\udfff]|'
                          u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
                          u'[\u2600-\u26FF\u2700-\u27BF])+)',
                          re.UNICODE)
    return oRes.findall(text)


def powerset(lst):
    """All subsets of `lst`, built by folding each element into the subsets seen so far."""
    return reduce(lambda result, x: result + [subset + [x] for subset in result], lst, [[]])
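
# Minimal smoke-test sketch added for illustration; the original module defines no entry
# point. Only helpers that need no resource files are exercised here.
if __name__ == '__main__':
    print(powerset([1, 2, 3]))             # [[], [1], [2], [1, 2], [3], [1, 3], [2, 3], [1, 2, 3]]
    print(if_emoji('سلام 😀'))              # non-empty list -> the text contains emoji
    print(split_conj_words('ومن', ['و']))   # ['و من']: the glued conjunction is split off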