from functools import reduce
import itertools
import json
import re
import string

import pandas as pd
from hazm import Normalizer, WordTokenizer

normalizer = Normalizer()
tokenizer = WordTokenizer(separate_emoji=True)

def seprate_emoji_string(txt):
    """Surround emoji sequences with spaces so they are tokenized separately."""
    try:
        # Wide Unicode build: emoji code points can be matched directly.
        oRes = re.compile(u'(['
                          u'\U0001F300-\U0001F64F'
                          u'\U0001F680-\U0001F6FF'
                          u'\u2600-\u26FF\u2700-\u27BF]+)',
                          re.UNICODE)
    except re.error:
        # Narrow Unicode build: match emoji as surrogate pairs instead.
        oRes = re.compile(u'(('
                          u'\ud83c[\udf00-\udfff]|'
                          u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
                          u'[\u2600-\u26FF\u2700-\u27BF])+)',
                          re.UNICODE)

    return oRes.sub(r' \1 ', txt)

def cleanify(txt):
    """Basic character cleanup, then hazm normalization and tokenization."""
    txt = txt.strip()
    txt = re.sub(r'\s+', ' ', txt)
    txt = re.sub('\u200f', '', txt)   # strip right-to-left marks
    # The literal characters in the next three substitutions were lost from the
    # source; they are assumed to be invisible directional marks and non-standard
    # spaces being removed or normalized to a plain space.
    txt = re.sub('\u200e+', '', txt)
    txt = re.sub('\u00a0', ' ', txt)
    txt = re.sub('\u2009', ' ', txt)
    txt = normalizer.normalize(txt)
    txt = seprate_emoji_string(txt)
    txt = ' '.join(tokenizer.tokenize(txt))
    return txt

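# Illustrative usage (exact output depends on the installed hazm version and its
# normalization rules):
#   cleanify('سلام😀  خوبی؟')  ->  'سلام 😀 خوبی ؟'
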
def clean_text_for_lm(txt):
    """Drop tokens containing digits, punctuation, Latin letters, or emoji."""
    ignore_chars = '.1234567890!@#$%^&*()_+۱۲۳۴۵۶۷۸۹÷؟×−+?><}،,{":' + string.ascii_letters
    tokens = txt.split()
    clean_tokens = [t for t in tokens if not (any(ic in t for ic in ignore_chars) or if_emoji(t))]
    return ' '.join(clean_tokens)

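# Illustrative usage:
#   clean_text_for_lm('سلام hello 123 خوبی')  ->  'سلام خوبی'
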
def add_to_mapper(mapping_list):
    """Append (informal, formal) pairs to resources/mapper.csv."""
    print(len(mapping_list))
    df = pd.read_csv('resources/mapper.csv', delimiter=',', index_col=None)
    print(df.columns)
    # DataFrame.append was removed in pandas 2.x; build the new rows and concat instead.
    new_rows = pd.DataFrame([{'formal': item[1], 'informal': item[0]} for item in mapping_list])
    df = pd.concat([df, new_rows], ignore_index=True)
    df.to_csv('resources/mapper.csv', index=False)

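# resources/mapper.csv is assumed to be a headed two-column CSV, e.g.:
#   formal,informal
#   می‌آید,میاد
# so add_to_mapper([('میاد', 'می‌آید')]) appends that row.
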
def extract_non_convertable_words(corpus_addr, tokenizer, normalizer, transformer, output_addr, vocab):
    """Collect corpus words the transformer cannot convert, together with how often they occur."""
    non_convertables = {}
    seen_words = set()
    with open(corpus_addr, encoding='utf-8') as f:
        for i, line in enumerate(f):
            print(i)
            line = normalizer.normalize(line)
            tokens = tokenizer.tokenize(line)
            for t in tokens:
                if t in seen_words:
                    if t in non_convertables:
                        non_convertables[t] += 1
                else:
                    candidates = transformer.transform(t, None)
                    if not candidates:
                        non_convertables[t] = 1
                    seen_words.add(t)
    words_count = sorted(non_convertables.items(), key=lambda item: item[1], reverse=True)
    words_count = [str(word) + ' ########### ' + str(count) for (word, count) in words_count]
    with open(output_addr, 'w+', encoding='utf-8') as f:
        f.write('\n'.join(words_count))

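# Output file format: one non-convertible word per line, most frequent first, e.g.:
#   میدونی ########### 42
# (the count shown is illustrative).
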
def generate_irrgular_informal_verbs():
    """
    Generate informal -> formal mappings for irregular colloquial verb forms.
    Examples of the informal forms covered:
    برمیگرده میوفته برمیداره برمیگردونه درمیاره ایستادن نمیومد وامیسته

    اومد
    نیومد
    اومدی
    نیومدی
    میومدی
    نیومده
    یومد
    میومده
    """
    mapping_verbs = []
    past_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ید', 'ند', '', 'ن']
    neg = ['ن', '']
    pre = ['می', 'ب']
    pre_verbs = [('بر', 'دار'), ('در', 'یار'), ('وا', 'ست'), ('بر', 'گرد'), ('ور', 'دار'), ('بر', 'گشت')]
    extras = ['ن', 'نمی', 'می']
    mapper = {'ه': 'د', 'ن': 'ند', 'ین': 'ید', 'ور': 'بر', 'ست': 'ایست', 'وا': '', 'یار': 'آور'}
    for item in pre_verbs:
        for pe in past_ends:
            for ex in extras:
                p_end = pe
                item0 = item[0]
                item1 = item[1]
                inf = item0 + ex + item1 + p_end
                inf = inf.replace('یی', 'ی')
                if item0 in mapper:
                    item0 = mapper[item0]
                if item1 in mapper:
                    item1 = mapper[item1]
                if p_end in mapper:
                    p_end = mapper[p_end]
                formal = item0 + ex + item1 + p_end
                # Insert the half-space (ZWNJ) after the 'می' prefix in the formal form;
                # the ZWNJ was lost from the source and is assumed here.
                formal = formal.replace('می', 'می\u200c')
                formal = formal.replace('نآ', 'نیا')
                mapping_verbs.append([formal, inf])
    bons = ['یومد', 'یوفت']
    v_mapper = {'یومد': 'یامد', 'یوفت': 'افت'}
    verbs = itertools.product(neg, pre, bons, past_ends)
    for v in verbs:
        if (v[0] == 'ن' and v[1] == 'ب') or (v[2] == 'یومد' and v[1] == 'ب'):
            continue
        inf = v[0] + v[1] + v[2] + v[3]
        inf = inf.replace('یی', 'ی')
        pe = v[3]
        if pe in mapper:
            pe = mapper[pe]
        formal = v[0] + v[1] + '' + v_mapper[v[2]] + pe
        formal = formal.replace('یی', 'ی')
        # Restore the hat alef with a half-space before it ('یا' -> 'ی‌آ'); the ZWNJ
        # was lost from the source and is assumed here.
        formal = formal.replace('یا', 'ی\u200cآ')
        formal = formal.replace('دد', 'ده')
        formal = formal.replace('با', 'بی')
        mapping_verbs.append([formal, inf])
    add_to_mapper(mapping_verbs)

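# Sample of the generated [formal, informal] pairs, assuming the ZWNJ restorations above:
#   ['برمی‌گردد', 'برمیگرده']
#   ['نمی‌آمد', 'نمیومد']
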
def load_vocab(vocab_addr='resources/words.dat'):
    """Load the vocabulary: word<TAB>freq<TAB>tags per line; bare words are also accepted."""
    vocab = {}
    with open(vocab_addr, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                word, freq, p_tags = line.strip().split('\t')
                vocab[word] = {'freq': freq, 'tags': p_tags}
            except ValueError:
                # Lines without the three tab-separated fields are kept as bare words.
                word = line.strip()
                vocab[word] = {'freq': 1, 'tags': 'NUM'}
    return vocab

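# Expected words.dat layout, one entry per line (the tag format depends on the resource):
#   <word>\t<freq>\t<pos_tags>
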
def if_connect(word1, word2):
    """True if word1 may be glued to word2: either side is empty or word1 ends in a non-joining Persian letter."""
    not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
    if any(w == '' for w in [word1, word2]) or word1[-1] in not_connect_chars:
        return True
    return False

def split_conj_words(word, conjs):
    """Split a word that has a conjunction glued to its beginning or end into separate tokens."""
    candidates = set()
    sorted_conjs = sorted(conjs, key=lambda x: len(x), reverse=True)
    for c in sorted_conjs:
        indx = word.find(c)
        if indx != -1 and indx in [0, len(word) - 1]:
            pre_w = word[:indx]
            next_w = word[indx + len(c):]
            if if_connect(pre_w, c) and if_connect(c, next_w):
                cnd = ' '.join([pre_w, c, next_w])
                cnd = cnd.strip()
                candidates.add(cnd)
    return list(candidates)

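# Illustrative usage:
#   split_conj_words('وگفت', ['و'])  ->  ['و گفت']
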
def is_formal_prefixed(word, vocab):
    """True if the word is a vocabulary word carrying a formal plural/possessive suffix."""
    not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
    nim_fasele = '\u200c'  # half-space (ZWNJ)
    m1 = re.match('(.+)های(م|ت|ش|مان|تان|شان)?$', word)
    m2 = re.match('(.+[اوی])ی(م|ت|ش|مان|تان|شان)$', word)
    m3 = re.match('(.+[^اوی])(م|ت|ش|مان|تان|شان)$', word)
    m4 = re.match('(.+)(ها)$', word)
    m5 = re.match('(.+[هی])(اش|ام|ات)$', word)
    if m3 or m2:
        prefix_word = list(filter(lambda m: m is not None, [m3, m2]))[0].group(1)
        if prefix_word in vocab:
            return True
    m_fired = list(filter(lambda m: m is not None, [m1, m4, m5]))
    if len(m_fired) > 0:
        prefix_word = m_fired[0].group(1)
        if prefix_word[-1] != nim_fasele and prefix_word[-1] not in not_connect_chars:
            return False
        if prefix_word[-1] == nim_fasele and not (prefix_word[:-1] in vocab):
            return False
        if prefix_word[-1] != nim_fasele and not (prefix_word in vocab):
            return False
        return True
    return False

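# Illustrative check with a hypothetical vocab:
#   is_formal_prefixed('کتابم', {'کتاب': {'freq': 1, 'tags': 'N'}})  ->  True
#   ('کتاب' plus the possessive 'م' matches, and the stem is in the vocab.)
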
def spelling_similairty(word):
    """Spelling-based candidates for a word; currently only repeated-character variants."""
    # get_possible_repeated_word is expected to be provided elsewhere in the project.
    all_possible = get_possible_repeated_word(word)
    if word in all_possible:
        all_possible.remove(word)
    return all_possible

def add_nim_alef_hat_dictionary(vocab):
    """Map spellings missing the hat alef (آ) or the half-space back to the correct vocabulary word."""
    nim_fasele = '\u200c'  # half-space (ZWNJ); the character was lost from the source and is restored here
    word_with_hat = filter(lambda w: 'آ' in w, vocab)
    word_with_nim = filter(lambda w: nim_fasele in w, vocab)
    mapper1 = {w.replace('آ', 'ا').replace(nim_fasele, ''): w for w in word_with_hat}
    mapper2 = {w.replace(nim_fasele, ''): w for w in word_with_nim}
    mapper1.update(mapper2)
    return mapper1

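# Illustrative result for a vocab containing 'آمدن' and 'می‌رود':
#   {'امدن': 'آمدن', 'میرود': 'می‌رود'}
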
def generate_spell_mapper(vocab):
    """For every vocabulary word, generate likely misspellings (dropped hat, tanvin, half-space, hamzeh) and dump the misspelling -> correct-word map as JSON."""
    hat = 'آ'
    tanvin = 'اً'
    nim = '\u200c'  # half-space (ZWNJ)
    hamzeh = 'أ'
    hamzeh_y = 'ئ'
    sp_mapper = {hamzeh_y: ['ی'], hat: ['ا'], tanvin: ['ن', 'ا'], nim: ['', ' '], hamzeh: ['ا', '']}
    special_chars = [hat, tanvin, nim, hamzeh]
    out = {}
    for word in vocab:
        p_words = [word.replace(sp, sp_alt) for sp in special_chars for sp_alt in sp_mapper[sp]]
        p_words = list(set(p_words) - set([word]))
        for pw in p_words:
            if pw in out:
                out[pw].add(word)
            else:
                out[pw] = {word}
    out = {w: list(out[w]) for w in out}
    with open('spell_checker_mapper.json', 'w+', encoding='utf-8') as f:
        json.dump(out, f, ensure_ascii=False, indent=1)

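# Illustrative output entry for a vocab containing 'آمد':
#   {"امد": ["آمد"]}
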
def create_mapper_tanvin_hamze_hat_nim_fasele():
    """Build a map from spellings written without the hat alef or half-space to the correct spelling."""
    mapper = {}
    hats_word = open('resources/spell/words_with_hat.txt', encoding='utf-8').read().splitlines()
    nim_words = open('resources/spell/words_with_nim.txt', encoding='utf-8').read().splitlines()
    tanvin_words = open('resources/spell/words_with_tanvin.txt', encoding='utf-8').read().splitlines()
    hat_ch = 'آ'
    nim_fasele = '\u200c'  # half-space (ZWNJ)
    for w in hats_word:
        w_without_h = w.replace(hat_ch, 'ا')
        mapper[w_without_h] = w
    for w in nim_words:
        w_without_nim = w.replace(nim_fasele, '')
        mapper[w_without_nim] = w
        w_space_instead_nim = w.replace(nim_fasele, ' ')
        mapper[w_space_instead_nim] = w
    # Note: tanvin_words are loaded, but no tanvin mappings are built yet.
    return mapper

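# Illustrative entries, assuming words_with_hat.txt contains 'آمد' and
# words_with_nim.txt contains 'می‌رود':
#   {'امد': 'آمد', 'میرود': 'می‌رود', 'می رود': 'می‌رود'}
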
def extract_lemma_nim_fasele_words(word, vocab):
    """Strip known informal prefixes and suffixes from a half-space-separated word and return the lemma."""
    nim_fasele = '\u200c'  # half-space (ZWNJ); the separator was lost from the source and is restored here
    prefixs = ['اون']
    postfixs = {'ست': 'است', 'هام': 'هایم', 'ام': 'ام', 'ها': 'ها', 'هامون': 'هایمان', 'ترین': 'ترین', 'هایشان': 'هایشان'}
    tokens = word.split(nim_fasele)
    index = 0
    for i in range(len(tokens)):
        index = i
        if tokens[i] not in prefixs:
            break

    for i in range(len(tokens), 0, -1):
        current_tok = nim_fasele.join(tokens[index:i])
        if current_tok in vocab or tokens[i - 1] not in postfixs:
            return current_tok

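# Illustrative usage with a hypothetical vocab:
#   extract_lemma_nim_fasele_words('اون‌کتاب‌ها', {'کتاب': {}})  ->  'کتاب'
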
def if_emoji(text):
    try:
        oRes = re.compile(u'(['
                          u'\U0001F300-\U0001F64F'
                          u'\U0001F680-\U0001F6FF'
                          u'\u2600-\u26FF\u2700-\u27BF]+)',
                          re.UNICODE)
    except re.error:
        oRes = re.compile(u'(('
                          u'\ud83c[\udf00-\udfff]|'
                          u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
                          u'[\u2600-\u26FF\u2700-\u27BF])+)',
                          re.UNICODE)

    return oRes.findall(text)

def powerset(lst):
    return reduce(lambda result, x: result + [subset + [x] for subset in result],
                  lst, [[]])

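# All subsets of the input, e.g.:
#   powerset([1, 2])  ->  [[], [1], [2], [1, 2]]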