from functools import reduce |
import itertools |
import json |
import re |
import string |
import pandas as pd |
from hazm import Normalizer, WordTokenizer |
# Shared hazm normalizer instance used by the helpers in this module.
normalizer = Normalizer()
# Shared hazm word tokenizer, constructed with separate_emoji=True.
tokenizer = WordTokenizer(separate_emoji=True)
def seprate_emoji_string(txt):
    """Surround every run of emoji/pictograph characters in *txt* with spaces.

    Args:
        txt: Input string.

    Returns:
        A copy of *txt* in which each maximal run of characters from the
        ranges U+1F300-U+1F64F, U+1F680-U+1F6FF, U+2600-U+26FF and
        U+2700-U+27BF is wrapped in one leading and one trailing space.
    """
    try:
        # Regular (wide) builds: astral code points can be used directly
        # inside a character class.
        emoji_run = re.compile(
            u'(['
            u'\U0001F300-\U0001F64F'
            u'\U0001F680-\U0001F6FF'
            u'\u2600-\u26FF\u2700-\u27BF]+)',
            flags=re.UNICODE,
        )
    except re.error:
        # Fallback that spells the astral ranges out as surrogate pairs, for
        # interpreters that reject the pattern above.
        emoji_run = re.compile(
            u'(('
            u'\ud83c[\udf00-\udfff]|'
            u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
            u'[\u2600-\u26FF\u2700-\u27BF])+)',
            flags=re.UNICODE,
        )
    return emoji_run.sub(r' \1 ', txt)
def cleanify(txt):
    """Normalise whitespace and invisible characters, then re-tokenise *txt*.

    Steps, in order: trim; collapse whitespace runs to one space; drop
    right-to-left marks (U+200F); collapse runs of zero-width non-joiners
    (ZWNJ, U+200C) to one; drop a ZWNJ that touches a space; run the module
    level ``normalizer``; space-separate emoji; tokenise with the module level
    ``tokenizer`` and join the tokens with single spaces.

    NOTE(review): the three ZWNJ patterns were reconstructed. As previously
    written they contained no visible character: ``'+'`` is an invalid regex
    and ``' '`` -> ``' '`` is a no-op. Confirm against the upstream file.

    Args:
        txt: Raw input string.

    Returns:
        The cleaned, space-joined token string.
    """
    zwnj = '\u200c'
    txt = txt.strip()
    txt = re.sub(r'\s+', ' ', txt)
    txt = re.sub('\u200f', '', txt)
    # Several consecutive half-spaces carry no more meaning than one.
    txt = re.sub(zwnj + '+', zwnj, txt)
    # A half-space next to a real space is redundant.
    txt = re.sub(zwnj + ' ', ' ', txt)
    txt = re.sub(' ' + zwnj, ' ', txt)
    txt = normalizer.normalize(txt)
    txt = seprate_emoji_string(txt)
    txt = ' '.join(tokenizer.tokenize(txt))
    return txt
def clean_text_for_lm(txt):
    """Drop every whitespace-separated token that is unsuitable for the LM.

    A token is discarded when it contains any character from the ignore list
    (punctuation, ASCII and Persian digits, ASCII letters) or when
    ``if_emoji`` finds an emoji in it.

    Args:
        txt: Space-separated text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    ignore_chars = '.1234567890!@#$%^&*()_+۱۲۳۴۵۶۷۸۹÷؟×−+?><}،,{":' + string.ascii_lowercase + string.ascii_uppercase
    kept = []
    for token in txt.split():
        if any(ch in token for ch in ignore_chars):
            continue
        if if_emoji(token):
            continue
        kept.append(token)
    return ' '.join(kept)
def add_to_mapper(mapping_list):
    """Append (informal, formal) pairs to ``resources/mapper.csv``.

    The CSV is read, one row per pair is appended with the pair's first
    element in the ``informal`` column and its second in the ``formal``
    column, and the file is rewritten in place without the index.

    Args:
        mapping_list: Sized sequence of 2-item sequences ordered as
            ``(informal, formal)``.
    """
    print(len(mapping_list))
    df = pd.read_csv('resources/mapper.csv', delimiter=',', index_col=None)
    print(df.columns)
    if len(mapping_list) > 0:
        # DataFrame.append was removed in pandas 2.0; build all new rows once
        # and concatenate, which also avoids copying the frame per row.
        new_rows = pd.DataFrame(
            [{'formal': item[1], 'informal': item[0]} for item in mapping_list]
        )
        df = pd.concat([df, new_rows], ignore_index=True)
    df.to_csv('resources/mapper.csv', index=False)
def extract_non_convertable_words(corpus_addr, tokenizer, normalizer, transformer, output_addr, vocab):
    """Count corpus tokens for which the transformer yields no candidates.

    Each line of the corpus is normalised and tokenised. The first time a
    token is seen it is passed to ``transformer.transform(token, None)``; if
    the result is falsy the token is recorded with count 1, and every later
    occurrence increments that count. The result is written to *output_addr*,
    one ``word ########### count`` line per token, most frequent first.

    Args:
        corpus_addr: Path of the text corpus to scan (read as UTF-8).
        tokenizer: Object exposing ``tokenize(str)``.
        normalizer: Object exposing ``normalize(str)``.
        transformer: Object exposing ``transform(token, None)``.
        output_addr: Path of the report file to (over)write as UTF-8.
        vocab: Unused; kept for interface compatibility.
    """
    non_convertables = {}
    seen_words = set()
    # The context manager guarantees the corpus handle is closed even if a
    # collaborator raises midway.
    with open(corpus_addr, encoding='utf-8') as corpus:
        for i, line in enumerate(corpus):
            print(i)
            for t in tokenizer.tokenize(normalizer.normalize(line)):
                if t in seen_words:
                    # Only tokens already known to be non-convertible are counted.
                    if t in non_convertables:
                        non_convertables[t] += 1
                    continue
                seen_words.add(t)
                if not transformer.transform(t, None):
                    non_convertables[t] = 1
    ranked = sorted(non_convertables.items(), key=lambda item: item[1], reverse=True)
    report = [str(word) + ' ########### ' + str(count) for word, count in ranked]
    with open(output_addr, 'w', encoding='utf-8') as out:
        out.write('\n'.join(report))
def generate_irrgular_informal_verbs():
    """Generate irregular informal verb forms with formal spellings and store them.

    Two families are produced: prefixed verbs (a verbal prefix, an infix from
    ``extras``, a stem and a personal ending) and the stems in ``bons``
    combined with optional negation, a tense prefix and a personal ending.
    All pairs are handed to ``add_to_mapper``.

    Pairs are emitted as ``[informal, formal]`` because ``add_to_mapper``
    reads ``item[0]`` as the informal form and ``item[1]`` as the formal one.
    The previous ``[formal, informal]`` order wrote each form into the wrong
    CSV column.

    Examples of informal forms this targets:
        برمیگرده میوفته برمیداره برمیگردونه درمیاره ایستادن نمیومد وامیسته
        اومد نیومد اومدی نیومدی میومدی نیومده یومد میومده

    NOTE(review): ``formal.replace('می', 'می')`` and the ``+ '' +`` below are
    no-ops as written. Presumably a zero-width non-joiner was lost from those
    literals; they are kept unchanged here, so verify against the upstream file.
    """
    mapping_verbs = []
    past_ends = ['م', 'ی', 'ه', 'یم', 'ین', 'ید', 'ند', '', 'ن']
    neg = ['ن', '']
    pre = ['می', 'ب']
    pre_verbs = [('بر', 'دار'), ('در', 'یار'), ('وا', 'ست'), ('بر', 'گرد'), ('ور', 'دار'), ('بر', 'گشت')]
    extras = ['ن', 'نمی', 'می']
    # Informal fragment -> formal fragment (prefixes, stems and endings).
    mapper = {'ه':'د', 'ن': 'ند', 'ین': 'ید', 'ور': 'بر', 'ست':'ایست', 'وا':'', 'یار':'آور'}

    # Family 1: prefix + infix + stem + ending.
    for (head, stem), ending, ex in itertools.product(pre_verbs, past_ends, extras):
        inf = (head + ex + stem + ending).replace('یی', 'ی')
        formal = mapper.get(head, head) + ex + mapper.get(stem, stem) + mapper.get(ending, ending)
        formal = formal.replace('می', 'می')
        formal = formal.replace('نآ', 'نیا')
        mapping_verbs.append([inf, formal])

    # Family 2: negation + tense prefix + stem + ending.
    bons = ['یومد', 'یوفت']
    v_mapper = {'یومد': 'یامد', 'یوفت': 'افت'}
    for negation, prefix, bon, ending in itertools.product(neg, pre, bons, past_ends):
        # Skip every 'ب'-prefixed form that is negated or uses the 'یومد' stem.
        if prefix == 'ب' and (negation == 'ن' or bon == 'یومد'):
            continue
        inf = (negation + prefix + bon + ending).replace('یی', 'ی')
        formal = negation + prefix + '' + v_mapper[bon] + mapper.get(ending, ending)
        formal = formal.replace('یی', 'ی')
        formal = formal.replace('یا', 'یآ')
        formal = formal.replace('دد', 'ده')
        formal = formal.replace('با', 'بی')
        mapping_verbs.append([inf, formal])

    add_to_mapper(mapping_verbs)
def load_vocab(vocab_addr='resources/words.dat'):
    """Load the vocabulary file into a dictionary.

    Each non-blank line is either ``word<TAB>freq<TAB>tags`` or a bare entry.
    Three-field lines are stored as ``{'freq': <str>, 'tags': <str>}`` (the
    frequency is kept as the string read from the file). Any other non-blank
    line is stored whole as a word with ``{'freq': 1, 'tags': 'NUM'}``.
    Blank lines are skipped so that the empty string never becomes a word.

    Args:
        vocab_addr: Path of the UTF-8 vocabulary file.

    Returns:
        dict mapping each word to its ``freq``/``tags`` record.
    """
    vocab = {}
    with open(vocab_addr, 'r', encoding='utf-8') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                continue
            try:
                word, freq, p_tags = entry.split('\t')
            except ValueError:
                # Not exactly three tab-separated fields: treat the whole
                # line as a single word with default metadata.
                vocab[entry] = {'freq': 1, 'tags': 'NUM'}
            else:
                vocab[word] = {'freq': freq, 'tags': p_tags}
    return vocab
def if_connect(word1, word2):
    """Return True if either word is empty or *word1* ends in a listed letter.

    The listed letters are ا د ذ ر ز ژ و. *word2* is only checked for
    emptiness.
    """
    if word1 == '' or word2 == '':
        return True
    return word1[-1] in ('ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و')
def split_conj_words(word, conjs):
    """Propose splits of *word* around a conjunction glued to its edge.

    For each conjunction (longest first) whose first occurrence in *word*
    starts at index 0 or at the last character index, the word is cut into
    ``before``, conjunction and ``after``. The space-joined, stripped result
    is kept when ``if_connect`` accepts both junctions.

    NOTE(review): the end check compares against ``len(word) - 1``, so only a
    one-character conjunction can match at the end of the word. Confirm that
    is intended.

    Args:
        word: Token to analyse.
        conjs: Iterable of conjunction strings.

    Returns:
        List of distinct candidate strings (order unspecified).
    """
    found = set()
    edge_positions = (0, len(word) - 1)
    for conj in sorted(conjs, key=len, reverse=True):
        pos = word.find(conj)
        if pos == -1 or pos not in edge_positions:
            continue
        before = word[:pos]
        after = word[pos + len(conj):]
        if not if_connect(before, conj):
            continue
        if not if_connect(conj, after):
            continue
        found.add(' '.join([before, conj, after]).strip())
    return list(found)
def is_formal_prefixed(word, vocab):
    """Tell whether *word* is a vocabulary word plus a formally written suffix.

    Two checks are made in order:

    1. If the word ends in a pronominal clitic (م/ت/ش/مان/تان/شان, optionally
       after a linking ی that follows ا/و/ی) and the remaining stem is in
       *vocab*, the answer is True.
    2. Otherwise, for the plural/possessive patterns (``های`` + optional
       clitic, ``ها``, or ه/ی + اش/ام/ات) the stem must end in a zero-width
       non-joiner (ZWNJ) or in a letter from ``not_connect_chars``, and the
       stem (without a trailing ZWNJ) must be in *vocab*.

    NOTE(review): the ه/ی + اش/ام/ات pattern captures a stem ending in ه or ی,
    which can never pass the ZWNJ/letter test in step 2. A ZWNJ may have been
    lost from that pattern; it is left unchanged here.

    Args:
        word: Token to test.
        vocab: Container supporting ``in`` for known words.

    Returns:
        True if the word matches one of the accepted forms, else False.
    """
    not_connect_chars = ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و']
    # Half-space (ZWNJ). The literal had degraded to '', which made every
    # half-space comparison below unsatisfiable.
    nim_fasele = '\u200c'
    # Character classes list the letters directly: '|' and '^' inside [...]
    # are literal characters, not alternation or negation separators.
    m1 = re.match('(.+)های(م|ت|ش|مان|تان|شان)?$', word)
    m2 = re.match('(.+[اوی])ی(م|ت|ش|مان|تان|شان)$', word)
    m3 = re.match('(.+[^اوی])(م|ت|ش|مان|تان|شان)$', word)
    m4 = re.match('(.+)(ها)$', word)
    m5 = re.match('(.+[هی])(اش|ام|ات)$', word)

    clitic_match = m3 or m2
    if clitic_match and clitic_match.group(1) in vocab:
        return True

    suffix_match = m1 or m4 or m5
    if suffix_match is None:
        return False

    prefix_word = suffix_match.group(1)
    if prefix_word[-1] == nim_fasele:
        # Suffix attached with a half-space: look up the bare stem.
        return prefix_word[:-1] in vocab
    # Without a half-space, the stem must end in a letter from the list above.
    if prefix_word[-1] not in not_connect_chars:
        return False
    return prefix_word in vocab
def spelling_similairty(word):
    """Return the repeated-letter variants of *word*, excluding *word* itself.

    The candidates come from ``get_possible_repeated_word``. The collection it
    returns is modified in place (one occurrence of *word* is removed if
    present) and handed back.
    """
    candidates = get_possible_repeated_word(word)
    if word not in candidates:
        return candidates
    candidates.remove(word)
    return candidates
def add_nim_alef_hat_dictionary(vocab):
    """Map simplified spellings back to their canonical vocabulary words.

    Two kinds of keys are generated:

    * for words containing آ: the word with آ replaced by ا and all
      zero-width non-joiners (ZWNJ) removed;
    * for words containing a ZWNJ: the word with the ZWNJs removed.

    When both rules produce the same key, the ZWNJ rule wins.

    Args:
        vocab: Re-iterable collection of vocabulary words.

    Returns:
        dict from simplified spelling to the original word.
    """
    hat = 'آ'
    # The ZWNJ literal had degraded to '', which made the membership test
    # always true and the replace a no-op (every word mapped to itself).
    nim_fasele = '\u200c'
    mapper = {
        w.replace(hat, 'ا').replace(nim_fasele, ''): w
        for w in vocab
        if hat in w
    }
    mapper.update({w.replace(nim_fasele, ''): w for w in vocab if nim_fasele in w})
    return mapper
def generate_spell_mapper(vocab):
    """Build a misspelling -> correct-words map and dump it as JSON.

    For every vocabulary word, variants are produced by replacing one special
    character class at a time with each of its common substitutes:
    آ -> ا; tanvin (اً) -> ن or ا; zero-width non-joiner (ZWNJ) -> nothing or
    a space; أ -> ا or nothing. Variants identical to the word are dropped.
    The map is written to ``spell_checker_mapper.json`` in the working
    directory and also returned.

    NOTE(review): ``sp_mapper`` defines a substitute for ئ, but ئ is not in
    ``special_chars``, so it is never applied. Left as is; confirm intent.

    Args:
        vocab: Iterable of correctly spelled words.

    Returns:
        dict mapping each variant to a sorted list of the words it may stand for.
    """
    hat = 'آ'
    tanvin = 'اً'
    # The ZWNJ literal had degraded to ''; str.replace('', ' ') would insert a
    # space between every character of every word.
    nim = '\u200c'
    hamzeh = 'أ'
    hamzeh_y = 'ئ'
    sp_mapper = {hamzeh_y: ['ی'], hat: ['ا'], tanvin: ['ن', 'ا'], nim: ['', ' '], hamzeh: ['ا', '']}
    special_chars = [hat, tanvin, nim, hamzeh]

    collected = {}
    for word in vocab:
        variants = {
            word.replace(sp, sp_alt)
            for sp in special_chars
            for sp_alt in sp_mapper[sp]
        }
        variants.discard(word)
        # Sorted so the JSON key order is reproducible across runs.
        for variant in sorted(variants):
            collected.setdefault(variant, set()).add(word)

    out = {variant: sorted(words) for variant, words in collected.items()}
    with open('spell_checker_mapper.json', 'w', encoding='utf-8') as f:
        json.dump(out, f, ensure_ascii=False, indent=1)
    return out
def create_mapper_tanvin_hamze_hat_nim_fasele():
    """Build a map from simplified spellings to the listed canonical words.

    Reads ``resources/spell/words_with_hat.txt`` and
    ``resources/spell/words_with_nim.txt`` (one word per line, UTF-8):

    * each word containing آ is keyed by the same word with آ replaced by ا;
    * each word containing a zero-width non-joiner (ZWNJ) is keyed both by
      the word with the ZWNJ removed and by the word with it replaced by a
      space.

    NOTE(review): ``words_with_tanvin.txt`` used to be read here but was never
    used, so it is no longer opened. Tanvin and hamze handling, suggested by
    the function name, is not implemented.

    Returns:
        dict mapping each simplified spelling to its canonical word. (The
        function previously built this dict but returned nothing.)
    """
    hat_ch = 'آ'
    nim_fasele = '\u200c'

    def _read_words(path):
        # Context manager so the handle is always closed; blank lines skipped.
        with open(path, encoding='utf-8') as fh:
            return [w for w in fh.read().splitlines() if w]

    mapper = {}
    for w in _read_words('resources/spell/words_with_hat.txt'):
        mapper[w.replace(hat_ch, 'ا')] = w
    for w in _read_words('resources/spell/words_with_nim.txt'):
        # str has no .remove(); deleting the ZWNJ is a replace with ''.
        mapper[w.replace(nim_fasele, '')] = w
        mapper[w.replace(nim_fasele, ' ')] = w
    return mapper
def extract_lemma_nim_fasele_words(word, vocab):
    """Strip known prefixes/suffixes from a half-space-joined word.

    The word is split on the zero-width non-joiner (ZWNJ). Leading pieces
    listed in ``prefixs`` are skipped. Then, from the longest remaining span
    to the shortest, the first span that is in *vocab*, or whose last piece
    is not a known suffix, is returned.

    NOTE(review): the separator literal had degraded to ``''`` (for which
    ``str.split`` raises ValueError). Both the split and the re-join are
    assumed to use ZWNJ so that multi-piece lemmas keep their original
    spelling; verify against the upstream file.

    Args:
        word: Token possibly containing ZWNJ-separated affixes.
        vocab: Container supporting ``in`` for known words.

    Returns:
        The extracted lemma, or None if no span qualifies.
    """
    nim_fasele = '\u200c'
    prefixs = ['اون']
    postfixs = {'ست': 'است', 'هام':'هایم', 'ام':'ام', 'ها':'ها', 'هامون':'هایمان', 'ترین': 'ترین', 'هایشان':'هایشان'}
    tokens = word.split(nim_fasele)

    # Index of the first piece that is not a prefix (or the last piece if
    # every piece is a prefix).
    index = 0
    for i, token in enumerate(tokens):
        index = i
        if token not in prefixs:
            break

    # Try the longest candidate first, peeling one trailing suffix at a time.
    for end in range(len(tokens), 0, -1):
        current_tok = nim_fasele.join(tokens[index:end])
        if current_tok in vocab or tokens[end - 1] not in postfixs:
            return current_tok
    return None
def if_emoji(text):
    """Return the runs of emoji/pictograph characters found in *text*.

    Args:
        text: String to scan.

    Returns:
        The result of ``findall`` for runs of characters in the ranges
        U+1F300-U+1F64F, U+1F680-U+1F6FF, U+2600-U+26FF and U+2700-U+27BF.
        With the primary pattern this is a list of strings, empty (falsy)
        when there is no emoji.
    """
    try:
        # Regular (wide) builds: astral code points can be used directly
        # inside a character class.
        emoji_run = re.compile(
            u'(['
            u'\U0001F300-\U0001F64F'
            u'\U0001F680-\U0001F6FF'
            u'\u2600-\u26FF\u2700-\u27BF]+)',
            flags=re.UNICODE,
        )
    except re.error:
        # Fallback that spells the astral ranges out as surrogate pairs, for
        # interpreters that reject the pattern above.
        emoji_run = re.compile(
            u'(('
            u'\ud83c[\udf00-\udfff]|'
            u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
            u'[\u2600-\u26FF\u2700-\u27BF])+)',
            flags=re.UNICODE,
        )
    return emoji_run.findall(text)
def powerset(lst):
    """Return all subsets of *lst* as a list of lists.

    Subsets are built incrementally: each element doubles the current
    collection by appending itself to a copy of every subset found so far.
    The empty subset therefore comes first.
    """
    subsets = [[]]
    for element in lst:
        extended = [subset + [element] for subset in subsets]
        subsets = subsets + extended
    return subsets