import pickle
import math
import os

from .normalizer import Normalizer
from .tokenizer import Tokenizer
from .data_helper import DataHelper


class SpellCheck:
    def __init__(self):
        self.normalizer = Normalizer()
        self.tokenizer = Tokenizer()
        self.data_helper = DataHelper()
        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"
        self.bigram_lm = self.data_helper.load_var(self.dir_path + "resource/spell/mybigram_lm.pckl")
        self.onegram_lm = self.data_helper.load_var(self.dir_path + "resource/spell/onegram.pckl")
        # Groups of Persian letters that sound alike; a substitution inside one of
        # these groups is treated as a more plausible misspelling.
        self.ingroup_chars = [{'ا', 'آ', 'ع'}, {'ت', 'ط'}, {'ث', 'س', 'ص'}, {'ح', 'ه'},
                              {'ذ', 'ز', 'ض', 'ظ'}, {'ق', 'غ'}]

    def deletion(self, word):
        """Candidates produced by deleting a single character."""
        p_list = []
        for k in range(len(word)):
            if word[k] == '-' or word[k] == '#':
                continue
            begin = word[:k]
            end = word[k + 1:]
            tmp_string = begin + end
            p_list.append(tmp_string)
        return p_list

    def splitting(self, word):
        """Candidates produced by splitting the word in two; '-' marks the split point."""
        p_list = set()
        delimiter = '-'
        for i, char in enumerate(word):
            begin = word[:i].strip('\u200c')
            end = word[i:].strip('\u200c')
            tmp_string = begin + delimiter + end
            p_list.add(tmp_string)
        return list(p_list)

    def insertion(self, word):
        """Candidates produced by inserting one letter (or a ZWNJ) at any position."""
        p_list = []
        alphabet = ['ا', 'آ', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'ژ',
                    'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن',
                    'و', 'ه', 'ی', '\u200c']  # '\u200c' is the zero-width non-joiner (half-space)
        for k in range(len(word) + 1):
            for char in alphabet:
                begin = word[:k]
                end = word[k:]
                tmp_string = begin + char + end
                p_list.append(tmp_string)
        return p_list

    def substitution(self, word):
        """Candidates produced by replacing one character with another letter."""
        p_list = []
        alphabet = ['ا', 'آ', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'ژ',
                    'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن',
                    'و', 'ه', 'ی']
        for i, char in enumerate(word):
            if char == '-' or char == '#':
                continue
            for c in alphabet:
                begin = word[:i]
                end = word[i + 1:]
                tmp_string = begin + c + end
                p_list.append(tmp_string)
        return p_list

    def transposition(self, word):
        """Candidates produced by swapping each pair of adjacent characters."""
        p_list = []
        word = list(word)
        tmp_word = word[:]
        for k1 in range(len(word)):
            k2 = k1 + 1
            if k2 == len(word):
                break
            tmp_word[k1], tmp_word[k2] = tmp_word[k2], tmp_word[k1]
            tmp_string = ''.join(tmp_word)
            p_list.append(tmp_string)
            tmp_word = word[:]
        return p_list

    def build_similar_words(self, word_seq, index, zi, operation):
        z_list = []
        o_list = []
        if operation == "Spell":
            tmp = self.deletion(zi)
            for i in tmp:
                z_list.append(i)
                o_list.append("Deletion")
            tmp = self.insertion(zi)
            for i in tmp:
                z_list.append(i)
                o_list.append("Insertion")
            tmp = self.substitution(zi)
            for i in tmp:
                z_list.append(i)
                o_list.append("Substitution")
            tmp = self.transposition(zi)
            for i in tmp:
                z_list.append(i)
                o_list.append("Transposition")
        elif operation == "Split":
            tmp = self.splitting(zi)
            for i in tmp:
                z_list.append(i)
                o_list.append("Split")
        elif operation == "Merg":
            if index < len(word_seq) - 1:
                # '#' marks the junction between the current word and the next one.
                tmp = zi + '#' + word_seq[index + 1]
                z_list.append(tmp)
                o_list.append("Merg")
        return [z_list, o_list]

    def bigram_markov_factor(self, yi_1, yi):
        bigram_counts, total_count = self.bigram_lm
        tmp = (yi_1, yi)
        if tmp in bigram_counts:
            x = bigram_counts[tmp]
            x = float(x) / total_count
            x = math.log2(x)
            return x
        else:
            return -28

    def get_word_probability(self, word):
        lex_dict = self.onegram_lm[0]
        total_words = self.onegram_lm[1]
        if word in lex_dict:
            count = lex_dict[word]
            logprob = math.log2(float(count) / total_words)
            return logprob
        else:
            return -50.0

    def isword(self, x):
        if abs(x.find('#') - x.find('-')) == 1:
            return False
        dash_idx = x.find('-')
        if dash_idx != -1:
            first = x[:dash_idx]       # from the beginning up to (not including) the dash
            second = x[dash_idx + 1:]  # from after the dash to the end of the string
            if self.get_word_probability(first) < -49:
                return False
            elif self.get_word_probability(second) < -49:
                return False
            else:
                return True
        sharp_idx = x.find('#')
        if sharp_idx != -1:
            begin = x[:sharp_idx]
            end = x[sharp_idx + 1:]
            tmp_str = begin + end
            if self.get_word_probability(tmp_str) < -49:
                return False
            else:
                return True
        else:
            if self.get_word_probability(x) < -49:
                return False
            else:
                return True

    def get_possible_words(self, word_seq, index):
        wi = word_seq[index]
        possible_words = []
        operation_list = []
        possible_words.append(wi)
        operation_list.append("Nothing")
        if len(wi) == 1:
            return possible_words, operation_list
        # Generate candidates in three families: Merg, Split, and Spell (edit-distance-1 edits).
        [c_list, o_list] = self.build_similar_words(word_seq, index, wi, "Merg")
        for i, c in enumerate(c_list):
            if self.isword(c):
                possible_words.append(c)
                operation_list.append(o_list[i])
        [c_list, o_list] = self.build_similar_words(word_seq, index, wi, "Split")
        for i, c in enumerate(c_list):
            if self.isword(c):
                possible_words.append(c)
                operation_list.append(o_list[i])
        [c_list, o_list] = self.build_similar_words(word_seq, index, wi, "Spell")
        for i, c in enumerate(c_list):
            if self.isword(c):
                possible_words.append(c)
                operation_list.append(o_list[i])
        return possible_words, operation_list

    def select_n_best(self, c_list, o_list, n=3):
        my_dict = {}
        map_dict = {}
        for i, word in enumerate(c_list):
            if o_list[i] == 'Merg':
                tmp_word = word.replace("#", "")
                prob = self.get_word_probability(tmp_word)
            elif o_list[i] == 'Split':
                begin = word.split('-')[0]
                end = word.split('-')[1]
                prob = float(self.get_word_probability(begin) + self.get_word_probability(end)) / 2
            else:
                prob = self.get_word_probability(word)
            if word not in my_dict:
                my_dict[word] = prob
                map_dict[word] = o_list[i]
        n_best = set(sorted(my_dict, key=my_dict.get, reverse=True)[:n])
        # Always keep the original (unchanged) word among the candidates.
        n_best.add(c_list[0])
        n_best = list(n_best)
        n_best_op = [map_dict[key] for key in n_best]
        return n_best, n_best_op

    def is_ingroup_substitution(self, main_word, candidate_word):
        main_word = list(main_word)
        candidate_word = list(candidate_word)
        flag = False
        for i, c in enumerate(main_word):
            if c == candidate_word[i]:
                continue
            else:
                flag = False
                for group in self.ingroup_chars:
                    if c in group and candidate_word[i] in group:
                        flag = True
                        break
                break
        return flag

    def select_correct_spell(self, candidate_list, next_candidates, next_next_candidates, prev_word, current_word):
        best_candidate = None
        best_operation = None
        best_score = -1000
        next_next_candidate_list = []
        next_next_operation_list = []
        candidate_list, operation_list = candidate_list
        if next_candidates is not None:
            next_candidate_list, next_operation_list = next_candidates
        else:
            next_candidate_list, next_operation_list = [None], ["Nothing"]
        if next_next_candidates is not None:
            next_next_candidate_list, next_next_operation_list = next_next_candidates
        else:
            next_next_candidate_list, next_next_operation_list = [None], ["Nothing"]
        for i, candidate in enumerate(candidate_list):
            operation = operation_list[i]
            if operation == "Split":
                begin = candidate[:candidate.find('-')]
                end = candidate[candidate.find('-') + 1:]
                candidate = begin
                next_word = end
                onegram_score = self.get_word_probability(candidate)
                bigram_score_with_prev = self.bigram_markov_factor(prev_word, candidate)
                bigram_score_next = -1000
                tmp_score_next = self.bigram_markov_factor(candidate, next_word)
                for j, next_next_word in enumerate(next_candidate_list):
                    opt = next_operation_list[j]
                    if opt == 'Merg':
                        next_next_word = next_next_word.replace("#", "")
                    elif opt == 'Split':
                        next_next_word = next_next_word.split('-')[0]
                    tmp_score_next_next = self.bigram_markov_factor(next_word, next_next_word)
                    if tmp_score_next_next > bigram_score_next:
                        bigram_score_next = tmp_score_next_next
                bigram_score_next = float(bigram_score_next + tmp_score_next) / 2
            elif operation == "Merg":
                begin = candidate[:candidate.find('#')]
                end = candidate[candidate.find('#') + 1:]
                candidate = begin + end
                onegram_score = self.get_word_probability(candidate)
                bigram_score_with_prev = self.bigram_markov_factor(prev_word, candidate)
                bigram_score_next = -1000
                for j, next_next_word in enumerate(next_next_candidate_list):
                    opt = next_next_operation_list[j]
                    if opt == 'Merg':
                        next_next_word = next_next_word.replace("#", "")
                    elif opt == 'Split':
                        next_next_word = next_next_word.split('-')[0]
                    tmp_score = self.bigram_markov_factor(candidate, next_next_word)
                    if tmp_score > bigram_score_next:
                        bigram_score_next = tmp_score
            else:
                onegram_score = self.get_word_probability(candidate)
                bigram_score_with_prev = self.bigram_markov_factor(prev_word, candidate)
                bigram_score_next = -1000
                for j, next_word in enumerate(next_candidate_list):
                    opt = next_operation_list[j]
                    if opt == 'Merg':
                        next_word = next_word.replace("#", "")
                    elif opt == 'Split':
                        next_word = next_word.split('-')[0]
                    tmp_score = self.bigram_markov_factor(candidate, next_word)
                    if tmp_score > bigram_score_next:
                        bigram_score_next = tmp_score
            # Heuristic bonuses that favour more plausible edit operations.
            if operation == 'Substitution':
                if self.is_ingroup_substitution(current_word, candidate):
                    onegram_score += 20
                else:
                    onegram_score += 10
            elif operation == 'Deletion' or operation == 'Insertion':
                onegram_score += 5
                if '\u200c' in candidate and '\u200c' not in current_word:
                    onegram_score += 5
            elif operation == 'Split' or operation == 'Merg':
                onegram_score += 7
            elif operation == 'Nothing':
                onegram_score += 20
            score = 1 * onegram_score + 0.7 * bigram_score_with_prev + 0.7 * bigram_score_next
            if score > best_score:
                best_operation = operation
                best_candidate = candidate_list[i]
                best_score = score
        return best_candidate, best_operation

    def spell_corrector(self, doc_string):
        words = self.tokenizer.tokenize_words(self.normalizer.normalize(doc_string))
        best_o_list = []
        best_candidates_list = []
        yi_1 = None
        merged_before = False
        suggest_list = []
        for i, word in enumerate(words):
            [c_list, o_list] = self.get_possible_words(words, i)
            n_best = self.select_n_best(c_list, o_list, n=15)
            suggest_list.append(n_best)
        for i, candidate_list in enumerate(suggest_list):
            if merged_before:
                # The current word was already consumed by a merge with the previous one.
                merged_before = False
                continue
            if (i + 2) < len(suggest_list):
                next_candidates = suggest_list[i + 1]
                next_next_candidates = suggest_list[i + 2]
            elif (i + 1) < len(suggest_list):
                next_candidates = suggest_list[i + 1]
                next_next_candidates = None
            else:
                next_candidates = None
                next_next_candidates = None
            best_candidate, best_operation = self.select_correct_spell(
                candidate_list, next_candidates, next_next_candidates, yi_1, words[i])
            merged_before = False
            if best_operation == "Split":
                begin = best_candidate.split('-')[0]
                end = best_candidate.split('-')[1]
                best_candidate = [begin, end]
            if best_operation == "Merg":
                best_candidate = best_candidate.replace("#", "")
                merged_before = True
            if isinstance(best_candidate, str):
                best_candidate = [best_candidate]
            best_o_list.append(best_operation)
            best_candidates_list.extend(best_candidate)
            yi_1 = best_candidate[-1]
        res = " ".join(best_candidates_list)
        ops = " ".join(best_o_list)
        return res


if __name__ == "__main__":
    # The demo sentence intentionally contains misspellings for the corrector to fix.
    doc_string = "نمازگذاران وارد مسلی شدند."
    myspell_checker = SpellCheck()
    res = myspell_checker.spell_corrector(doc_string)
    print(res)
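
    # Illustrative sketch (not part of the original demo): the intermediate candidate
    # machinery can also be inspected directly, which helps when tuning the scoring
    # heuristics. It reuses only methods defined above; the exact candidates and their
    # ranking depend on the bundled unigram/bigram models, and the word index (2) assumes
    # the tokenizer keeps the misspelled "مسلی" at that position in the demo sentence.
    words = myspell_checker.tokenizer.tokenize_words(
        myspell_checker.normalizer.normalize(doc_string))
    candidates, operations = myspell_checker.get_possible_words(words, 2)
    top_candidates, top_operations = myspell_checker.select_n_best(candidates, operations, n=5)
    print(list(zip(top_candidates, top_operations)))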