Mongolian-spell-correction / worddistance.py
Erkadaflown's picture
Mongolian text correction
fc08cd7
import textdistance
import pickle
# Lookup table built from two parallel text files: each whitespace-separated
# word of 'incorrect.txt' is mapped to the word at the same line and position
# in 'correct.txt'.
correction_dict = {}

# Decode explicitly as UTF-8 instead of relying on the platform's locale
# default, which is not UTF-8 everywhere and would fail or mis-decode
# non-ASCII (here: Mongolian) text.
# NOTE(review): assumes both corpora are stored as UTF-8 -- confirm.
with open('correct.txt', 'r', encoding='utf-8') as correct_file, \
        open('incorrect.txt', 'r', encoding='utf-8') as incorrect_file:
    correct_lines = correct_file.readlines()
    incorrect_lines = incorrect_file.readlines()

# Lines are paired by position; zip() stops at the shorter file.
for correct_line, incorrect_line in zip(correct_lines, incorrect_lines):
    correct_words = correct_line.strip().split()
    incorrect_words = incorrect_line.strip().split()
    # Words are paired by position too, so a line pair with differing word
    # counts is only paired up to the shorter of the two. When the same
    # incorrect word occurs more than once, the last pairing wins.
    for incorrect_word, correct_word in zip(incorrect_words, correct_words):
        correction_dict[incorrect_word] = correct_word
def predict_correction(incorrect_word, correction_dict):
    """Return the best correction for ``incorrect_word`` and its distance.

    Args:
        incorrect_word: The word to correct.
        correction_dict: Mapping from an observed (possibly misspelled) word
            to its correct form.

    Returns:
        A ``(correction, distance)`` tuple. For a word that is a key of
        ``correction_dict`` this is ``(correction_dict[word], 0)``.
        Otherwise it is the correct word with the smallest Levenshtein
        distance to ``incorrect_word`` together with that distance, or
        ``(None, float('inf'))`` when ``correction_dict`` is empty.
    """
    # Exact hit: the word was seen in the training pairs.
    if incorrect_word in correction_dict:
        return correction_dict[incorrect_word], 0

    best_match = None
    best_distance = float('inf')
    # Search the *correct* words (the dict's values). The keys are the
    # misspelled forms, so picking the nearest key would "correct" a word
    # into another misspelling. dict.fromkeys() removes duplicates while
    # keeping insertion order, so ties are resolved deterministically
    # (the first candidate with the smallest distance wins).
    for candidate in dict.fromkeys(correction_dict.values()):
        distance = textdistance.levenshtein(incorrect_word, candidate)
        if distance < best_distance:
            best_distance = distance
            best_match = candidate
    return best_match, best_distance
def correct_text(input_text, correction_dict):
    """Correct ``input_text`` word by word and return the rebuilt string.

    Args:
        input_text: Text to correct; it is split on whitespace.
        correction_dict: Lookup table passed through to
            ``predict_correction`` for every word.

    Returns:
        The corrected words joined with single spaces, so the original
        whitespace layout is not preserved.
    """
    pieces = []
    for token in input_text.split():
        # Only the suggested word is used here; the distance is ignored.
        suggestion, _ = predict_correction(token, correction_dict)
        # A falsy suggestion (e.g. None) means nothing usable came back,
        # so the original token is kept.
        pieces.append(suggestion or token)
    return ' '.join(pieces)
# Persist the lookup table to 'correction_model.pkl' (overwriting any existing
# file) so it can be reloaded later with pickle.
with open('correction_model.pkl', 'wb') as model_file:
    # Pickler(file).dump(obj) is the documented equivalent of
    # pickle.dump(obj, file) with the default protocol.
    pickle.Pickler(model_file).dump(correction_dict)