"""Build a word-level spelling-correction table and pickle it.

On import/run this module reads ``correct.txt`` and ``incorrect.txt``,
pairs their lines and whitespace-separated tokens by position, stores the
resulting ``misspelling -> correction`` mapping in ``correction_dict`` and
writes that mapping to ``correction_model.pkl``.
"""
import pickle
from typing import Dict, Optional, Tuple

import textdistance


def _load_correction_dict(correct_path: str, incorrect_path: str) -> Dict[str, str]:
    """Return a ``misspelling -> correction`` mapping built from two files.

    Line *i* of ``incorrect_path`` is paired with line *i* of
    ``correct_path`` and, within a pair of lines, token *j* with token *j*.
    If a misspelling occurs more than once, the last pairing wins.
    """
    mapping: Dict[str, str] = {}
    # NOTE(review): no explicit encoding is passed, so the platform default
    # is used -- confirm the data files match it.
    with open(correct_path, 'r') as correct_file, \
            open(incorrect_path, 'r') as incorrect_file:
        for correct_line, incorrect_line in zip(correct_file, incorrect_file):
            # NOTE(review): zip() truncates to the shorter token list, so a
            # line pair with differing token counts is paired positionally
            # and may yield misaligned entries -- verify the input data.
            mapping.update(zip(incorrect_line.split(), correct_line.split()))
    return mapping


correction_dict = _load_correction_dict('correct.txt', 'incorrect.txt')


def predict_correction(
    incorrect_word: str, correction_dict: Dict[str, str]
) -> Tuple[Optional[str], float]:
    """Return ``(correction, distance)`` for ``incorrect_word``.

    An exact hit on a known misspelling returns its correction with
    distance ``0``. Otherwise the known misspelling with the smallest
    Levenshtein distance is located (the first one wins on ties) and the
    correction *mapped to it* is returned together with that distance.
    For an empty ``correction_dict`` the result is ``(None, inf)``.
    """
    if incorrect_word in correction_dict:
        return correction_dict[incorrect_word], 0

    best_key = None
    best_distance = float('inf')
    # The dictionary keys are the known misspellings, so the search runs
    # over those, not over the corrected words.
    for known_misspelling in correction_dict:
        distance = textdistance.levenshtein(incorrect_word, known_misspelling)
        if distance < best_distance:
            best_distance = distance
            best_key = known_misspelling

    if best_key is None:
        return None, best_distance
    # Return the correction for the nearest misspelling. Returning the key
    # itself would hand back another misspelled word.
    return correction_dict[best_key], best_distance


def correct_text(input_text: str, correction_dict: Dict[str, str]) -> str:
    """Correct every whitespace-separated word of ``input_text``.

    Each word is replaced by the result of ``predict_correction``; when no
    (non-empty) correction is available the word is kept unchanged. The
    corrected words are joined with single spaces.
    """
    corrected_words = []
    for word in input_text.split():
        best_correction, _distance = predict_correction(word, correction_dict)
        corrected_words.append(best_correction or word)
    return ' '.join(corrected_words)


# Persist the table for later use. Only unpickle this file from a trusted
# location: loading a pickle can execute arbitrary code.
with open('correction_model.pkl', 'wb') as model_file:
    pickle.dump(correction_dict, model_file)