from typing import Optional, Union

from nltk import edit_distance

from utilities.utils import answer_letter
from utilities_language_general.rus_constants import nlp
from utilities_language_general.rus_constants import morph
from utilities_language_general.rus_constants import stop_list
from utilities_language_general.rus_constants import SIMILARITY_VALUES_w2v
from utilities_language_general.rus_constants import SIMILARITY_VALUES_bert


def prepare_target_words(target_words: str) -> list:
    """Normalise a comma-separated string of target words and expand each word
    into its full inflectional paradigm."""
    target_words = target_words.lower().replace(' ,', ',').replace(',', ', ').replace('  ', ' ').split(', ')
    TARGET_WORDS = set()
    for target_word in target_words:
        # Collect every form of the most probable parse of the word.
        paradigm = {form.word for form in morph.parse(target_word)[0].lexeme}
        TARGET_WORDS = TARGET_WORDS.union(paradigm)
    return list(TARGET_WORDS)


def compute_frequency_dict(text: str) -> dict:
    """
    Compute the frequency dictionary of the given text.

    Lemmas are taken from spaCy first; pymorphy2 lemmas are added for tokens
    whose normal form spaCy did not produce.

    :param text: given text as a string
    :return: frequency dictionary {lemma: frequency}
    """
    freq_dict = {}
    doc = nlp(text)
    lemma_list_spacy = [token.lemma_ for token in doc]
    for lemma in lemma_list_spacy:
        if lemma.isalpha():
            freq_dict[lemma] = freq_dict.get(lemma, 0) + 1
    lemma_list_morph = [morph.parse(token.text)[0].normal_form for token in doc]
    for lemma in lemma_list_morph:
        if lemma.isalpha() and lemma not in lemma_list_spacy:
            freq_dict[lemma] = freq_dict.get(lemma, 0) + 1
    return freq_dict


def convert_gender(gender_spacy: str) -> str:
    """Map a spaCy gender value to its pymorphy2 equivalent."""
    genders = {'Masc': 'masc', 'Fem': 'femn', 'Neut': 'neut'}
    return genders[gender_spacy]


def define_gender(token: str) -> Optional[str]:
    """
    Prettify defining the gender of a token.

    :param token: the word whose gender is to be identified
    :return: gender as a string ('masc', 'femn', 'neut') or None
    """
    token = str(token)
    return morph.parse(token)[0].tag.gender
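
# Usage sketch for the helpers above (illustrative inputs; actual values depend
# on the pymorphy2 dictionaries loaded in rus_constants):
#   >>> define_gender('книга')
#   'femn'
#   >>> convert_gender('Fem')
#   'femn'
#   >>> sorted(prepare_target_words('дом'))[:3]  # full paradigm of the lemma
#   ['дом', 'дома', 'домам']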


def get_tags(token: str) -> Optional[tuple]:
    """
    Prettify getting the tags of a word and the method used to obtain them.
    If pymorphy2 is too unsure how to parse the word (more than 3 parses),
    None is returned.

    :param token: the word whose tags and parsing method are to be identified
    :return: a (tags, method) pair -- tags as a set usable for synthesis and
             the method as a string -- or None
    """
    parse = morph.parse(token)
    # Too ambiguous to trust a single parse (see docstring).
    if len(parse) > 3:
        return None
    tags = str(parse[0].tag)
    parts = (tag for part in tags.split(' ') for tag in part.split(','))
    method = str(parse[0].methods_stack[0][0])
    return set(parts), method


def make_inflection(text: str, pos: Union[str, list], tags: set) -> Optional[str]:
    """Inflect a word or an underscore-joined phrase to the given grammemes."""
    if isinstance(pos, list):
        # Phrase: keep only the grammemes a noun-adjective phrase can share.
        if set(pos).issubset({'NOUN', 'ADJ', 'PROPN'}):
            noun_adjective_phrase_tags = {'nomn', 'gent', 'datv', 'accs', 'ablt', 'loct', 'voct',
                                          'gen2', 'acc2', 'loc2', 'sing', 'plur'}
            tags = tags.intersection(noun_adjective_phrase_tags)
        lemmas = text.split('_')
        word_form = ''
        # Parse the phrase and take gender and animacy from its noun.
        gender = None
        animacy = None
        parsed = []
        for lemma in lemmas:
            parse = morph.parse(lemma)[0]
            if parse.tag.POS == 'NOUN':
                if gender is None:
                    gender = parse.tag.gender
                if animacy is None:
                    # Animacy is the second grammeme in the tag string,
                    # e.g. 'NOUN,inan,masc sing,nomn'.
                    animacy = str(parse.tag).split(',')[1]
            parsed.append(parse)
        tags = tags.union({gender, animacy})
        tags.discard(None)
        try:
            for parse in parsed:
                word_form += f'{parse.inflect(tags).word} '
            return word_form.strip()
        except AttributeError:
            # inflect() returned None: no form matches the combined grammemes.
            return None
    else:
        word_form = morph.parse(text)[0].inflect(tags)
        return word_form.word if word_form is not None else None


def check_token(token, lemma_pos, model, current_minimum: set = None, stop_words=stop_list,
                check_allowed_pos: set = None, check_allowed_dep: set = None) -> bool:
    """Decide whether a spaCy token may serve as a target word for an exercise."""
    not_allowed_pos = {'PROPN', 'PUNCT', 'NUM'}
    not_allowed_synt_dep = {'cop', }  # 'ROOT'
    if lemma_pos == 'auto':
        lemma_pos = f'{token.lemma_}_{token.pos_}'
    if not token.text.isalpha():
        return False
    if current_minimum is not None and token.lemma_ not in current_minimum:
        return False
    if not model.has_index_for(lemma_pos):
        return False
    tags_and_method = get_tags(token.text)
    if tags_and_method is None:
        return False
    tags, method = tags_and_method
    # Only dictionary-backed parses are reliable enough.
    if method != 'DictionaryAnalyzer()':
        return False
    if token.text in stop_words or token.is_stop:
        return False
    pos_ok = (token.pos_ in check_allowed_pos if check_allowed_pos is not None
              else token.pos_ not in not_allowed_pos)
    dep_ok = (token.dep_ in check_allowed_dep if check_allowed_dep is not None
              else token.dep_ not in not_allowed_synt_dep)
    return pos_ok and dep_ok
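
# Usage sketch for make_inflection (illustrative; outputs depend on the
# pymorphy2 dictionaries, and phrase inflection returns None whenever no form
# matches the combined grammemes):
#   >>> make_inflection('книга', 'NOUN', {'plur', 'gent'})
#   'книг'
#   >>> make_inflection('новый_дом', ['ADJ', 'NOUN'], {'sing', 'accs'})
#   'новый дом'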


def check_token_bert(token, current_minimum: set = None, stop_words=stop_list,
                     check_allowed_pos: set = None, check_allowed_dep: set = None) -> bool:
    """Same as check_token, but without the embedding-model vocabulary check."""
    not_allowed_pos = {'PROPN', 'PUNCT', 'NUM'}
    not_allowed_synt_dep = {'cop', }  # 'ROOT'
    if not token.text.isalpha():
        return False
    if current_minimum is not None and token.lemma_ not in current_minimum:
        return False
    tags_and_method = get_tags(token.text)
    if tags_and_method is None:
        return False
    tags, method = tags_and_method
    # Only dictionary-backed parses are reliable enough.
    if method != 'DictionaryAnalyzer()':
        return False
    if token.text in stop_words or token.is_stop:
        return False
    pos_ok = (token.pos_ in check_allowed_pos if check_allowed_pos is not None
              else token.pos_ not in not_allowed_pos)
    dep_ok = (token.dep_ in check_allowed_dep if check_allowed_dep is not None
              else token.dep_ not in not_allowed_synt_dep)
    return pos_ok and dep_ok


def get_distractors_from_model(model, lemma: str, pos: str, gender: Optional[str],
                               global_distractors: set, distractor_minimum: set,
                               level_name: str, max_num_distractors: int,
                               max_length_ratio=5, min_edit_distance_ratio=0.5):
    """Pick distractors for a target lemma from a word2vec model keyed by lemma_POS."""
    distractors = []
    query = lemma if '_' in lemma else f'{lemma}_{pos}'
    # Strip POS parts: 'w1_POS1_w2_POS2' -> 'w1_w2'.
    lemma = '_'.join(lemma.split('_')[::2])
    if model.has_index_for(query):
        candidates = model.most_similar(query, topn=max_num_distractors + 100)
    else:
        if query.count('_') == 1:
            # A single word missing from the vocabulary: nothing to average.
            return None
        # Phrase: average the vectors of its lemma_POS parts.
        query_raw_list = query.split('_')
        query_parts = ['_'.join(query_raw_list[i:i + 2]) for i in range(0, len(query_raw_list), 2)]
        query_vector = model.get_mean_vector(query_parts)
        candidates = model.similar_by_vector(query_vector, topn=max_num_distractors + 100)
    for candidate in candidates:
        if candidate[0].count('_') == 1:
            # Single-word candidate.
            distractor_lemma, distractor_pos = candidate[0].split('_')
            distractor_similarity = candidate[1]
            candidate_gender = define_gender(distractor_lemma)
            length_ratio = abs(len(lemma) - len(distractor_lemma))
            condition = ((distractor_pos == pos
                          or (distractor_pos in ('VERB', 'ADJ', 'phrase')
                              and pos in ('VERB', 'ADJ', 'phrase')))
                         and distractor_lemma != lemma
                         and len(distractors) < 100
                         and distractor_similarity < SIMILARITY_VALUES_w2v[level_name]
                         and candidate_gender == gender
                         and length_ratio <= max_length_ratio
                         and distractor_lemma not in global_distractors
                         and edit_distance(lemma, distractor_lemma)
                         / ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio)
            if condition:
                if distractor_minimum is not None:
                    if distractor_lemma in distractor_minimum:
                        distractors.append((distractor_lemma, distractor_similarity))
                        global_distractors.add(distractor_lemma)
                else:
                    distractors.append((distractor_lemma, distractor_similarity))
                    global_distractors.add(distractor_lemma)
        else:
            # Two-word candidate ('lemma1_POS1_lemma2_POS2').
            if candidate[0].count('_') > 3 or pos in ('NOUN', 'ADJ', 'NUM'):
                continue
            d1_lemma, d1_pos, d2_lemma, d2_pos = candidate[0].split('_')
            distractor_lemma = f'{d1_lemma}_{d2_lemma}'
            distractor_similarity = candidate[1]
            condition = (((d1_pos == pos or d2_pos == pos)
                          or (d1_pos in ('VERB', 'AUX', 'SCONJ', 'ADP')
                              and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP'))
                          or (d2_pos in ('VERB', 'AUX', 'SCONJ', 'ADP')
                              and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP')))
                         and candidate[0] != lemma
                         and distractor_lemma != lemma
                         and len(distractors) < 100
                         and distractor_similarity < SIMILARITY_VALUES_w2v[level_name]
                         and distractor_lemma not in global_distractors)
            if condition:
                if distractor_minimum is not None:
                    if (distractor_lemma in distractor_minimum
                            or (d1_lemma in distractor_minimum and d2_lemma in distractor_minimum)):
                        distractors.append((candidate[0], distractor_similarity))
                        global_distractors.add(distractor_lemma)
                else:
                    distractors.append((candidate[0], distractor_similarity))
                    global_distractors.add(distractor_lemma)
    # Cap the number of required distractors at four answer options.
    max_num_distractors = min(4, max_num_distractors)
    if len(distractors) >= max_num_distractors:
        return distractors
    return None
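
# Usage sketch (hypothetical model file and level name; assumes a gensim
# KeyedVectors trained on lemma_POS keys, which is what has_index_for /
# most_similar above expect):
#   >>> from gensim.models import KeyedVectors
#   >>> w2v = KeyedVectors.load('ru_w2v.kv')  # hypothetical path
#   >>> get_distractors_from_model(w2v, lemma='дом', pos='NOUN', gender='masc',
#   ...                            global_distractors=set(), distractor_minimum=None,
#   ...                            level_name='B1', max_num_distractors=4)
#   [('дворец', 0.61), ('гараж', 0.58), ...]  # shape of a successful result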


def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: str, pos: str,
                                    gender: Optional[str], global_distractors: set,
                                    distractor_minimum: set, level_name: str,
                                    max_num_distractors: int, max_length_ratio=5,
                                    min_edit_distance_ratio=0.5):
    """Pick distractors for a masked gap with a BERT fill-mask pipeline."""
    _distractors = []
    try:
        bert_candidates = model(text_with_masked_task, top_k=max_num_distractors + 100)
        # The fill-mask pipeline returns a nested list when the text contains
        # several mask tokens; keep the candidates of the first mask.
        if bert_candidates and isinstance(bert_candidates[0], list):
            bert_candidates = bert_candidates[0]
        inflected_candidates = []
        for candidate in bert_candidates:
            if candidate['token_str'].isalpha():
                candidate_morph = nlp(candidate['token_str'])[0]
                inflected_candidates.append((f'{candidate_morph.lemma_}_{candidate_morph.pos_}',
                                             candidate['score']))
    except KeyError:
        return None
    for candidate_distractor in inflected_candidates:
        if '_' in candidate_distractor[0]:
            distractor_lemma, distractor_pos = candidate_distractor[0].split('_')
        else:
            candidate_morph = nlp(candidate_distractor[0])[0]
            distractor_lemma, distractor_pos = candidate_morph.lemma_, candidate_morph.pos_
        distractor_similarity = candidate_distractor[1]
        candidate_gender = define_gender(distractor_lemma)
        length_ratio = abs(len(lemma) - len(distractor_lemma))
        if (((distractor_pos == pos)
             or (pos in ('VERB', 'ADJ', 'phrase') and distractor_pos in ('VERB', 'ADJ', 'phrase')))
                and distractor_lemma != lemma
                and (len(_distractors) < max_num_distractors + 10)
                and (distractor_similarity < SIMILARITY_VALUES_bert[level_name])
                and (candidate_gender == gender)
                and (length_ratio <= max_length_ratio)  # may need changing for phrases
                and (distractor_lemma not in global_distractors)
                and (edit_distance(lemma, distractor_lemma)  # may need changing for phrases
                     / ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio)):
            if distractor_minimum is not None:
                if distractor_lemma in distractor_minimum:
                    _distractors.append((distractor_lemma, candidate_distractor[1]))
                    global_distractors.add(distractor_lemma)
            else:
                _distractors.append((distractor_lemma, candidate_distractor[1]))
                # Remember the distractor globally, as in the w2v variant.
                global_distractors.add(distractor_lemma)
    # Cap the number of required distractors at four answer options.
    num_distractors = min(4, max_num_distractors)
    if len(_distractors) < num_distractors:
        return None
    return _distractors


def prepare_tasks(input_variants):
    """Render numbered tasks: a student sheet, a teacher sheet with answers, and a key."""
    TASKS_STUDENT = ''
    TASKS_TEACHER = ''
    KEYS_ONLY = ''
    RAW_TASKS = []
    RAW_KEYS_ONLY = []
    RESULT_TASKS_STUDENT = []
    TASKS_WITH_ANSWERS_L = []
    KEYS = []
    for num, item in enumerate(input_variants):
        item = item[0]
        answer = item[0].lower()
        variants = '\t'.join([i.lower() for i in item[1]])
        current_answer_letter = answer_letter(answer=answer, variants=[i.lower() for i in item[1]])
        RAW_TASKS.append((num + 1, variants))
        RAW_KEYS_ONLY.append((num + 1, current_answer_letter.split(' ')[0]))
        RESULT_TASKS_STUDENT.append(f"{num + 1}.\t{variants}")
        # 'Ответ' = 'Answer', 'Варианты' = 'Options' (learner-facing Russian labels).
        TASKS_WITH_ANSWERS_L.append(f"{num + 1}.\t"
                                    f"Ответ: {current_answer_letter}\n\t"
                                    f"Варианты: {variants}")
        KEYS.append(f"{num + 1}.\tОтвет: {current_answer_letter}")
    for task in RESULT_TASKS_STUDENT:
        TASKS_STUDENT += f'{task}\n'
    for task in TASKS_WITH_ANSWERS_L:
        TASKS_TEACHER += f'{task}\n'
    for task in KEYS:
        KEYS_ONLY += f'{task}\n'
    return {'TASKS_STUDENT': TASKS_STUDENT, 'TASKS_TEACHER': TASKS_TEACHER, 'KEYS_ONLY': KEYS_ONLY,
            'RAW_TASKS': RAW_TASKS, 'RAW_KEYS_ONLY': RAW_KEYS_ONLY}
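
# Usage sketch for prepare_tasks (hypothetical input; each task is wrapped in an
# extra sequence, matching the `item = item[0]` unwrapping above; the exact key
# format comes from utilities.utils.answer_letter):
#   >>> out = prepare_tasks([[('дом', ['дом', 'стол', 'мир'])]])
#   >>> print(out['TASKS_STUDENT'], end='')
#   1.	дом	стол	мир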