import copy
import string
from random import random
from random import sample

from utilities_language_general.rus_constants import nlp
from utilities_language_general.rus_constants import PHRASES
from utilities_language_general.rus_constants import BAD_USER_TARGET_WORDS
from utilities_language_general.rus_utils import get_tags
from utilities_language_general.rus_utils import check_token
from utilities_language_general.rus_utils import define_gender
from utilities_language_general.rus_utils import convert_gender
from utilities_language_general.rus_utils import make_inflection
from utilities_language_general.rus_utils import get_distractors_from_model


class SENTENCE:
    def __init__(self, original: str, n_sentence: int, max_num_distractors: int):
        self.original = original
        self.n_sentence = n_sentence
        self.max_num_distractors = max_num_distractors
        self.parsed = nlp(self.original)
        self.sentence_lemma_pos = []
        self.sentence_phrases = []
        self.target_words = []

    def lemmatize_sentence(self):
        # Pair every token with the 'lemma_POS' key used for model lookups.
        for token in self.parsed:
            lemma_pos = f'{token.lemma_}_{token.pos_}'
            self.sentence_lemma_pos.append((lemma_pos, token))

    def bind_phrases(self):
        # Greedily merge adjacent token pairs that form a known phrase;
        # a token already consumed by a phrase is skipped on the next step.
        previous_was_phrase = False
        for i in range(len(self.sentence_lemma_pos) - 1):
            phrase_candidate = f'{self.sentence_lemma_pos[i][0]}_{self.sentence_lemma_pos[i + 1][0]}'
            if phrase_candidate in PHRASES and not previous_was_phrase:
                # A phrase is [phrase_key, {'original_token1': spacy.Token, 'original_token2': spacy.Token}]
                phrase = [
                    phrase_candidate,
                    {
                        'original_token1': self.sentence_lemma_pos[i][1],
                        'original_token2': self.sentence_lemma_pos[i + 1][1]
                    }
                ]
                self.sentence_phrases.append(phrase)
                previous_was_phrase = True
            else:
                if not previous_was_phrase:
                    self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
                previous_was_phrase = False
        # The pairwise loop above never reaches the last token:
        # append it unless it was already consumed by a phrase.
        if self.sentence_lemma_pos and not previous_was_phrase:
            self.sentence_phrases.append(self.sentence_lemma_pos[-1][1])

    def search_target_words_automatically(self, model, target_minimum: set, frequency_dict: dict = None):
        for token in self.sentence_phrases:
            # TODO: still do not have a w2v model with phrases,
            #  therefore cannot come up with the selection criteria.
            if isinstance(token, list):  # token is a phrase
                original_token1 = token[1]['original_token1']
                original_token2 = token[1]['original_token2']
                original_token1_tags = get_tags(original_token1.text)[0]
                original_token2_tags = get_tags(original_token2.text)[0]
                tags = original_token1_tags | original_token2_tags
                not_ner = original_token1.ent_type == 0 and original_token2.ent_type == 0
                target_word = {
                    'sentence_number': self.n_sentence,
                    'sentence_text': self.original,
                    'original_text': f'{original_token1.text} {original_token2.text}',
                    'lemma': token[0],
                    'pos': ('phrase', [original_token1.pos_, original_token2.pos_]),
                    # an arbitrary one of the two genders (set order is not guaranteed)
                    'gender': list({define_gender(original_token1), define_gender(original_token2)})[0],
                    'tags': tags,
                    'position_in_sentence': self.original.find(original_token1.text),
                    'not_named_entity': not_ner,
                    'frequency_in_text': 0
                }
                self.target_words.append(target_word)
            else:  # token is a plain spacy.Token
                if check_token(model=model, token=token, lemma_pos='auto', current_minimum=target_minimum):
                    target_word = {
                        'sentence_number': self.n_sentence,
                        'sentence_text': self.original,
                        'original_text': token.text,
                        'lemma': token.lemma_,
                        'pos': ('simple', token.pos_),
                        'gender': define_gender(token.lemma_),
                        'number_children': len(list(token.children)),
                        'tags': get_tags(token.text)[0],
                        'position_in_sentence': self.original.find(token.text),
                        'not_named_entity': token.ent_type == 0,
                        'frequency_in_text': frequency_dict.get(token.lemma_, 1) if frequency_dict else 1,
                    }
                    self.target_words.append(target_word)
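    # Illustrative shape of one entry in self.target_words (the values are
    # invented for illustration; the field set mirrors the dicts built above):
    #   {'sentence_number': 0,
    #    'sentence_text': 'Мама мыла раму.',
    #    'original_text': 'раму',
    #    'lemma': 'рама',
    #    'pos': ('simple', 'NOUN'),
    #    'gender': ...,              # whatever define_gender returns
    #    'number_children': 0,
    #    'tags': ...,                # grammeme tags from get_tags()
    #    'position_in_sentence': 10,
    #    'not_named_entity': True,
    #    'frequency_in_text': 1}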
    def search_user_target_words(self, model, user_target_words: set = None, frequency_dict: dict = None):
        for _utw in user_target_words:
            if _utw not in self.original:
                continue
            parse_utw = nlp(_utw)
            if ' ' in _utw:  # the user passed a two-word phrase
                tags = get_tags(parse_utw[0].text)[0] | get_tags(parse_utw[1].text)[0]
                user_target_word_lemma = '_'.join([f'{token.lemma_}_{token.pos_}' for token in parse_utw])
                user_target_word_pos = ('phrase', [token.pos_ for token in parse_utw])
                user_target_word_tags = tags
                not_ner = parse_utw[0].ent_type == 0 and parse_utw[1].ent_type == 0
            else:  # a single word
                user_target_word_lemma = f'{parse_utw[0].lemma_}_{parse_utw[0].pos_}'
                user_target_word_pos = ('simple', parse_utw[0].pos_)
                user_target_word_tags = get_tags(parse_utw[0].text)[0]
                not_ner = parse_utw[0].ent_type == 0
            target_word = {
                'sentence_number': self.n_sentence,
                'sentence_text': self.original,
                'original_text': _utw,
                'lemma': user_target_word_lemma,
                'pos': user_target_word_pos,
                'gender': convert_gender(user_target_word_tags.get('Gender')),
                'tags': user_target_word_tags,
                'position_in_sentence': self.original.find(_utw),
                'not_named_entity': not_ner,
                'frequency_in_text': frequency_dict.get(user_target_word_lemma, 1) if frequency_dict else 1
            }
            # Skip words the embedding model does not know: they cannot get distractors.
            if not (model.has_index_for(user_target_word_lemma)
                    or model.has_index_for(f'{user_target_word_lemma}_{user_target_word_pos[1]}')):
                BAD_USER_TARGET_WORDS.append(_utw)
            else:
                self.target_words.append(target_word)

    def search_target_words(self, model, target_words_automatic_mode: bool, target_minimum,
                            user_target_words: set = None, frequency_dict: dict = None):
        if target_words_automatic_mode:
            self.search_target_words_automatically(model=model, target_minimum=target_minimum,
                                                   frequency_dict=frequency_dict)
        else:
            self.search_user_target_words(model=model, user_target_words=user_target_words,
                                          frequency_dict=frequency_dict)

    def attach_distractors_to_target_word(self, model, global_distractors, distractor_minimum, level_name,
                                          max_frequency, progress, logs):
        n_target_words = len(self.target_words)
        bad_target_words = []
        for i, target_word in enumerate(self.target_words):
            pos = target_word['pos'][0] if target_word['pos'][0] == 'phrase' else target_word['pos'][1]
            distractors = get_distractors_from_model(model, lemma=target_word['lemma'], pos=pos,
                                                     gender=target_word['gender'], level_name=level_name,
                                                     global_distractors=global_distractors,
                                                     distractor_minimum=distractor_minimum,
                                                     max_num_distractors=self.max_num_distractors)
            target_word['distractors'] = distractors
            target_word['distractors_number'] = len(distractors) if distractors is not None else 0
            # Words without distractors, or too frequent in the text, are dropped below.
            if distractors is None or target_word['frequency_in_text'] > max_frequency:
                bad_target_words.append(target_word)
            progress.progress(i / n_target_words)
            logs.success(f'Processed {i}/{n_target_words} words in sentence {self.n_sentence + 1}')
        for btw in bad_target_words:
            BAD_USER_TARGET_WORDS.append(btw['original_text'])
            self.target_words.remove(btw)
        progress.progress(1.0)
        logs.success(f'Processed {n_target_words}/{n_target_words} words in sentence {self.n_sentence + 1}')

    def inflect_distractors(self):
        bad_target_words = []
        for target_word in self.target_words:
            inflected_distractors = []
            for distractor_lemma, distractor_similarity in target_word['distractors']:
                if distractor_lemma.count('_') > 1:  # distractor is a phrase
                    # TODO: the same as above --
                    #  has to train a phrase model and test this branch;
                    #  until then it mirrors the simple-word branch.
                    inflected = make_inflection(text=distractor_lemma, pos=target_word['pos'][1],
                                                tags=target_word['tags'])
                else:
                    inflected = make_inflection(text=distractor_lemma, pos=target_word['pos'][1],
                                                tags=target_word['tags'])
                if inflected is not None:
                    inflected_distractors.append(inflected)
                else:
                    # Retry with the opposite animacy: the 'anim'/'inan' grammeme
                    # often blocks an otherwise possible inflection.
                    new_tags = copy.deepcopy(target_word['tags'])
                    if 'NOUN' in target_word['tags'] and 'inan' in target_word['tags']:
                        new_tags.discard('inan')
                        new_tags.add('anim')
                    elif 'NOUN' in target_word['tags'] and 'anim' in target_word['tags']:
                        new_tags.discard('anim')
                        new_tags.add('inan')
                    inflected = make_inflection(text=distractor_lemma, pos=target_word['pos'][1], tags=new_tags)
                    if inflected is not None:
                        inflected_distractors.append(inflected)
            num_distractors = min(4, self.max_num_distractors) if self.max_num_distractors >= 4 \
                else self.max_num_distractors
            if len(inflected_distractors) < num_distractors:
                bad_target_words.append(target_word)
            else:
                target_word['inflected_distractors'] = inflected_distractors
        for btw in bad_target_words:
            BAD_USER_TARGET_WORDS.append(btw['original_text'])
            self.target_words.remove(btw)

    def filter_target_words(self, target_words_automatic_mode):
        c_position = 0
        bad_target_words = []
        for target_word in self.target_words:
            position_difference = 3 if target_words_automatic_mode else 0
            if (target_word['position_in_sentence'] == 0
                    or abs(target_word['position_in_sentence'] - c_position) >= position_difference):
                # Keep the word and remember its position, so the next target
                # word must lie far enough from this one.
                c_position = target_word['position_in_sentence']
            else:
                bad_target_words.append(target_word)
        for btw in bad_target_words:
            BAD_USER_TARGET_WORDS.append(btw['original_text'])
            self.target_words.remove(btw)

    def sample_distractors(self, num_distractors):
        for target_word in self.target_words:
            len_inflected_distractors = len(target_word['inflected_distractors'])
            # Use a local sample size: reassigning num_distractors here would
            # shrink the sample for every following target word.
            n_sample = min(len_inflected_distractors, num_distractors) if num_distractors >= 4 \
                else num_distractors
            target_word['inflected_distractors'] = sample(
                target_word['inflected_distractors'][:min(len_inflected_distractors, 10)], n_sample)


class TASK:
    def __init__(self, task_data):
        self.task_data = task_data
        self.original_text = None
        self.sentence_text = None
        self.inflected_distractors = None
        self.sentence_number = task_data['sentence_number']
        self.position_in_sentence = task_data['position_in_sentence']
        self.result = ''
        self.variants = []
        # Expose every field of the target-word dict as an attribute.
        for key, value in task_data.items():
            self.__setattr__(key, value)

    def __repr__(self):
        return '\n'.join([f'{key}\t=\t{value}' for key, value in self.__dict__.items()])

    def compile_task(self, max_num_distractors):
        len_distractors = len(self.inflected_distractors)
        len_variants = min(len_distractors, max_num_distractors) if max_num_distractors > 4 \
            else max_num_distractors
        # One letter per variant: the distractors plus the correct answer.
        letters = (f'({letter})' for letter in string.ascii_lowercase[:len_variants + 1])
        try:
            distractors = sample(self.inflected_distractors, len_variants) + [self.original_text]
        except ValueError:
            # Not enough distractors to sample from: use them all.
            distractors = self.inflected_distractors + [self.original_text]
        # Shuffle the answers by sorting with a random key, then label them.
        self.variants.append(
            (self.original_text,
             [f'{item[0]} {item[1].replace("_", " ")}'
              for item in zip(letters, sorted(distractors, key=lambda _: random()))]))
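
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): how a caller presumably drives the
# pipeline end to end. `w2v_model` (a gensim KeyedVectors-style model),
# `target_minimum`, and the Streamlit-like `progress`/`logs` objects are
# assumptions about the surrounding application, not defined in this module.
#
#   sent = SENTENCE('Мама мыла раму.', n_sentence=0, max_num_distractors=4)
#   sent.lemmatize_sentence()
#   sent.bind_phrases()
#   sent.search_target_words(model=w2v_model, target_words_automatic_mode=True,
#                            target_minimum=target_minimum, frequency_dict={})
#   sent.attach_distractors_to_target_word(model=w2v_model, global_distractors=set(),
#                                          distractor_minimum=None, level_name='B1',
#                                          max_frequency=5, progress=progress, logs=logs)
#   sent.inflect_distractors()
#   sent.filter_target_words(target_words_automatic_mode=True)
#   sent.sample_distractors(num_distractors=4)
#   tasks = [TASK(tw) for tw in sent.target_words]
#   for task in tasks:
#       task.compile_task(max_num_distractors=4)
# ---------------------------------------------------------------------------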