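"""Sentence processing and gap-fill task generation for the BERT-based pipeline.

SENTENCE parses a source sentence, selects target words (automatically or from a
user-supplied list) and masks them; TASK attaches distractors to each target
word, inflects them, and compiles the final answer variants.
"""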
import string
from random import random, sample

from utilities_language_general.rus_constants import nlp, PHRASES, BAD_USER_TARGET_WORDS
from utilities_language_general.rus_utils import (get_tags, define_gender, make_inflection,
                                                  check_token_bert, get_distractors_from_model_bert)


class SENTENCE:
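    """A parsed sentence and the target words selected in it for gap-fill tasks."""
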
def __init__(self, original: str, n_sentence: int, max_num_distractors):
self.original = original
self.n_sentence = n_sentence
self.max_num_distractors = max_num_distractors
self.parsed = nlp(self.original)
self.sentence_lemma_pos = []
self.sentence_phrases = []
self.target_words = []
self.text_with_masked_task = ''

    def lemmatize_sentence(self):
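        """Store (lemma_POS, token) pairs for every token in the parsed sentence."""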
for token in self.parsed:
lemma_pos = f'{token.lemma_}_{token.pos_}'
self.sentence_lemma_pos.append((lemma_pos, token))

    def bind_phrases(self):
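        """Bind multiword phrases; currently a passthrough over the parsed tokens."""
        # NOTE: PHRASES (imported above) is presumably consumed by a fuller
        # phrase-binding implementation; here the spaCy tokens pass through unchanged.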
self.sentence_phrases = self.parsed

    def search_target_words_automatically(self, target_minimum: set, frequency_dict: dict = None,
                                          summary: list = None):
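        """Collect target-word candidates: phrase entries are taken as-is,
        single tokens must pass check_token_bert against the target minimum."""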
for token in self.sentence_phrases:
if isinstance(token, list): # if token is a phrase
original_token1 = token[1]['original_token1']
original_token2 = token[1]['original_token2']
original_token1_tags = get_tags(original_token1.text)[0]
original_token2_tags = get_tags(original_token2.text)[0]
tags = original_token1_tags | original_token2_tags
                not_ner = original_token1.ent_type == 0 and original_token2.ent_type == 0
target_word = {
'masked_sentence': self.original.replace(f'{original_token1.text} {original_token2.text}',
'[MASK]'),
'sentence_number': self.n_sentence,
'sentence_text': self.original,
'original_text': f'{original_token1.text} {original_token2.text}',
'lemma': token[0],
'pos': ('phrase', [original_token1.pos_, original_token2.pos_]),
'gender': list({define_gender(original_token1), define_gender(original_token2)})[0],
'tags': tags,
'position_in_sentence': self.original.find(original_token1.text),
'not_named_entity': not_ner,
'frequency_in_text': 0,
                    'in_summary': summary is not None and self.original in summary
}
self.target_words.append(target_word)
else: # if token is just a spacy.nlp token
if check_token_bert(token=token, current_minimum=target_minimum):
target_word = {
'masked_sentence': self.original.replace(token.text, '[MASK]'),
'sentence_number': self.n_sentence,
'sentence_text': self.original,
'original_text': token.text,
'lemma': token.lemma_,
'pos': ('simple', token.pos_),
'gender': define_gender(token.lemma_),
                        'number_children': len(list(token.children)),
'tags': get_tags(token.text)[0],
'position_in_sentence': self.original.find(token.text),
                        'not_named_entity': token.ent_type == 0,
                        'frequency_in_text': frequency_dict.get(token.lemma_, 1) if frequency_dict else 1,
                        'in_summary': summary is not None and self.original in summary
}
self.target_words.append(target_word)

    def search_user_target_words(self, user_target_words: set = None, frequency_dict: dict = None,
                                 summary: list = None):
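        """Build task entries for every user-supplied target word found in the sentence."""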
for _utw in user_target_words:
if _utw in self.original:
parse_utw = nlp(_utw)
if ' ' in _utw:
tags = get_tags(parse_utw[0].text)[0] | get_tags(parse_utw[1].text)[0]
user_target_word_lemma = '_'.join([f'{token.lemma_}_{token.pos_}' for token in parse_utw])
user_target_word_pos = ('phrase', [token.pos_ for token in parse_utw])
user_target_word_tags = tags
                    not_ner = parse_utw[0].ent_type == 0 and parse_utw[1].ent_type == 0
else:
user_target_word_lemma = f'{parse_utw[0].lemma_}_{parse_utw[0].pos_}'
user_target_word_pos = ('simple', parse_utw[0].pos_)
user_target_word_tags = get_tags(parse_utw[0].text)[0]
not_ner = parse_utw[0].ent_type == 0
target_word = {
'masked_sentence': self.original.replace(_utw, '[MASK]'),
'sentence_number': self.n_sentence,
'sentence_text': self.original,
'original_text': _utw,
'lemma': user_target_word_lemma,
'pos': user_target_word_pos,
'gender': define_gender(parse_utw[0].text),
'tags': user_target_word_tags,
'position_in_sentence': self.original.find(_utw),
'not_named_entity': not_ner,
                    'frequency_in_text': frequency_dict.get(user_target_word_lemma, 1) if frequency_dict else 1,
                    'in_summary': summary is not None and self.original in summary
}
self.target_words.append(target_word)

    def search_target_words(self, target_words_automatic_mode: bool, target_minimum,
                            user_target_words: set = None,
                            frequency_dict: dict = None, summary: list = None):
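        """Dispatch to the automatic or the user-driven target-word search."""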
if target_words_automatic_mode:
self.search_target_words_automatically(target_minimum=target_minimum,
frequency_dict=frequency_dict, summary=summary)
else:
self.search_user_target_words(user_target_words=user_target_words,
frequency_dict=frequency_dict, summary=summary)

    def filter_target_words(self, target_words_automatic_mode):
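        """Drop target words that sit too close to each other and record them as bad."""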
        c_position = 0
        bad_target_words = []
        # In automatic mode require a minimal distance between consecutive target words.
        min_position_difference = 5 if target_words_automatic_mode else 0
        for target_word in self.target_words:
            if not (target_word['position_in_sentence'] == 0
                    or abs(target_word['position_in_sentence'] - c_position) >= min_position_difference):
                bad_target_words.append(target_word)
            else:
                # Remember the position of the last kept target word so the
                # distance check applies between neighbours, not to the start.
                c_position = target_word['position_in_sentence']
        for btw in bad_target_words:
            BAD_USER_TARGET_WORDS.append(btw['original_text'])
            self.target_words.remove(btw)


class TASK:
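    """One gap-fill task: a target word plus distractors attached, inflected, and sampled."""
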
def __init__(self, task_data, max_num_distractors):
self.task_data = task_data
self.distractors = None
self.distractors_number = 0
self.bad_target_word = False
self.inflected_distractors = None
self.pos = task_data['pos']
self.tags = task_data['tags']
self.lemma = task_data['lemma']
self.gender = task_data['gender']
self.in_summary = task_data['in_summary']
self.max_num_distractors = max_num_distractors
self.original_text = task_data['original_text']
self.sentence_text = task_data['sentence_text']
self.sentence_number = task_data['sentence_number']
self.masked_sentence = task_data['masked_sentence']
self.frequency_in_text = task_data['frequency_in_text']
self.position_in_sentence = task_data['position_in_sentence']
self.text_with_masked_task = task_data['text_with_masked_task']
self.result = ''
self.variants = []

    def __repr__(self):
return '\n'.join([f'{key}\t=\t{value}' for key, value in self.__dict__.items()])

    def attach_distractors_to_target_word(self, model, scaler, classifier, pos_dict,
                                          global_distractors, distractor_minimum, level_name, max_frequency):
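        """Request distractors from the BERT-based model and keep up to 15 candidate
        lemmas; mark the target word as bad if nothing is returned or it is too frequent."""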
pos = self.pos[0] if self.pos[0] == 'phrase' else self.pos[1]
distractors_sentence = get_distractors_from_model_bert(model=model, scaler=scaler, classifier=classifier, pos_dict=pos_dict,
level_name=level_name, lemma=self.lemma, pos=pos, gender=self.gender,
text_with_masked_task=self.masked_sentence,
global_distractors=global_distractors,
distractor_minimum=distractor_minimum,
max_num_distractors=self.max_num_distractors)
if distractors_sentence is None or self.frequency_in_text > max_frequency:
self.bad_target_word = True
self.distractors = None
else:
            self.distractors = [d[0] for d in distractors_sentence[:15]]
self.distractors_number = len(distractors_sentence) if distractors_sentence is not None else 0

    def inflect_distractors(self, level_name):
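        """Inflect distractor lemmas to the target word's tags; mark the word bad
        when fewer than the required number of distractors survive inflection."""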
inflected_distractors = []
if self.distractors is None:
self.bad_target_word = True
return
for distractor_lemma in self.distractors:
inflected = make_inflection(text=distractor_lemma, pos=self.pos[1], tags=self.tags, level=level_name)
if inflected is not None:
inflected_distractors.append(inflected)
        num_distractors = min(4, self.max_num_distractors)
if len(inflected_distractors) < num_distractors:
self.bad_target_word = True
else:
self.distractors_number = num_distractors
self.inflected_distractors = inflected_distractors

    def sample_distractors(self, num_distractors):
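        """Randomly pick the final distractors from (at most) the ten best inflected candidates."""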
if not self.bad_target_word:
num_distractors = min(self.distractors_number, num_distractors) \
if num_distractors >= 4 else num_distractors
self.inflected_distractors = sample(self.inflected_distractors[:min(self.distractors_number, 10)],
num_distractors)

    def compile_task(self, max_num_distractors):
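        """Mix the correct answer with sampled distractors into shuffled, lettered variants."""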
len_distractors = len(self.inflected_distractors)
len_variants = min(len_distractors, max_num_distractors) if max_num_distractors > 4 \
else max_num_distractors
        # One extra letter is needed because the correct answer is appended to the variants.
        letters = (f'({letter})' for letter in string.ascii_lowercase[:len_variants + 1])
        try:
            distractors = sample(self.inflected_distractors, len_variants) + [self.original_text]
        except ValueError:
            # Not enough inflected distractors to sample from: use them all.
            distractors = self.inflected_distractors + [self.original_text]
        tmp_vars = [f'{letter} {variant.replace("_", " ").lower()}'
                    for letter, variant in zip(letters, sorted(distractors, key=lambda _: random()))]
self.variants.append((self.original_text.lower(), tmp_vars))
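

# A minimal usage sketch (illustrative only, not part of the pipeline): it runs the
# model-free part of the workflow on a hand-picked target word. The model-dependent
# steps (TASK.attach_distractors_to_target_word and beyond) are omitted because they
# require a loaded model, scaler, and classifier.
if __name__ == '__main__':
    sentence = SENTENCE(original='Мы читаем интересную книгу.', n_sentence=0, max_num_distractors=4)
    sentence.lemmatize_sentence()
    sentence.bind_phrases()
    sentence.search_target_words(target_words_automatic_mode=False,
                                 target_minimum=None,
                                 user_target_words={'книгу'},
                                 frequency_dict={},
                                 summary=[])
    sentence.filter_target_words(target_words_automatic_mode=False)
    for task_data in sentence.target_words:
        # 'text_with_masked_task' is normally filled in by the calling pipeline;
        # set it here so that TASK.__init__ finds the key.
        task_data['text_with_masked_task'] = task_data['masked_sentence']
        print(TASK(task_data=task_data, max_num_distractors=4))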