import string
from random import random, sample

from utilities_language_general.esp_constants import nlp, PHRASES, BAD_USER_TARGET_WORDS
from utilities_language_general.esp_utils import check_token, fix_irregular_lemma, get_distractors_from_model
from utilities_language_general.morphology import inflect


class SENTENCE:
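    """A sentence of the source text plus the gap-fill target words found in it.

    Expected call order (inferred from the data each step consumes):
    lemmatize_sentence -> bind_phrases -> search_target_words ->
    attach_distractors_to_target_word -> inflect_distractors ->
    filter_target_words -> sample_distractors.
    """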
def __init__(self, original: str, n_sentence: int, max_num_distractors):
self.original = original
self.n_sentence = n_sentence
self.max_num_distractors = max_num_distractors
self.parsed = nlp(self.original)
self.sentence_lemma_pos = []
self.sentence_phrases = []
self.target_words = []
def lemmatize_sentence(self):
for token in self.parsed:
lemma_pos = f'{token.lemma_}_{token.pos_}'
if token.pos_ in ('AUX', 'VERB', 'ADJ'):
lemma_pos = fix_irregular_lemma(lemma=lemma_pos)
self.sentence_lemma_pos.append((lemma_pos, token))
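    # Illustrative example (actual output depends on the loaded spaCy model):
    # for 'He comido', sentence_lemma_pos becomes roughly
    # [('haber_AUX', <Token 'He'>), ('comer_VERB', <Token 'comido'>)].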
    def bind_phrases(self):
        previous_was_phrase = False
        for i in range(len(self.sentence_lemma_pos) - 1):
            phrase_candidate = f'{self.sentence_lemma_pos[i][0]}_{self.sentence_lemma_pos[i + 1][0]}'
            if phrase_candidate in PHRASES and not previous_was_phrase:
                # a phrase entry is [joint_lemma, {'original_token1': spacy.Token, 'original_token2': spacy.Token}]
                phrase = [
                    phrase_candidate,
                    {
                        'original_token1': self.sentence_lemma_pos[i][1],
                        'original_token2': self.sentence_lemma_pos[i + 1][1]
                    }
                ]
                self.sentence_phrases.append(phrase)
                previous_was_phrase = True
            else:
                if not previous_was_phrase:
                    self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
                previous_was_phrase = False
        # the loop stops before the last token, so append it separately
        # unless it was already consumed as the second half of a phrase
        if self.sentence_lemma_pos and not previous_was_phrase:
            self.sentence_phrases.append(self.sentence_lemma_pos[-1][1])
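    # Illustrative example: if PHRASES contained 'tener_VERB_que_SCONJ' (hypothetical
    # entry), the adjacent pairs ('tener_VERB', <tiene>) and ('que_SCONJ', <que>)
    # would be merged into ['tener_VERB_que_SCONJ', {'original_token1': <tiene>,
    # 'original_token2': <que>}]; tokens not consumed by a phrase stay bare Tokens.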
def search_target_words_automatically(self, model, target_minimum: set, frequency_dict: dict = None):
for token in self.sentence_phrases:
if isinstance(token, list): # if token is a phrase
original_token1 = token[1]['original_token1']
original_token2 = token[1]['original_token2']
original_token1_tags = original_token1.morph.to_dict()
original_token2_tags = original_token2.morph.to_dict()
                if ('haber_AUX' == f'{original_token1.lemma_}_{original_token1.pos_}'
                        and original_token2.pos_ in ('VERB', 'ADJ', 'AUX')):
                    # compound form: mood, tense, person and number come from the
                    # auxiliary 'haber'; the participle contributes no gender
                    tags = {
                        'VerbForm': 'Compuesto',
                        'Mood': original_token1_tags.get('Mood'),
                        'Tense': original_token1_tags.get('Tense'),
                        'Person': original_token1_tags.get('Person'),
                        'Number': original_token1_tags.get('Number'),
                        'Gender': None
                    }
                else:
                    tags = {**original_token1_tags, **original_token2_tags}
                not_ner = original_token1.ent_type == 0 and original_token2.ent_type == 0
target_word = {
'sentence_number': self.n_sentence,
'sentence_text': self.original,
'original_text': f'{original_token1.text} {original_token2.text}',
'lemma': token[0],
'pos': 'phrase',
'gender': tags.get('Gender'),
'tags': tags,
'position_in_sentence': self.original.find(original_token1.text),
'not_named_entity': not_ner,
'frequency_in_text': 0
}
self.target_words.append(target_word)
else: # if token is just a spacy.nlp token
if check_token(model=model, token=token, lemma_pos='auto', current_minimum=target_minimum):
tags = token.morph.to_dict()
target_word = {
'sentence_number': self.n_sentence,
'sentence_text': self.original,
'original_text': token.text,
'lemma': token.lemma_,
'pos': token.pos_,
'gender': tags.get('Gender'),
                        'number_children': sum(1 for _ in token.children),
'tags': tags,
'position_in_sentence': self.original.find(token.text),
                        'not_named_entity': token.ent_type == 0,
                        'frequency_in_text': frequency_dict.get(token.lemma_, 1) if frequency_dict else 1,
}
self.target_words.append(target_word)
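    # Illustrative example: for 'ha comido' the haber_AUX branch above yields tags
    # like {'VerbForm': 'Compuesto', 'Mood': 'Ind', 'Tense': 'Pres', 'Person': '3',
    # 'Number': 'Sing', 'Gender': None}: person, number, mood and tense are read
    # off the auxiliary, and gender is deliberately dropped.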
def search_user_target_words(self, model, user_target_words: set = None, frequency_dict: dict = None):
for _utw in user_target_words:
if _utw in self.original:
parse_utw = nlp(_utw)
                if ' ' in _utw:
                    token1_tags = parse_utw[0].morph.to_dict()
                    token2_tags = parse_utw[1].morph.to_dict()
                    if ('haber_AUX' == f'{parse_utw[0].lemma_}_{parse_utw[0].pos_}'
                            and parse_utw[1].pos_ in ('VERB', 'ADJ', 'AUX')):
                        # compound form: mood, tense, person and number come from the
                        # auxiliary 'haber'; the participle contributes no gender
                        tags = {
                            'VerbForm': 'Compuesto',
                            'Mood': token1_tags.get('Mood'),
                            'Tense': token1_tags.get('Tense'),
                            'Person': token1_tags.get('Person'),
                            'Number': token1_tags.get('Number'),
                            'Gender': None
                        }
                    else:
                        tags = {**token1_tags, **token2_tags}
                    user_target_word_lemma = '_'.join(f'{token.lemma_}_{token.pos_}' for token in parse_utw)
                    user_target_word_pos = 'phrase'
                    user_target_word_tags = tags
                    not_ner = parse_utw[0].ent_type == 0 and parse_utw[1].ent_type == 0
else:
user_target_word_lemma = f'{parse_utw[0].lemma_}_{parse_utw[0].pos_}'
user_target_word_pos = parse_utw[0].pos_
user_target_word_tags = parse_utw[0].morph.to_dict()
not_ner = parse_utw[0].ent_type == 0
target_word = {
'sentence_number': self.n_sentence,
'sentence_text': self.original,
'original_text': _utw,
'lemma': user_target_word_lemma,
'pos': user_target_word_pos,
'gender': user_target_word_tags.get('Gender'),
'tags': user_target_word_tags,
'position_in_sentence': self.original.find(_utw),
'not_named_entity': not_ner,
                    'frequency_in_text': frequency_dict.get(user_target_word_lemma, 1) if frequency_dict else 1
}
if not (model.has_index_for(user_target_word_lemma)
or model.has_index_for(f'{user_target_word_lemma}_{user_target_word_pos}')):
BAD_USER_TARGET_WORDS.append(_utw)
else:
self.target_words.append(target_word)
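    # Note: user words the embedding model has no vector for are collected in the
    # shared BAD_USER_TARGET_WORDS list, presumably so the caller can report them
    # back to the user instead of silently dropping them.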
def search_target_words(self, model, target_words_automatic_mode: bool, target_minimum,
user_target_words: set = None, frequency_dict: dict = None):
if target_words_automatic_mode:
self.search_target_words_automatically(model=model, target_minimum=target_minimum,
frequency_dict=frequency_dict)
else:
self.search_user_target_words(model=model, user_target_words=user_target_words,
frequency_dict=frequency_dict)
def attach_distractors_to_target_word(self, model, global_distractors, distractor_minimum, level_name,
max_frequency, progress, logs):
n_target_words = len(self.target_words)
bad_target_words = []
for i, target_word in enumerate(self.target_words):
distractors = get_distractors_from_model(model, lemma=target_word['lemma'], pos=target_word['pos'],
gender=target_word['gender'], level_name=level_name,
global_distractors=global_distractors,
distractor_minimum=distractor_minimum,
max_num_distractors=self.max_num_distractors)
            if distractors is None or target_word['frequency_in_text'] > max_frequency:
                bad_target_words.append(target_word)
            target_word['distractors'] = distractors
target_word['distractors_number'] = len(distractors) if distractors is not None else 0
progress.progress(i / n_target_words)
            logs.success(f'Processed {i}/{n_target_words} words in sentence {self.n_sentence + 1}')
for btw in bad_target_words:
BAD_USER_TARGET_WORDS.append(btw['original_text'])
self.target_words.remove(btw)
progress.progress(100)
        logs.success(
            f'Processed {n_target_words}/{n_target_words} words in sentence {self.n_sentence + 1}')
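    # Each distractor is a (lemma, similarity) pair as returned by
    # get_distractors_from_model; inflect_distractors() below turns the lemmas
    # into word forms agreeing with the target word's tags.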
def inflect_distractors(self):
bad_target_words = []
for target_word in self.target_words:
inflected_distractors = []
for distractor_lemma, distractor_similarity in target_word['distractors']:
if distractor_lemma.count('_') > 1:
if distractor_lemma.startswith('haber_'):
distractor_lemma = distractor_lemma.split('_')[-2]
inflected = inflect(lemma=distractor_lemma, target_pos=target_word['pos'],
target_tags=target_word['tags'])
else:
continue
else:
inflected = inflect(lemma=distractor_lemma, target_pos=target_word['pos'],
target_tags=target_word['tags'])
if inflected is not None:
inflected_distractors.append(inflected)
            num_distractors = min(4, self.max_num_distractors)
if len(inflected_distractors) < num_distractors:
bad_target_words.append(target_word)
else:
target_word['inflected_distractors'] = inflected_distractors
for btw in bad_target_words:
BAD_USER_TARGET_WORDS.append(btw['original_text'])
self.target_words.remove(btw)
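    # Illustrative example: a compound distractor lemma such as
    # 'haber_AUX_comer_VERB' is reduced to its content lemma 'comer'
    # (split('_')[-2]) before being inflected to match the target's tags.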
    def filter_target_words(self, target_words_automatic_mode):
        # in automatic mode, require at least 3 characters between consecutive gaps
        position_difference = 3 if target_words_automatic_mode else 0
        c_position = 0
        bad_target_words = []
        for target_word in self.target_words:
            if not (target_word['position_in_sentence'] == 0
                    or abs(target_word['position_in_sentence'] - c_position) >= position_difference):
                bad_target_words.append(target_word)
            else:
                # remember the last kept position so the next word's distance
                # is measured against it rather than against the sentence start
                c_position = target_word['position_in_sentence']
for btw in bad_target_words:
BAD_USER_TARGET_WORDS.append(btw['original_text'])
self.target_words.remove(btw)
    def sample_distractors(self, num_distractors):
        for target_word in self.target_words:
            len_inflected_distractors = len(target_word['inflected_distractors'])
            # use a per-word sample size so one short distractor list does not
            # shrink the requested count for the remaining target words
            n_sample = min(len_inflected_distractors, num_distractors) if num_distractors >= 4 \
                else num_distractors
            target_word['inflected_distractors'] = sample(
                target_word['inflected_distractors'][:min(len_inflected_distractors, 10)], n_sample)
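    # Note: sampling draws from at most the first 10 inflected distractors;
    # assuming get_distractors_from_model returns candidates ranked by similarity,
    # this keeps the sampled options semantically close to the target word.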
class TASK:
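    """One gap-fill exercise built from a single target-word dict of a SENTENCE."""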
def __init__(self, task_data):
self.task_data = task_data
self.original_text = None
self.sentence_text = None
self.inflected_distractors = None
self.sentence_number = task_data['sentence_number']
self.position_in_sentence = task_data['position_in_sentence']
self.result = ''
self.variants = []
for key, value in task_data.items():
self.__setattr__(key, value)
def __repr__(self):
return '\n'.join([f'{key}\t=\t{value}' for key, value in self.__dict__.items()])
def compile_task(self, max_num_distractors):
len_distractors = len(self.inflected_distractors)
len_variants = min(len_distractors, max_num_distractors) if max_num_distractors > 4 \
else max_num_distractors
letters = (f'({letter})' for letter in string.ascii_lowercase[:len_variants + 1])
try:
distractors = sample(self.inflected_distractors, len_variants) + [self.original_text, ]
except ValueError:
distractors = self.inflected_distractors + [self.original_text, ]
        # sorting with a random key shuffles the options before they are lettered
        self.variants.append(
            (self.original_text,
             [f'{letter} {variant.replace("_", " ")}'
              for letter, variant in zip(letters, sorted(distractors, key=lambda _: random()))]))
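

if __name__ == '__main__':
    # Minimal smoke test (illustrative only): it exercises just the model-free steps;
    # the full pipeline additionally needs an embedding model, a frequency dict and
    # the progress/log widgets consumed by attach_distractors_to_target_word.
    demo = SENTENCE('Juan ha comido paella.', n_sentence=0, max_num_distractors=4)
    demo.lemmatize_sentence()
    demo.bind_phrases()
    print(demo.sentence_lemma_pos)
    print(demo.sentence_phrases)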