Spaces:

a-v-bely
/

russian-task-generator

Running

File size: 15,682 Bytes

from nltk import edit_distance
from utilities.utils import answer_letter
from utilities_language_general.rus_constants import nlp
from utilities_language_general.rus_constants import morph
from utilities_language_general.rus_constants import stop_list
from utilities_language_general.rus_constants import SIMILARITY_VALUES_w2v
from utilities_language_general.rus_constants import SIMILARITY_VALUES_bert


def prepare_target_words(target_words):
    """
    Expand a comma-separated string of target words into all their word forms.

    Every entry is lower-cased, stripped of surrounding whitespace and has inner
    whitespace runs collapsed to a single space. Empty entries (blank input,
    doubled or trailing commas) are skipped instead of being sent to the
    morphological analyzer.

    :param target_words: words (or phrases) separated by commas, as one string
    :return: list of unique word forms taken from the lexeme of the first parse
             of every entry; the order of the list is not defined
    """
    word_forms = set()
    for raw_entry in target_words.lower().split(','):
        # ' '.join(str.split()) both strips the entry and collapses inner whitespace
        entry = ' '.join(raw_entry.split())
        if not entry:
            continue
        word_forms.update(form.word for form in morph.parse(entry)[0].lexeme)
    return list(word_forms)


def compute_frequency_dict(text: str) -> dict:
    """
    Compute frequency dictionary of given text and return it sorted in descending order.

    Lemmas are taken from two sources: ``token.lemma_`` of every token produced by
    ``nlp`` and the ``normal_form`` of the first ``morph`` parse of every token text.
    A lemma from the second source is only counted when it does not occur among
    the lemmas of the first source. Non-alphabetic lemmas are ignored.

    :param text: given text as string variable
    :return: frequency dictionary {word: frequency} sorted in descending order
             of frequency (ties keep their first-seen order)
    """
    freq_dict = {}
    doc = nlp(text)
    lemma_list_spacy = [token.lemma_ for token in doc]
    for lemma in lemma_list_spacy:
        if lemma.isalpha():
            freq_dict[lemma] = freq_dict.get(lemma, 0) + 1
    # A set makes the "already known to the first lemmatizer" test O(1) per token
    known_spacy_lemmas = set(lemma_list_spacy)
    for token in doc:
        lemma = morph.parse(token.text)[0].normal_form
        if lemma.isalpha() and lemma not in known_spacy_lemmas:
            freq_dict[lemma] = freq_dict.get(lemma, 0) + 1
    # sorted() is stable, so equally frequent lemmas stay in insertion order
    return dict(sorted(freq_dict.items(), key=lambda pair: pair[1], reverse=True))


def convert_gender(gender_spacy):
    """
    Translate a gender label of the 'Masc' / 'Fem' / 'Neut' kind into its
    'masc' / 'femn' / 'neut' counterpart.

    :param gender_spacy: one of 'Masc', 'Fem', 'Neut'
    :return: the matching lower-case gender label
    :raises KeyError: if the label is not one of the three listed above
    """
    lowercase_labels = dict(Masc='masc', Fem='femn', Neut='neut')
    return lowercase_labels[gender_spacy]


def define_gender(token: str) -> str or None:
    """
    Look up the grammatical gender of a word.

    The argument is converted with ``str`` first, so non-string objects are
    accepted as well. Only the first parse returned by ``morph`` is consulted.

    :param token: the word whose gender is needed
    :return: the gender of the first parse (e.g. 'masc', 'femn', 'neut');
             may be None when that parse carries no gender
    """
    first_parse = morph.parse(str(token))[0]
    return first_parse.tag.gender


def get_tags(token: str):
    """
    Return the grammemes of a word together with the name of the analysis method.

    Only the first parse returned by ``morph`` is used. Its tag is converted to
    a string and split on spaces and commas to obtain the individual grammemes.

    NOTE(review): this function always returns a tuple; callers that test the
    result against None never see None from the code below.

    :param token: the word for which it's needed to identify tags and method
    :return: tuple (set of grammeme strings, string form of the first element
             of the first entry of the parse's methods stack)
    """
    first_parse = morph.parse(token)[0]
    grammemes = set()
    for group in str(first_parse.tag).split(' '):
        grammemes.update(group.split(','))
    analysis_method = str(first_parse.methods_stack[0][0])
    return grammemes, analysis_method


def make_inflection(text: str, pos: str or list, tags: set) -> str or None:
    """
    Inflect a word, or an underscore-joined phrase, into the form described by ``tags``.

    Single word (``pos`` is not a list): the first parse of ``text`` is inflected
    with ``tags`` as given.

    Phrase (``pos`` is a list): ``text`` is split on '_' and every part is
    inflected with the same grammeme set. When all labels in ``pos`` are among
    'NOUN', 'ADJ', 'PROPN', only case and number grammemes of ``tags`` are kept.
    The gender and animacy of the first part parsed as a NOUN (when present) are
    added so that the other parts agree with it.

    :param text: a word, or lemmas joined with '_'
    :param pos: part-of-speech label, or a list of labels for a phrase
    :param tags: set of grammemes the result should carry
    :return: the inflected word; for a phrase, the inflected words each followed
             by a single space (the result therefore ends with a space, kept for
             backward compatibility); None if any word cannot be inflected
    """
    if not isinstance(pos, list):
        inflected = morph.parse(text)[0].inflect(tags)
        return inflected.word if inflected is not None else None

    if set(pos).issubset({'NOUN', 'ADJ', 'PROPN'}):
        noun_adjective_phrase_tags = {'nomn', 'gent', 'datv', 'accs', 'ablt', 'loct', 'voct',
                                      'gen2', 'acc2', 'loc2', 'sing', 'plur'}
        tags = tags.intersection(noun_adjective_phrase_tags)

    # Parse phrase and define phrase gender, animacy (taken from the first noun that has them)
    gender = None
    animacy = None
    parsed = []
    for lemma in text.split('_'):
        parse = morph.parse(lemma)[0]
        if parse.tag.POS == 'NOUN':
            if gender is None:
                gender = parse.tag.gender
            if animacy is None:
                # Read the grammeme from the tag itself rather than from a fixed
                # position in the tag's string form, which is not guaranteed.
                animacy = parse.tag.animacy
        parsed.append(parse)

    phrase_tags = tags.union({gender, animacy})
    phrase_tags.discard(None)

    inflected_words = []
    for parse in parsed:
        inflected = parse.inflect(phrase_tags)
        if inflected is None:
            # One part has no such form, so the whole phrase cannot be built
            return None
        inflected_words.append(inflected.word)
    return ''.join(f'{word} ' for word in inflected_words)


def check_token(token, lemma_pos, model, current_minimum: set = None, stop_words=stop_list,
                check_allowed_pos: set = None, check_allowed_dep: set = None) -> bool:
    """
    Decide whether a token is suitable, given a vector model that must know it.

    The token is accepted only if all of the following hold:
    its text is alphabetic; its lemma is in ``current_minimum`` (when given);
    ``model.has_index_for(lemma_pos)`` is true; it is not a stop word (neither in
    ``stop_words`` nor flagged by ``token.is_stop``); ``get_tags`` reports the
    method 'DictionaryAnalyzer()'; and its part of speech and syntactic
    dependency pass the filters described below.

    :param token: token object exposing ``text``, ``lemma_``, ``pos_``, ``dep_``
                  and ``is_stop`` (presumably a spaCy token - confirm with callers)
    :param lemma_pos: key to look up in the model, or 'auto' to build it as
                      '<lemma>_<pos>' from the token
    :param model: object exposing ``has_index_for(key)``
    :param current_minimum: optional set of allowed lemmas
    :param stop_words: collection of word forms to reject
    :param check_allowed_pos: if given, the token's POS must be in this set;
                              otherwise it must not be 'PROPN', 'PUNCT' or 'NUM'
    :param check_allowed_dep: if given, the token's dependency must be in this
                              set; otherwise it must not be 'cop'
    :return: True if the token passes every check, False otherwise
    """
    not_allowed_pos = {'PROPN', 'PUNCT', 'NUM'}
    not_allowed_synt_dep = {'cop', }  # 'ROOT'
    if lemma_pos == 'auto':
        lemma_pos = f'{token.lemma_}_{token.pos_}'
    if not token.text.isalpha():
        return False
    if current_minimum is not None and token.lemma_ not in current_minimum:
        return False
    if not model.has_index_for(lemma_pos):
        return False
    if token.text in stop_words or token.is_stop:
        return False
    # Analyse the word once; a None result is treated as "not suitable"
    tag_info = get_tags(token.text)
    if tag_info is None:
        return False
    _, method = tag_info
    if method != 'DictionaryAnalyzer()':
        return False
    # An explicit allow-list replaces the default deny-list for POS and dependency alike
    if check_allowed_pos is not None:
        pos_ok = token.pos_ in check_allowed_pos
    else:
        pos_ok = token.pos_ not in not_allowed_pos
    if check_allowed_dep is not None:
        dep_ok = token.dep_ in check_allowed_dep
    else:
        dep_ok = token.dep_ not in not_allowed_synt_dep
    return pos_ok and dep_ok


def check_token_bert(token, current_minimum: set = None, stop_words=stop_list,
                     check_allowed_pos: set = None, check_allowed_dep: set = None) -> bool:
    """
    Decide whether a token is suitable (variant without a vector-model lookup).

    The token is accepted only if all of the following hold:
    its text is alphabetic; its lemma is in ``current_minimum`` (when given);
    it is not a stop word (neither in ``stop_words`` nor flagged by
    ``token.is_stop``); ``get_tags`` reports the method 'DictionaryAnalyzer()';
    and its part of speech and syntactic dependency pass the filters below.

    :param token: token object exposing ``text``, ``lemma_``, ``pos_``, ``dep_``
                  and ``is_stop`` (presumably a spaCy token - confirm with callers)
    :param current_minimum: optional set of allowed lemmas
    :param stop_words: collection of word forms to reject
    :param check_allowed_pos: if given, the token's POS must be in this set;
                              otherwise it must not be 'PROPN', 'PUNCT' or 'NUM'
    :param check_allowed_dep: if given, the token's dependency must be in this
                              set; otherwise it must not be 'cop'
    :return: True if the token passes every check, False otherwise
    """
    not_allowed_pos = {'PROPN', 'PUNCT', 'NUM'}
    not_allowed_synt_dep = {'cop', }  # 'ROOT'
    if not token.text.isalpha():
        return False
    if current_minimum is not None and token.lemma_ not in current_minimum:
        return False
    if token.text in stop_words or token.is_stop:
        return False
    # Analyse the word once; a None result is treated as "not suitable"
    tag_info = get_tags(token.text)
    if tag_info is None:
        return False
    _, method = tag_info
    if method != 'DictionaryAnalyzer()':
        return False
    # An explicit allow-list replaces the default deny-list for POS and dependency alike
    if check_allowed_pos is not None:
        pos_ok = token.pos_ in check_allowed_pos
    else:
        pos_ok = token.pos_ not in not_allowed_pos
    if check_allowed_dep is not None:
        dep_ok = token.dep_ in check_allowed_dep
    else:
        dep_ok = token.dep_ not in not_allowed_synt_dep
    return pos_ok and dep_ok


def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None, global_distractors: set,
                               distractor_minimum: set, level_name: str, max_num_distractors: int,
                               max_length_ratio=5, min_edit_distance_ratio=0.5):
    """
    Collect distractor candidates for a target word or phrase from a vector model.

    The model is queried with '<lemma>_<pos>' (or with ``lemma`` as is when it
    already contains '_'). If the model has no entry for a multi-part query, the
    mean vector of its '<lemma>_<pos>' parts is used instead. Candidate keys are
    handled by shape:

    * 'lemma_POS' (one underscore): single-word candidate, filtered by POS,
      similarity threshold of the level, gender, length difference, edit
      distance and the set of already used distractors;
    * 'lemma_POS_lemma_POS' (three underscores): two-word candidate, considered
      only when ``pos`` is not 'NOUN', 'ADJ' or 'NUM';
    * any other shape is skipped.

    Every accepted candidate is also added to ``global_distractors`` (the set is
    modified in place).

    :param model: object exposing ``has_index_for``, ``most_similar``,
                  ``get_mean_vector`` and ``similar_by_vector``
                  (presumably gensim KeyedVectors - confirm with callers)
    :param lemma: target lemma, or 'lemma_POS_lemma_POS...' for a phrase
    :param pos: part of speech of the target
    :param gender: gender the single-word distractors must share with the target
    :param global_distractors: distractor lemmas already used; extended in place
    :param distractor_minimum: optional set of allowed distractor lemmas
    :param level_name: key into ``SIMILARITY_VALUES_w2v`` giving the upper
                       similarity bound
    :param max_num_distractors: requested number of distractors
    :param max_length_ratio: maximal absolute difference in length (in
                             characters) between target and distractor lemma
    :param min_edit_distance_ratio: lower bound for the edit distance divided by
                                    the mean length of the two lemmas
    :return: list of (distractor, similarity) tuples when at least
             ``min(4, max_num_distractors)`` were found, otherwise None
    """
    interchangeable_pos = ('VERB', 'ADJ', 'phrase')
    phrase_part_pos = ('VERB', 'AUX', 'SCONJ', 'ADP')
    phrase_target_pos = ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP')

    distractors = []
    query = lemma if '_' in lemma else f'{lemma}_{pos}'
    # Keep only the lemma parts of 'lemma_POS_lemma_POS...'
    lemma = '_'.join(lemma.split('_')[::2])
    if model.has_index_for(query):
        candidates = model.most_similar(query, topn=max_num_distractors + 100)
    else:
        if query.count('_') == 1:
            return None
        query_raw_list = query.split('_')
        query_parts = ['_'.join(query_raw_list[i:i + 2]) for i in range(0, len(query_raw_list), 2)]
        query_vector = model.get_mean_vector(query_parts)
        candidates = model.similar_by_vector(query_vector, topn=max_num_distractors + 100)

    for candidate in candidates:
        candidate_key, distractor_similarity = candidate[0], candidate[1]
        underscore_count = candidate_key.count('_')
        if underscore_count == 1:
            distractor_lemma, distractor_pos = candidate_key.split('_')
            # define_gender is placed inside the chain so that the word is only
            # analysed when the cheaper checks before it have passed.
            condition = ((distractor_pos == pos
                          or (distractor_pos in interchangeable_pos and pos in interchangeable_pos))
                         and distractor_lemma != lemma
                         and len(distractors) < 100
                         and distractor_similarity < SIMILARITY_VALUES_w2v[level_name]
                         and define_gender(distractor_lemma) == gender
                         and abs(len(lemma) - len(distractor_lemma)) <= max_length_ratio
                         and distractor_lemma not in global_distractors
                         and edit_distance(lemma, distractor_lemma) / ((len(lemma) + len(distractor_lemma)) / 2) >
                         min_edit_distance_ratio)
            if not condition:
                continue
            if distractor_minimum is not None and distractor_lemma not in distractor_minimum:
                continue
            distractors.append((distractor_lemma, distractor_similarity))
            global_distractors.add(distractor_lemma)
        elif underscore_count == 3 and pos not in ('NOUN', 'ADJ', 'NUM'):
            # Exactly three underscores guarantees the four-way unpacking below
            d1_lemma, d1_pos, d2_lemma, d2_pos = candidate_key.split('_')
            distractor_lemma = f'{d1_lemma}_{d2_lemma}'
            condition = ((pos in (d1_pos, d2_pos)
                          or (pos in phrase_target_pos
                              and (d1_pos in phrase_part_pos or d2_pos in phrase_part_pos)))
                         and candidate_key != lemma
                         and distractor_lemma != lemma
                         and len(distractors) < 100
                         and distractor_similarity < SIMILARITY_VALUES_w2v[level_name]
                         and distractor_lemma not in global_distractors)
            if not condition:
                continue
            if (distractor_minimum is not None
                    and distractor_lemma not in distractor_minimum
                    and not (d1_lemma in distractor_minimum and d2_lemma in distractor_minimum)):
                continue
            # Phrases are stored with their POS labels, but registered globally by lemma
            distractors.append((candidate_key, distractor_similarity))
            global_distractors.add(distractor_lemma)
        # Keys of any other shape cannot be interpreted and are skipped

    required = min(4, max_num_distractors)
    return distractors if len(distractors) >= required else None


def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: str, pos: str, gender: str or None,
                                    global_distractors: set, distractor_minimum: set, level_name: str,
                                    max_num_distractors: int, max_length_ratio=5, min_edit_distance_ratio=0.5):
    """
    Collect distractor candidates for a masked position from a mask-filling model.

    ``model`` is called with the masked text and ``top_k``; every returned item
    with an alphabetic 'token_str' is lemmatized and POS-tagged with ``nlp`` and
    then filtered by POS, similarity threshold of the level, gender, length
    difference, edit distance and the set of already used distractors.

    Every accepted candidate is also added to ``global_distractors`` (the set is
    modified in place), whether or not ``distractor_minimum`` is given.

    :param model: callable ``model(text, top_k=...)`` returning an iterable of
                  dicts with 'token_str' and 'score' keys (presumably a Hugging
                  Face fill-mask pipeline - confirm with callers)
    :param text_with_masked_task: text in which the target is replaced by the mask
    :param lemma: target lemma
    :param pos: part of speech of the target
    :param gender: gender the distractors must share with the target
    :param global_distractors: distractor lemmas already used; extended in place
    :param distractor_minimum: optional set of allowed distractor lemmas
    :param level_name: key into ``SIMILARITY_VALUES_bert`` giving the upper
                       score bound
    :param max_num_distractors: requested number of distractors
    :param max_length_ratio: maximal absolute difference in length (in
                             characters) between target and distractor lemma
    :param min_edit_distance_ratio: lower bound for the edit distance divided by
                                    the mean length of the two lemmas
    :return: list of (lemma, score) tuples when at least
             ``min(4, max_num_distractors)`` were found, otherwise None; None is
             also returned when a KeyError occurs while reading the candidates
    """
    interchangeable_pos = ('VERB', 'ADJ', 'phrase')
    try:
        bert_candidates = list(model(text_with_masked_task, top_k=max_num_distractors + 100))
        analysed_candidates = []
        for candidate in bert_candidates:
            if isinstance(candidate, list):
                # NOTE(review): nested result lists are skipped, as before;
                # confirm whether their items should be processed instead.
                continue
            if candidate['token_str'].isalpha():
                candidate_morph = nlp(candidate['token_str'])[0]
                analysed_candidates.append((candidate_morph.lemma_, candidate_morph.pos_, candidate['score']))
    except KeyError:
        return None

    _distractors = []
    for distractor_lemma, distractor_pos, distractor_similarity in analysed_candidates:
        # define_gender is placed inside the chain so that the word is only
        # analysed when the cheaper checks before it have passed.
        suitable = ((distractor_pos == pos
                     or (pos in interchangeable_pos and distractor_pos in interchangeable_pos))
                    and distractor_lemma != lemma
                    and len(_distractors) < max_num_distractors + 10
                    and distractor_similarity < SIMILARITY_VALUES_bert[level_name]
                    and define_gender(distractor_lemma) == gender
                    and abs(len(lemma) - len(distractor_lemma)) <= max_length_ratio  # May be changed if case of phrases
                    and distractor_lemma not in global_distractors
                    and (edit_distance(lemma, distractor_lemma)  # May be changed if case of phrases
                         / ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio))
        if not suitable:
            continue
        if distractor_minimum is not None and distractor_lemma not in distractor_minimum:
            continue
        _distractors.append((distractor_lemma, distractor_similarity))
        # Register the lemma in every case so it is not offered again for another task
        global_distractors.add(distractor_lemma)

    num_distractors = min(4, max_num_distractors)
    if len(_distractors) < num_distractors:
        return None
    return _distractors


def prepare_tasks(input_variants):
    """
    Render generated tasks as text for students and teachers and as raw data.

    Every element of ``input_variants`` is indexed as ``element[0][0]`` (the
    answer) and ``element[0][1]`` (the answer options). Answer and options are
    lower-cased, the options are joined with tabs, and ``answer_letter`` supplies
    the label of the correct option. Tasks are numbered from 1.

    :param input_variants: iterable of task containers as described above
    :return: dict with the keys
             'TASKS_STUDENT' - numbered option lines, one task per line;
             'TASKS_TEACHER' - the same tasks preceded by their answers;
             'KEYS_ONLY' - numbered answers only;
             'RAW_TASKS' - list of (number, tab-joined options);
             'RAW_KEYS_ONLY' - list of (number, first space-separated part of
             the answer label)
    """
    student_lines = []
    teacher_lines = []
    key_lines = []
    raw_tasks = []
    raw_keys_only = []

    for number, entry in enumerate(input_variants, start=1):
        task = entry[0]
        answer = task[0].lower()
        options = [option.lower() for option in task[1]]
        variants = '\t'.join(options)
        current_answer_letter = answer_letter(answer=answer, variants=options)

        raw_tasks.append((number, variants))
        raw_keys_only.append((number, current_answer_letter.split(' ')[0]))
        student_lines.append(f"{number}.\t{variants}")
        teacher_lines.append(f"{number}.\t"
                             f"Ответ: {current_answer_letter}\n\t"
                             f"Варианты: {variants}")
        key_lines.append(f"{number}.\tОтвет: {current_answer_letter}")

    return {
        'TASKS_STUDENT': ''.join(f'{line}\n' for line in student_lines),
        'TASKS_TEACHER': ''.join(f'{line}\n' for line in teacher_lines),
        'KEYS_ONLY': ''.join(f'{line}\n' for line in key_lines),
        'RAW_TASKS': raw_tasks,
        'RAW_KEYS_ONLY': raw_keys_only,
    }