"""Cached NLP resource loaders and module-level language constants.

Importing this module has side effects: it loads the spaCy pipeline, the
morphological analyzer and the summarizer, and reads the stop list, the
A1-C2 minimum lists and the phrase list from ``language_data/``.
"""
import json
from pickle import load

import spacy
import gensim
import pymorphy3
import streamlit as st
from transformers import pipeline
from summarizer import Summarizer
from torch import cuda, device

# NOTE: this rebinding shadows the imported ``torch.device`` class; from here
# on ``device`` is the CPU device instance handed to the BERT pipeline.
device = device('cpu')


@st.cache_resource
def load_morph():
    """Return a pymorphy3 ``MorphAnalyzer`` for Russian (cached by Streamlit)."""
    _morph = pymorphy3.MorphAnalyzer(lang='ru')
    return _morph


@st.cache_resource
def load_w2v(model):
    """Load word2vec vectors in binary word2vec format.

    ``model == 'model1'`` selects ``language_data/model1.gz``; any other
    value selects ``language_data/model2.gz``. A Streamlit spinner is shown
    while loading, and the result is cached by Streamlit.
    """
    with st.spinner('Загружаю языковую модель'):
        if model == 'model1':
            model_path = r'language_data/model1.gz'
        else:
            model_path = r'language_data/model2.gz'
        return gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)


@st.cache_resource
def load_spacy():
    """Load the ``ru_core_news_lg`` spaCy pipeline (cached by Streamlit)."""
    with st.spinner('Загружаю морфо-синтаксический парсер'):
        _nlp = spacy.load('ru_core_news_lg')
    return _nlp


@st.cache_resource
def load_bert():
    """Build the fill-mask ``transformers`` pipeline on the module's CPU device."""
    with st.spinner('Загружаю языковую модель'):
        _pipeline = pipeline(
            task="fill-mask",
            model="a-v-bely/ruBert-base-finetuned-russian-moshkov-child-corpus-pro",
            device=device,
        )
    return _pipeline


@st.cache_resource
def load_summarizer():
    """Return a ``Summarizer`` instance (cached by Streamlit)."""
    return Summarizer()


@st.cache_resource
def load_classifiers(model):
    """Unpickle the scaler and classifier for ``model`` plus the POS dictionary.

    ``'model1'`` and ``'model2'`` select their own pickles; any other value
    falls back to the ``model3`` pickles.

    Returns:
        A ``(pos_dict, scaler, classifier)`` tuple.
    """
    prefix = model if model in ('model1', 'model2') else 'model3'
    scaler_path = f'language_data/{prefix}_no_wn_minmaxscaler.pickle'
    classifier_path = f'language_data/{prefix}_no_wn_catboost_classifier.pickle'
    # SECURITY: pickle.load executes arbitrary code from the file; these
    # pickles must only ever come from the trusted ``language_data`` bundle.
    with (open(scaler_path, 'rb') as f1,
          open(classifier_path, 'rb') as f2,
          open('language_data/pos_dict.pickle', 'rb') as f3):
        scaler = load(f1)
        classifier = load(f2)
        pos_dict = load(f3)
    return pos_dict, scaler, classifier


# Shared resources, created once at import time.
nlp = load_spacy()
morph = load_morph()
summarization = load_summarizer()
w2v_model1_path = r'model1.gz'
w2v_model2_path = r'model2.gz'

# Load the stop list: one entry per line, surrounding whitespace stripped.
with open(r'language_data/stop_words.txt', 'r', encoding='utf-8') as read_file:
    stop_list = {line.strip() for line in read_file}

# Load the per-level minimums.
a1_path, a1_target_set = r'language_data/A1_MINIMUM.txt', set()
a2_path, a2_target_set = r'language_data/A2_MINIMUM.txt', set()
b1_path, b1_target_set = r'language_data/B1_MINIMUM.txt', set()
b2_path, b2_target_set = r'language_data/B2_MINIMUM.txt', set()
c1_path, c1_target_set = r'language_data/C1_MINIMUM.txt', set()
c2_path, c2_target_set = r'language_data/C2_MINIMUM.txt', set()
minimums_paths = (a1_path, a2_path, b1_path, b2_path, c1_path, c2_path)
minimums_sets = (a1_target_set, a2_target_set, b1_target_set,
                 b2_target_set, c1_target_set, c2_target_set)
# Fill the sets in place so the ``*_target_set`` names above see the data.
for minimum_path, minimum_set in zip(minimums_paths, minimums_sets):
    with open(minimum_path, 'r', encoding='utf-8') as read_file:
        minimum_set.update(line.strip() for line in read_file)

# Level -> (the level's own set, the level's set united with one lower set).
# NOTE(review): each union includes only the immediately preceding level,
# not all lower levels (e.g. B1 does not include A1) -- confirm this is intended.
MINIMUM_SETS = {
    'A1': (a1_target_set, a1_target_set),
    'A2': (a2_target_set, a2_target_set.union(a1_target_set)),
    'B1': (b1_target_set, b1_target_set.union(a2_target_set)),
    'B2': (b2_target_set, b2_target_set.union(b1_target_set)),
    'C1': (c1_target_set, c1_target_set.union(b2_target_set)),
    'C2': (c2_target_set, c2_target_set.union(c1_target_set)),
    'Без уровня': (None, None),
}

LEVEL_NUMBERS = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
    PHRASES = set(json.load(f)['PHRASES'])

BAD_USER_TARGET_WORDS = []

# Mode ('simple' / 'phrase') -> level -> POS tag -> list of POS tags.
# Levels B2-C2 omit the ADV/ADJ entries that the other levels have.
COMBINE_POS = {
    'simple': {
        'A1': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'A2': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'B1': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'B2': {'VERB': ['AUX'], '': ['VERB']},
        'C1': {'VERB': ['AUX'], '': ['VERB']},
        'C2': {'VERB': ['AUX'], '': ['VERB']},
        'Без уровня': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
    },
    'phrase': {
        'A1': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'A2': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'B1': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'B2': {'VERB': ['AUX'], '': ['VERB']},
        'C1': {'VERB': ['AUX'], '': ['VERB']},
        'C2': {'VERB': ['AUX'], '': ['VERB']},
        'Без уровня': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
    },
}