import json
import spacy
import gensim
import pymorphy3
import streamlit as st
from pickle import load
from transformers import pipeline
from summarizer import Summarizer
from torch import cuda, device

# Run the fill-mask pipeline on GPU when one is available.
device = device('cuda' if cuda.is_available() else 'cpu')


@st.cache_resource
def load_morph():
    # Morphological analyzer for Russian.
    _morph = pymorphy3.MorphAnalyzer(lang='ru')
    return _morph


@st.cache_resource
def load_w2v(model):
    # Word2vec vectors for the selected model.
    with st.spinner('Загружаю языковую модель'):  # "Loading the language model"
        if model == 'model1':
            model_path = r'language_data/model1.gz'
        else:
            model_path = r'language_data/model2.gz'
        return gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)


@st.cache_resource
def load_spacy():
    # spaCy pipeline for Russian morpho-syntactic parsing.
    with st.spinner('Загружаю морфо-синтаксический парсер'):  # "Loading the morpho-syntactic parser"
        _nlp = spacy.load('ru_core_news_lg')
        return _nlp


@st.cache_resource
def load_bert():
    # Masked-language-model pipeline (ruBert fine-tuned on a Russian child corpus).
    with st.spinner('Загружаю языковую модель'):
        _pipeline = pipeline(
            task="fill-mask",
            model="a-v-bely/ruBert-base-finetuned-russian-moshkov-child-corpus-pro",
            device=device,
        )
        return _pipeline


@st.cache_resource
def load_summarizer():
    # Extractive BERT-based summarizer.
    return Summarizer()


@st.cache_resource
def load_classifiers(model):
    # Pickled MinMax scaler, CatBoost classifier and POS dictionary for the chosen model.
    if model == 'model1':
        scaler_path = 'language_data/model1_no_wn_minmaxscaler.pickle'
        classifier_path = 'language_data/model1_no_wn_catboost_classifier.pickle'
    elif model == 'model2':
        scaler_path = 'language_data/model2_no_wn_minmaxscaler.pickle'
        classifier_path = 'language_data/model2_no_wn_catboost_classifier.pickle'
    else:
        scaler_path = 'language_data/model3_no_wn_minmaxscaler.pickle'
        classifier_path = 'language_data/model3_no_wn_catboost_classifier.pickle'
    with (open(scaler_path, 'rb') as f1,
          open(classifier_path, 'rb') as f2,
          open('language_data/pos_dict.pickle', 'rb') as f3):
        scaler = load(f1)
        classifier = load(f2)
        pos_dict = load(f3)
    return pos_dict, scaler, classifier


nlp = load_spacy()
morph = load_morph()
summarization = load_summarizer()
w2v_model1_path = r'model1.gz'
w2v_model2_path = r'model2.gz'
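
# Note (illustrative only): load_w2v and load_classifiers take a model name
# ('model1', 'model2', ...), presumably selected elsewhere in the app.
# A hypothetical call might look like:
#     pos_dict, scaler, classifier = load_classifiers('model1')
#     w2v = load_w2v('model1')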

# Load the stop list.
stop_list = set()
with open(r'language_data/stop_words.txt', 'r', encoding='utf-8') as read_file:
    for line in read_file:
        stop_list.add(line.strip())

# Load the lexical minimums for each CEFR level.
a1_path, a1_target_set = r'language_data/A1_MINIMUM.txt', set()
a2_path, a2_target_set = r'language_data/A2_MINIMUM.txt', set()
b1_path, b1_target_set = r'language_data/B1_MINIMUM.txt', set()
b2_path, b2_target_set = r'language_data/B2_MINIMUM.txt', set()
c1_path, c1_target_set = r'language_data/C1_MINIMUM.txt', set()
c2_path, c2_target_set = r'language_data/C2_MINIMUM.txt', set()
minimums_paths = (a1_path, a2_path, b1_path, b2_path, c1_path, c2_path)
minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set, c1_target_set, c2_target_set)
for path, target_set in zip(minimums_paths, minimums_sets):
    with open(path, 'r', encoding='utf-8') as read_file:
        for line in read_file:
            target_set.add(line.strip())

# For each level: (the level's own minimum, that minimum merged with the next lower level).
# (None, None) means no level filter.
MINIMUM_SETS = {
    'A1': (a1_target_set, a1_target_set),
    'A2': (a2_target_set, a2_target_set.union(a1_target_set)),
    'B1': (b1_target_set, b1_target_set.union(a2_target_set)),
    'B2': (b2_target_set, b2_target_set.union(b1_target_set)),
    'C1': (c1_target_set, c1_target_set.union(b2_target_set)),
    'C2': (c2_target_set, c2_target_set.union(c1_target_set)),
    'Без уровня': (None, None)  # "No level"
}

LEVEL_NUMBERS = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

# Load the phrase list.
with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
    PHRASES = set(json.load(f)['PHRASES'])

BAD_USER_TARGET_WORDS = []

# POS-tag combination table, keyed by task type ('simple' vs 'phrase') and CEFR level.
COMBINE_POS = {
    'simple': {
        'A1': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'A2': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'B1': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'B2': {'VERB': ['AUX'], '': ['VERB']},
        'C1': {'VERB': ['AUX'], '': ['VERB']},
        'C2': {'VERB': ['AUX'], '': ['VERB']},
        'Без уровня': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']}
    },
    'phrase': {
        'A1': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'A2': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'B1': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'B2': {'VERB': ['AUX'], '': ['VERB']},
        'C1': {'VERB': ['AUX'], '': ['VERB']},
        'C2': {'VERB': ['AUX'], '': ['VERB']},
        'Без уровня': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']}
    },
}
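
# Illustrative sketch (hypothetical helper, not part of this module): one way the
# minimum sets above could be consulted to check whether a lemma fits a CEFR level.
#
#     def fits_level(lemma: str, level: str) -> bool:
#         target_set, cumulative_set = MINIMUM_SETS[level]
#         if cumulative_set is None:      # 'Без уровня' ("no level"): no restriction
#             return True
#         return lemma in cumulative_set
#
#     fits_level('говорить', 'B1')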