import json
import spacy
import gensim
import streamlit as st
from pickle import load
from transformers import pipeline
from summarizer import Summarizer
from torch import cuda, device
# Use the GPU for the fill-mask pipeline when one is available
device = device('cuda' if cuda.is_available() else 'cpu')
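
# Cached loaders: st.cache_resource keeps each resource in memory across Streamlit reruns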
@st.cache_resource
def load_w2v(model):
    # Spinner text: "Loading the language model"
    with st.spinner('Загружаю языковую модель'):
        if model == 'model1':
            model_path = r'language_data/model1.gz'
        else:
            model_path = r'language_data/model2.gz'
        return gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

@st.cache_resource
def load_spacy():
    # Spinner text: "Loading the morpho-syntactic parser"
    with st.spinner('Загружаю морфо-синтаксический парсер'):
        _nlp = spacy.load('es_core_news_lg')
    return _nlp

@st.cache_resource
def load_bert():
    with st.spinner('Загружаю языковую модель'):
        _pipeline = pipeline(
            task="fill-mask",
            model="a-v-white/bert-base-spanish-wwm-cased-finetuned-literature-pro",
            device=device)
    return _pipeline

@st.cache_resource
def load_summarizer():
    return Summarizer()

@st.cache_resource
def load_classifiers(model):
    if model == 'model1':
        scaler_path = 'language_data/model1_with_wn_minmaxscaler.pickle'
        classifier_path = 'language_data/model1_with_wn_catboost_classifier.pickle'
    elif model == 'model2':
        scaler_path = 'language_data/model2_with_wn_minmaxscaler.pickle'
        classifier_path = 'language_data/model2_with_wn_catboost_classifier.pickle'
    else:
        scaler_path = 'language_data/model3_with_wn_minmaxscaler.pickle'
        classifier_path = 'language_data/model3_with_wn_catboost_classifier.pickle'
    with (open(scaler_path, 'rb') as f1,
          open(classifier_path, 'rb') as f2,
          open('language_data/pos_dict.pickle', 'rb') as f3):
        scaler = load(f1)
        classifier = load(f2)
        pos_dict = load(f3)
    return pos_dict, scaler, classifier

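# Instantiate the spaCy pipeline and the summarizer once at module load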
nlp = load_spacy()
summarization = load_summarizer()

# Load the lexical minimum word lists for each CEFR level
a1_path, a1_target_set = r'lexical_minimums/A1_MINIMUM.txt', set()
a2_path, a2_target_set = r'lexical_minimums/A2_MINIMUM.txt', set()
b1_path, b1_target_set = r'lexical_minimums/B1_MINIMUM.txt', set()
b2_path, b2_target_set = r'lexical_minimums/B2_MINIMUM.txt', set()
c1_path, c1_target_set = r'lexical_minimums/C1_MINIMUM.txt', set()
c2_path, c2_target_set = r'lexical_minimums/C2_MINIMUM.txt', set()
minimums_paths = (a1_path, a2_path, b1_path, b2_path, c1_path, c2_path)
minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set, c1_target_set, c2_target_set)
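# Read each minimum list into its set, one entry per line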
for path, target_set in zip(minimums_paths, minimums_sets):
    with open(path, 'r', encoding='utf-8') as read_file:
        for line in read_file:
            target_set.add(line.strip())
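
# For each level: (that level's own minimum, that minimum merged with the preceding level's)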
MINIMUM_SETS = {
    'A1': (a1_target_set, a1_target_set),
    'A2': (a2_target_set, a2_target_set.union(a1_target_set)),
    'B1': (b1_target_set, b1_target_set.union(a2_target_set)),
    'B2': (b2_target_set, b2_target_set.union(b1_target_set)),
    'C1': (c1_target_set, c1_target_set.union(b2_target_set)),
    'C2': (c2_target_set, c2_target_set.union(c1_target_set)),
    'Без уровня': (None, None)  # "No level": no lexical minimum is applied
}
LEVEL_NUMBERS = {'A1': 1, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 4}
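# Known multiword phrases (loaded from phrases.json)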
with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
    PHRASES = set(json.load(f)['PHRASES'])
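# Manual corrections for lemmas of irregular forms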
with open('language_data/fix_irregular_lemma.json', 'r', encoding='utf-8') as f:
    FIX_LEMMA = json.load(f)
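# Accumulates user-supplied target words that cannot be processed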
BAD_USER_TARGET_WORDS = []
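
# Per level, POS tags treated as interchangeable when combining candidates,
# split into single-word ('simple') and multiword ('phrase') targets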
COMBINE_POS = {
    'simple': {
        'A1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'A2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'B1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'B2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
               'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'],
               'PRON_VERB': ['AUX_ADJ'], 'ADP': ['SCONJ', 'ADV']},
        'C1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
               'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
        'C2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
               'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
        'Без уровня': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
                       'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'],
                       'PRON_VERB': ['AUX_ADJ'], 'ADP': ['SCONJ', 'ADV'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']}
    },
    'phrase': {
        'A1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'A2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'B1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'B2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
               'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'],
               'PRON_VERB': ['AUX_ADJ'], 'ADP': ['SCONJ', 'ADV']},
        'C1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
               'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
        'C2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
               'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
        'Без уровня': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
                       'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'],
                       'PRON_VERB': ['AUX_ADJ'], 'ADP': ['SCONJ', 'ADV'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']}
    },
}