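"""Shared resources for the Streamlit app.

Defines cached loaders for the word2vec vectors, the BERT fill-mask pipeline
and the pickled scaler/CatBoost classifiers, loads the Spanish spaCy pipeline
and the extractive summarizer, and builds the CEFR lexical minimums, phrase
set and irregular-lemma fixes used as module-level constants.
"""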
import json
import spacy
import gensim
import streamlit as st
from pickle import load
from transformers import pipeline
from summarizer import Summarizer
from torch import cuda, device
# Run transformer inference on the GPU when one is available
device = device('cuda' if cuda.is_available() else 'cpu')


@st.cache_resource
def load_w2v(model):
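    """Load and cache the selected pre-trained word2vec vectors (binary format)."""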
    with st.spinner('Загружаю языковую модель'):  # "Loading the language model"
        if model == 'model1':
            model_path = r'language_data/model1.gz'
        else:
            model_path = r'language_data/model2.gz'
        return gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)


@st.cache_resource
def load_spacy():
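    """Load and cache the Spanish spaCy pipeline (es_core_news_lg)."""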
    with st.spinner('Загружаю морфо-синтаксический парсер'):  # "Loading the morpho-syntactic parser"
        _nlp = spacy.load('es_core_news_lg')
    return _nlp


@st.cache_resource
def load_bert():
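    """Load and cache the fill-mask pipeline with the fine-tuned Spanish BERT model."""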
    with st.spinner('Загружаю языковую модель'):  # "Loading the language model"
        _pipeline = pipeline(
            task="fill-mask",
            model="a-v-white/bert-base-spanish-wwm-cased-finetuned-literature-pro",
            device=device,
        )
    return _pipeline


@st.cache_resource
def load_summarizer():
    return Summarizer()


@st.cache_resource
def load_classifiers(model):
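    """Load the pickled MinMaxScaler, CatBoost classifier and POS dictionary for the selected model."""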
    if model == 'model1':
        scaler_path = 'language_data/model1_with_wn_minmaxscaler.pickle'
        classifier_path = 'language_data/model1_with_wn_catboost_classifier.pickle'
    elif model == 'model2':
        scaler_path = 'language_data/model2_with_wn_minmaxscaler.pickle'
        classifier_path = 'language_data/model2_with_wn_catboost_classifier.pickle'
    else:
        scaler_path = 'language_data/model3_with_wn_minmaxscaler.pickle'
        classifier_path = 'language_data/model3_with_wn_catboost_classifier.pickle'
    with (open(scaler_path, 'rb') as f1,
          open(classifier_path, 'rb') as f2,
          open('language_data/pos_dict.pickle', 'rb') as f3):
        scaler = load(f1)
        classifier = load(f2)
        pos_dict = load(f3)
    return pos_dict, scaler, classifier

nlp = load_spacy()
summarization = load_summarizer()

# Load the CEFR lexical minimums (one entry per line)
a1_path, a1_target_set = r'lexical_minimums/A1_MINIMUM.txt', set()
a2_path, a2_target_set = r'lexical_minimums/A2_MINIMUM.txt', set()
b1_path, b1_target_set = r'lexical_minimums/B1_MINIMUM.txt', set()
b2_path, b2_target_set = r'lexical_minimums/B2_MINIMUM.txt', set()
c1_path, c1_target_set = r'lexical_minimums/C1_MINIMUM.txt', set()
c2_path, c2_target_set = r'lexical_minimums/C2_MINIMUM.txt', set()

minimums_paths = (a1_path, a2_path, b1_path, b2_path, c1_path, c2_path)
minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set, c1_target_set, c2_target_set)
for path, target_set in zip(minimums_paths, minimums_sets):
    with open(path, 'r', encoding='utf-8') as read_file:
        for line in read_file:
            target_set.add(line.strip())

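# Each CEFR level maps to (its own minimum, that minimum merged with the previous level's);
# 'Без уровня' ("no level") carries no minimum sets.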
MINIMUM_SETS = {
    'A1': (a1_target_set, a1_target_set),
    'A2': (a2_target_set, a2_target_set.union(a1_target_set)),
    'B1': (b1_target_set, b1_target_set.union(a2_target_set)),
    'B2': (b2_target_set, b2_target_set.union(b1_target_set)),
    'C1': (c1_target_set, c1_target_set.union(b2_target_set)),
    'C2': (c2_target_set, c2_target_set.union(c1_target_set)),
    'Без уровня': (None, None)
}

LEVEL_NUMBERS = {'A1': 1, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 4}

with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
    PHRASES = set(json.load(f)['PHRASES'])

with open('language_data/fix_irregular_lemma.json', 'r', encoding='utf-8') as f:
    FIX_LEMMA = json.load(f)

BAD_USER_TARGET_WORDS = []


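# For each target type ('simple'/'phrase') and CEFR level, a POS tag is mapped to
# the tags it may be combined with (e.g. AUX with VERB).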
COMBINE_POS = {
    'simple':
    {
        'A1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'A2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'B1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'B2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
               'AUX_ADJ': ['PRON_VERB'], 'PRON_VERB': ['AUX_ADJ'], 'ADP': ['SCONJ', 'ADV']},
        'C1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
               'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
        'C2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
               'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
        'Без уровня': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'],
                       'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'], 'PRON_VERB': ['AUX_ADJ'],
                       'ADP': ['SCONJ', 'ADV'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']}
    },
    'phrase': 
    {
        'A1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'A2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'B1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
        'B2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
               'AUX_ADJ': ['PRON_VERB'], 'PRON_VERB': ['AUX_ADJ'], 'ADP': ['SCONJ', 'ADV']},
        'C1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
               'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
        'C2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
               'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
        'Без уровня': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 
                       'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'], 'AUX_ADJ': ['PRON_VERB'], 'PRON_VERB': ['AUX_ADJ'], 
                       'ADP': ['SCONJ', 'ADV'], 'ADJ': ['NOUN'], 'NOUN': ['ADJ']}
    },
}