import json
import spacy
import gensim
import pymorphy3
import streamlit as st
from pickle import load
from transformers import pipeline
from summarizer import Summarizer
from torch import cuda, device
# Use the GPU if one is available, otherwise fall back to the CPU
device = device('cuda' if cuda.is_available() else 'cpu')


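# The heavy resources below are wrapped in st.cache_resource so each one is created once and reused across Streamlit reruns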
@st.cache_resource
def load_morph():
    _morph = pymorphy3.MorphAnalyzer(lang='ru')
    return _morph


@st.cache_resource
def load_w2v(model):
    # Keep the spinner visible while the word2vec vectors are actually being read from disk
    with st.spinner('Загружаю языковую модель'):
        if model == 'model1':
            model_path = r'language_data/model1.gz'
        else:
            model_path = r'language_data/model2.gz'
        return gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)


@st.cache_resource
def load_spacy():
    with st.spinner('Загружаю морфо-синтаксический парсер'):
        _nlp = spacy.load('ru_core_news_lg')
    return _nlp


@st.cache_resource
def load_bert():
    with st.spinner('Загружаю языковую модель'):
        _pipeline = pipeline(task="fill-mask",
                             model="a-v-bely/ruBert-base-finetuned-russian-moshkov-child-corpus-pro",
                             device=device)
    return _pipeline


@st.cache_resource
def load_summarizer():
    return Summarizer()

@st.cache_resource
def load_classifiers(model):
    if model == 'model1':
        scaler_path = 'language_data/model1_no_wn_minmaxscaler.pickle'
        classifier_path = 'language_data/model1_no_wn_catboost_classifier.pickle'
    elif model == 'model2':
        scaler_path = 'language_data/model2_no_wn_minmaxscaler.pickle'
        classifier_path = 'language_data/model2_no_wn_catboost_classifier.pickle'
    else:
        scaler_path = 'language_data/model3_no_wn_minmaxscaler.pickle'
        classifier_path = 'language_data/model3_no_wn_catboost_classifier.pickle'
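    # pos_dict.pickle is shared by all three models; the scaler and classifier are model-specific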
    with (open(scaler_path, 'rb') as f1,
          open(classifier_path, 'rb') as f2,
          open('language_data/pos_dict.pickle', 'rb') as f3):
        scaler = load(f1)
        classifier = load(f2)
        pos_dict = load(f3)
    return pos_dict, scaler, classifier

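# Shared NLP resources, instantiated once when the module is imported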
nlp = load_spacy()
morph = load_morph()
summarization = load_summarizer()
w2v_model1_path = r'model1.gz'
w2v_model2_path = r'model2.gz'

# Load the stop-word list
stop_list = set()
with open(r'language_data/stop_words.txt', 'r', encoding='utf-8') as read_file:
    for line in read_file:
        stop_list.add(line.strip())

# Load the CEFR vocabulary minimums
a1_path, a1_target_set = r'language_data/A1_MINIMUM.txt', set()
a2_path, a2_target_set = r'language_data/A2_MINIMUM.txt', set()
b1_path, b1_target_set = r'language_data/B1_MINIMUM.txt', set()
b2_path, b2_target_set = r'language_data/B2_MINIMUM.txt', set()
c1_path, c1_target_set = r'language_data/C1_MINIMUM.txt', set()
c2_path, c2_target_set = r'language_data/C2_MINIMUM.txt', set()

minimums_paths = (a1_path, a2_path, b1_path, b2_path, c1_path, c2_path)
minimums_sets = (a1_target_set, a2_target_set, b1_target_set, b2_target_set, c1_target_set, c2_target_set)
for path, target_set in zip(minimums_paths, minimums_sets):
    with open(path, 'r', encoding='utf-8') as read_file:
        for line in read_file:
            target_set.add(line.strip())

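# For each CEFR level: (the level's own minimum, that minimum merged with the previous level's minimum)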
MINIMUM_SETS = {
    'A1': (a1_target_set, a1_target_set),
    'A2': (a2_target_set, a2_target_set.union(a1_target_set)),
    'B1': (b1_target_set, b1_target_set.union(a2_target_set)),
    'B2': (b2_target_set, b2_target_set.union(b1_target_set)),
    'C1': (c1_target_set, c1_target_set.union(b2_target_set)),
    'C2': (c2_target_set, c2_target_set.union(c1_target_set)),
    'Без уровня': (None, None)
}

LEVEL_NUMBERS = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
    PHRASES = set(json.load(f)['PHRASES'])

BAD_USER_TARGET_WORDS = []


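# For each task type ('simple' / 'phrase') and level: POS tags mapped to the POS tags they may be combined with (e.g. VERB with AUX)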
COMBINE_POS = {
    'simple': {
        'A1': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'A2': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'B1': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'B2': {'VERB': ['AUX'], '': ['VERB']},
        'C1': {'VERB': ['AUX'], '': ['VERB']},
        'C2': {'VERB': ['AUX'], '': ['VERB']},
        'Без уровня': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
    },
    'phrase': {
        'A1': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'A2': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'B1': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
        'B2': {'VERB': ['AUX'], '': ['VERB']},
        'C1': {'VERB': ['AUX'], '': ['VERB']},
        'C2': {'VERB': ['AUX'], '': ['VERB']},
        'Без уровня': {'VERB': ['AUX'], '': ['VERB'], 'ADV': ['ADJ'], 'ADJ': ['ADV']},
    },
}