a-v-bely committed
Commit 1156b6f
Parent(s): 736e3e5

Update (summarization & pos combinations)
Browse files
- pages/4_📝_Онлайн-тест.py +1 -1
- requirements.txt +1 -0
- utilities/__pycache__/utils.cpython-310.pyc +0 -0
- utilities_cookies/__pycache__/cookie_manager.cpython-310.pyc +0 -0
- utilities_cookies/__pycache__/encrypted_cookie_manager.cpython-310.pyc +0 -0
- utilities_database/__pycache__/user_database_utils.cpython-310.pyc +0 -0
- utilities_database/__pycache__/user_database_widgets.cpython-310.pyc +0 -0
- utilities_language_bert/__pycache__/esp_main_workflow_bert.cpython-310.pyc +0 -0
- utilities_language_bert/__pycache__/esp_sentence_bert.cpython-310.pyc +0 -0
- utilities_language_bert/esp_main_workflow_bert.py +20 -1
- utilities_language_bert/esp_sentence_bert.py +10 -7
- utilities_language_general/__pycache__/esp_constants.cpython-310.pyc +0 -0
- utilities_language_general/__pycache__/esp_utils.cpython-310.pyc +0 -0
- utilities_language_general/__pycache__/morphology.cpython-310.pyc +0 -0
- utilities_language_general/esp_constants.py +33 -3
- utilities_language_general/esp_utils.py +13 -10
- utilities_language_w2v/__pycache__/esp_main_workflow_w2v.cpython-310.pyc +0 -0
- utilities_language_w2v/__pycache__/esp_sentence_w2v.cpython-310.pyc +0 -0
- utilities_language_w2v/esp_main_workflow_w2v.py +25 -2
- utilities_language_w2v/esp_sentence_w2v.py +10 -7
- utilities_option_menu/__pycache__/option_menu.cpython-310.pyc +0 -0
- utilities_ui/__pycache__/custom_download_button.cpython-310.pyc +0 -0
pages/4_📝_Онлайн-тест.py
CHANGED
@@ -43,7 +43,7 @@ if st.session_state.get('-ONLINE_TEST_READY-') and st.session_state.get('-LOGGED
                                   use_container_width=True)
     COMMENTS = ONLINE_TEST.text_area(label='**Прокомментировать**',
                                      placeholder='Напишите комментарий')
-    SUBMIT = ONLINE_TEST.form_submit_button('
+    SUBMIT = ONLINE_TEST.form_submit_button('ГОТОВО')
     if SUBMIT:
         points = test_mark = 'Teacher'
         appropriate_tasks = BAD_DISTRACTORS_AND_ANSWERS_temp["Задание уместно"].values.tolist()
requirements.txt
CHANGED
@@ -11,4 +11,5 @@ argon2-cffi>=21.3.0
 cryptography>=42.0.3
 transformers>=4.37.2
 streamlit-extras>=0.4.0
+bert-extractive-summarizer>=0.10.1
 es_core_news_lg @ https://github.com/explosion/spacy-models/releases/download/es_core_news_lg-3.7.0/es_core_news_lg-3.7.0-py3-none-any.whl
utilities/__pycache__/utils.cpython-310.pyc
CHANGED
Binary files a/utilities/__pycache__/utils.cpython-310.pyc and b/utilities/__pycache__/utils.cpython-310.pyc differ

utilities_cookies/__pycache__/cookie_manager.cpython-310.pyc
CHANGED
Binary files a/utilities_cookies/__pycache__/cookie_manager.cpython-310.pyc and b/utilities_cookies/__pycache__/cookie_manager.cpython-310.pyc differ

utilities_cookies/__pycache__/encrypted_cookie_manager.cpython-310.pyc
CHANGED
Binary files a/utilities_cookies/__pycache__/encrypted_cookie_manager.cpython-310.pyc and b/utilities_cookies/__pycache__/encrypted_cookie_manager.cpython-310.pyc differ

utilities_database/__pycache__/user_database_utils.cpython-310.pyc
CHANGED
Binary files a/utilities_database/__pycache__/user_database_utils.cpython-310.pyc and b/utilities_database/__pycache__/user_database_utils.cpython-310.pyc differ

utilities_database/__pycache__/user_database_widgets.cpython-310.pyc
CHANGED
Binary files a/utilities_database/__pycache__/user_database_widgets.cpython-310.pyc and b/utilities_database/__pycache__/user_database_widgets.cpython-310.pyc differ

utilities_language_bert/__pycache__/esp_main_workflow_bert.cpython-310.pyc
CHANGED
Binary files a/utilities_language_bert/__pycache__/esp_main_workflow_bert.cpython-310.pyc and b/utilities_language_bert/__pycache__/esp_main_workflow_bert.cpython-310.pyc differ

utilities_language_bert/__pycache__/esp_sentence_bert.cpython-310.pyc
CHANGED
Binary files a/utilities_language_bert/__pycache__/esp_sentence_bert.cpython-310.pyc and b/utilities_language_bert/__pycache__/esp_sentence_bert.cpython-310.pyc differ
utilities_language_bert/esp_main_workflow_bert.py
CHANGED
@@ -11,11 +11,13 @@ from utilities_language_general.esp_utils import prepare_tasks
 from utilities_language_general.esp_constants import load_bert
 from streamlit.runtime.uploaded_file_manager import UploadedFile
 import utilities_language_general.esp_constants as esp_constants
+from utilities_language_general.esp_constants import summarization
 from utilities_language_general.esp_utils import prepare_target_words
 from utilities_language_general.esp_utils import compute_frequency_dict
 from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
 
 
+
 def main_workflow(
         file: UploadedFile or None,
         text: str,
@@ -137,6 +139,22 @@ def main_workflow(
     logs.update(label="Запускаем процесс генерации заданий!", state='running')
     progress.progress(20)
 
+    # Define summary length
+    text_length = len(current_text_sentences)
+    if text_length <= 15:
+        summary_length = text_length
+    elif text_length <= 25:
+        summary_length = 15
+    else:
+        n = (text_length - 20) // 5
+        summary_length = 15 + 2 * n
+    round_summary_length = summary_length - (summary_length % -10)
+
+    # Get summary. May choose between round_summary_length and summary_length
+    SUMMARY = summarization(current_text, num_sentences=round_summary_length)
+    logs.success('Нашли интересные предложения. Пригодятся!')
+    progress.progress(25)
+
     for sentence in workflow:
         sentence.lemmatize_sentence()
 
@@ -149,7 +167,8 @@ def main_workflow(
         sentence.search_target_words(target_words_automatic_mode=tw_mode_automatic_mode,
                                      target_minimum=target_minimum,
                                      user_target_words=USER_TARGET_WORDS,
-                                     frequency_dict=FREQ_DICT)
+                                     frequency_dict=FREQ_DICT,
+                                     summary=SUMMARY)
         progress.progress(int(30 + (j * (20 / len(workflow)))))
     progress.progress(50)
     DUPLICATE_TARGET_WORDS = defaultdict(list)
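The changes above wire a summary step into task generation: the summary length grows with text size and is then rounded up to the nearest multiple of ten via `summary_length - (summary_length % -10)`. A standalone sketch of that heuristic (the function name is ours, for illustration only):

    def summary_length_for(text_length: int) -> int:
        # Mirrors the committed heuristic over len(current_text_sentences)
        if text_length <= 15:
            summary_length = text_length
        elif text_length <= 25:
            summary_length = 15
        else:
            n = (text_length - 20) // 5
            summary_length = 15 + 2 * n
        # In Python, x % -10 is <= 0, so subtracting it rounds up to a multiple of 10
        return summary_length - (summary_length % -10)

    assert summary_length_for(10) == 10   # short text: keep every sentence
    assert summary_length_for(23) == 20   # capped at 15, rounded up to 20
    assert summary_length_for(40) == 30   # 15 + 2*4 = 23, rounded up to 30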
utilities_language_bert/esp_sentence_bert.py
CHANGED
@@ -48,7 +48,7 @@ class SENTENCE:
             self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
             previous_was_phrase = False
 
-    def search_target_words_automatically(self, target_minimum: set, frequency_dict: dict = None):
+    def search_target_words_automatically(self, target_minimum: set, frequency_dict: dict = None, summary: list = None):
         for token in self.sentence_phrases:
             if isinstance(token, list):  # if token is a phrase
                 original_token1 = token[1]['original_token1']
@@ -79,7 +79,8 @@ class SENTENCE:
                     'tags': tags,
                     'position_in_sentence': self.original.find(original_token1.text),
                     'not_named_entity': not_ner,
-                    'frequency_in_text': 0
+                    'frequency_in_text': 0,
+                    'in_summary': self.original in summary
                 }
                 self.target_words.append(target_word)
             else:  # if token is just a spacy.nlp token
@@ -98,10 +99,11 @@ class SENTENCE:
                     'position_in_sentence': self.original.find(token.text),
                     'not_named_entity': True if token.ent_type == 0 else False,
                     'frequency_in_text': frequency_dict.get(token.lemma_, 1),
+                    'in_summary': self.original in summary
                 }
                 self.target_words.append(target_word)
 
-    def search_user_target_words(self, user_target_words: set = None, frequency_dict: dict = None):
+    def search_user_target_words(self, user_target_words: set = None, frequency_dict: dict = None, summary: list = None):
         for _utw in user_target_words:
             if _utw in self.original:
                 parse_utw = nlp(_utw)
@@ -137,19 +139,20 @@ class SENTENCE:
                     'tags': user_target_word_tags,
                     'position_in_sentence': self.original.find(_utw),
                     'not_named_entity': not_ner,
-                    'frequency_in_text': frequency_dict.get(user_target_word_lemma, 1)
+                    'frequency_in_text': frequency_dict.get(user_target_word_lemma, 1),
+                    'in_summary': self.original in summary
                 }
                 self.target_words.append(target_word)
 
     def search_target_words(self, target_words_automatic_mode: bool, target_minimum,
                             user_target_words: set = None,
-                            frequency_dict: dict = None):
+                            frequency_dict: dict = None, summary: list = None):
         if target_words_automatic_mode:
             self.search_target_words_automatically(target_minimum=target_minimum,
-                                                   frequency_dict=frequency_dict)
+                                                   frequency_dict=frequency_dict, summary=summary)
         else:
             self.search_user_target_words(user_target_words=user_target_words,
-                                          frequency_dict=frequency_dict)
+                                          frequency_dict=frequency_dict, summary=summary)
 
     def filter_target_words(self, target_words_automatic_mode):
         c_position = 0
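Every target word now carries an `in_summary` flag computed as `self.original in summary`. Since bert-extractive-summarizer returns its summary as a single string, despite the `summary: list = None` annotation this is most likely a substring test over the sentence text; a sketch with invented values:

    # Hypothetical values; summary stands for the string returned by Summarizer
    summary = 'El gato duerme en el sofá. Afuera llueve.'
    sentence_original = 'Afuera llueve.'
    in_summary = sentence_original in summary  # True: plain substring membership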
utilities_language_general/__pycache__/esp_constants.cpython-310.pyc
CHANGED
Binary files a/utilities_language_general/__pycache__/esp_constants.cpython-310.pyc and b/utilities_language_general/__pycache__/esp_constants.cpython-310.pyc differ

utilities_language_general/__pycache__/esp_utils.cpython-310.pyc
CHANGED
Binary files a/utilities_language_general/__pycache__/esp_utils.cpython-310.pyc and b/utilities_language_general/__pycache__/esp_utils.cpython-310.pyc differ

utilities_language_general/__pycache__/morphology.cpython-310.pyc
CHANGED
Binary files a/utilities_language_general/__pycache__/morphology.cpython-310.pyc and b/utilities_language_general/__pycache__/morphology.cpython-310.pyc differ
utilities_language_general/esp_constants.py
CHANGED
@@ -3,6 +3,7 @@ import spacy
 import gensim
 import streamlit as st
 from transformers import pipeline
+from summarizer import Summarizer
 
 
 @st.cache_resource
@@ -25,8 +26,12 @@ def load_bert():
     _pipeline = pipeline(task="fill-mask", model="a-v-white/bert-base-spanish-wwm-cased-finetuned-literature-pro")
     return _pipeline
 
+@st.cache_resource
+def load_summarizer():
+    return Summarizer()
 
 nlp = load_spacy()
+summarization = load_summarizer()
 w2v_model_1_path = r'model1.gz'
 w2v_model_2_path = r'model2.gz'
 
@@ -57,7 +62,32 @@ with open('language_data/phrases.json', 'r', encoding='utf-8') as f:
 with open('language_data/fix_irregular_lemma.json', 'r', encoding='utf-8') as f:
     FIX_LEMMA = json.load(f)
 
-SIMILARITY_VALUES = {'A1': 1.0, 'A2': 1.0, 'B1': 1.0, 'B2': 1.0, 'C1': 1.0, 'C2': 1.0, 'Без уровня': 1.0}
-SIMILARITY_VALUES_bert = {'A1': 1.0, 'A2': 1.0, 'B1': 1.0, 'B2': 1.0, 'C1': 1.0, 'C2': 1.0, 'Без уровня': 1.0}
-
 BAD_USER_TARGET_WORDS = []
+
+
+COMBINE_POS = {
+    'simple':
+        {
+            'A1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
+            'A2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
+            'B1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
+            'B2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
+                   'AUX_ADJ': ['PRON_VERB'], 'PRON_VERB': ['AUX_ADJ'], 'ADP': ['SCONJ', 'ADV'], },
+            'C1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
+                   'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
+            'C2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
+                   'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
+        },
+    'phrase':
+        {
+            'A1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
+            'A2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
+            'B1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX'], 'AUX': ['VERB']},
+            'B2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
+                   'AUX_ADJ': ['PRON_VERB'], 'PRON_VERB': ['AUX_ADJ'], 'ADP': ['SCONJ', 'ADV'], },
+            'C1': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
+                   'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
+            'C2': {'PRON': ['DET'], 'DET': ['PRON'], 'VERB': ['AUX', 'AUX_VERB'], 'AUX': ['VERB', 'AUX_VERB'], 'AUX_VERB': ['VERB', 'AUX'], 'AUX_AUX': ['AUX'],
+                   'ADJ': ['NOUN'], 'NOUN': ['ADJ']},
+        },
+}
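`load_summarizer()` caches a bert-extractive-summarizer model behind `st.cache_resource`, and the module-level `summarization` object is then called directly in both workflows. A minimal usage sketch, assuming the library's defaults (sample text invented):

    from summarizer import Summarizer

    summarization = Summarizer()                    # downloads/loads a BERT model once
    text = 'Primera frase. Segunda frase. Tercera frase.'
    summary = summarization(text, num_sentences=2)  # returns the summary as one string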
utilities_language_general/esp_utils.py
CHANGED
@@ -2,8 +2,7 @@ from nltk import edit_distance
 from utilities.utils import answer_letter
 from utilities_language_general.esp_constants import nlp
 from utilities_language_general.esp_constants import FIX_LEMMA
-from utilities_language_general.esp_constants import SIMILARITY_VALUES
-from utilities_language_general.esp_constants import SIMILARITY_VALUES_bert
+from utilities_language_general.esp_constants import COMBINE_POS
 
 
 def prepare_target_words(target_words):
@@ -111,6 +110,7 @@ def check_token_bert(token, current_minimum: set = None, check_allowed_pos: set
 def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None, global_distractors: set,
                                distractor_minimum: set, level_name: str, max_num_distractors: int,
                                max_length_ratio=5, min_edit_distance_ratio=0.5):
+
     distractors = []
     query = lemma if '_' in lemma else f'{lemma}_{pos}'
     lemma = '_'.join(lemma.split('_')[::2])
@@ -124,15 +124,16 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
     query_vector = model.get_mean_vector(query_parts)
     candidates = model.similar_by_vector(query_vector, topn=max_num_distractors + 100)
     for candidate in candidates:
-        if candidate[0].count('_') == 1:
+        if candidate[0].count('_') == 1 and pos != 'phrase':
             distractor_lemma, distractor_pos = candidate[0].split('_')
             distractor_similarity = candidate[1]
             candidate_gender = get_tags(distractor_lemma).get('Gender')
             length_ratio = abs(len(lemma) - len(distractor_lemma))
-            condition = ((distractor_pos == pos
-                          or (
+            condition = ((distractor_pos == pos
+                          or (COMBINE_POS['simple'][level_name].get(pos) is not None and COMBINE_POS['simple'][level_name].get(distractor_pos) is not None
+                              and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos])
+                          )
                          and distractor_lemma != lemma
-                         and distractor_similarity < SIMILARITY_VALUES[level_name]
                          and candidate_gender == gender
                          and length_ratio <= max_length_ratio
                          and distractor_lemma not in global_distractors
@@ -150,16 +151,18 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
         if candidate[0].count('_') > 3 or pos in ('NOUN', 'ADJ', 'NUM'):
             continue
         d1_lemma, d1_pos, d2_lemma, d2_pos = candidate[0].split('_')
+        d_pos = f'{d1_pos}_{d2_pos}'
         distractor_lemma = f'{d1_lemma}_{d2_lemma}'
         distractor_similarity = candidate[1]
         condition = (((d1_pos == pos or d2_pos == pos)
+                      or (COMBINE_POS['phrase'][level_name].get(d_pos) is not None and COMBINE_POS['phrase'][level_name].get(pos) is not None
+                          and d_pos in COMBINE_POS['phrase'][level_name].get(d_pos) and pos in COMBINE_POS['phrase'][level_name].get(pos))
                       or (d1_pos in ('VERB', 'AUX', 'SCONJ', 'ADP')
                           and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP'))
                       or (d2_pos in ('VERB', 'AUX', 'SCONJ', 'ADP')
                           and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP')))
                      and candidate[0] != lemma
                      and distractor_lemma != lemma
-                     and distractor_similarity < SIMILARITY_VALUES[level_name]
                      and distractor_lemma not in global_distractors)
         if condition:
             if distractor_minimum is not None:
@@ -202,11 +205,11 @@ def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: st
             distractor_similarity = candidate_distractor[1]
             candidate_gender = get_tags(distractor_lemma).get('Gender')
             length_ratio = abs(len(lemma) - len(distractor_lemma))
-            if ((
-                or (pos
+            if ((distractor_pos == pos
+                 or (COMBINE_POS['simple'][level_name].get(pos) is not None and COMBINE_POS['simple'][level_name].get(distractor_pos) is not None
+                     and distractor_pos in COMBINE_POS['simple'][level_name][pos] and pos in COMBINE_POS['simple'][level_name][distractor_pos]))
                 and distractor_lemma != lemma
                 and (len(_distractors) < max_num_distractors+100)
-               and (distractor_similarity < SIMILARITY_VALUES_bert[level_name])
                 and (candidate_gender == gender)
                 and (length_ratio <= max_length_ratio)  # May be changed if case of phrases
                 and (distractor_lemma not in global_distractors)
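The new conditions accept a distractor whose POS either matches the target's exactly or is mutually allowed at the learner's level in `COMBINE_POS`. The symmetric lookup, isolated as a predicate (the helper name is ours):

    def pos_combinable(table: dict, level: str, pos: str, other_pos: str) -> bool:
        # Both tags must have an entry for this level, and each must list the other
        allowed = table['simple'][level]
        return (allowed.get(pos) is not None and allowed.get(other_pos) is not None
                and other_pos in allowed[pos] and pos in allowed[other_pos])

    # e.g. pos_combinable(COMBINE_POS, 'C1', 'ADJ', 'NOUN') -> True
    #      pos_combinable(COMBINE_POS, 'A1', 'ADJ', 'NOUN') -> False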
utilities_language_w2v/__pycache__/esp_main_workflow_w2v.cpython-310.pyc
CHANGED
Binary files a/utilities_language_w2v/__pycache__/esp_main_workflow_w2v.cpython-310.pyc and b/utilities_language_w2v/__pycache__/esp_main_workflow_w2v.cpython-310.pyc differ

utilities_language_w2v/__pycache__/esp_sentence_w2v.cpython-310.pyc
CHANGED
Binary files a/utilities_language_w2v/__pycache__/esp_sentence_w2v.cpython-310.pyc and b/utilities_language_w2v/__pycache__/esp_sentence_w2v.cpython-310.pyc differ
utilities_language_w2v/esp_main_workflow_w2v.py
CHANGED
@@ -11,6 +11,7 @@ from utilities_language_general.esp_constants import load_w2v
 from utilities_language_general.esp_utils import prepare_tasks
 from streamlit.runtime.uploaded_file_manager import UploadedFile
 import utilities_language_general.esp_constants as esp_constants
+from utilities_language_general.esp_constants import summarization
 from utilities_language_general.esp_constants import w2v_model_1_path
 from utilities_language_general.esp_constants import w2v_model_2_path
 from utilities_language_general.esp_utils import prepare_target_words
@@ -146,6 +147,22 @@ def main_workflow(
     logs.update(label="Запускаем процесс генерации заданий!", state='running')
     progress.progress(20)
 
+    # Define summary length
+    text_length = len(current_text_sentences)
+    if text_length <= 15:
+        summary_length = text_length
+    elif text_length <= 25:
+        summary_length = 15
+    else:
+        n = (text_length - 20) // 5
+        summary_length = 15 + 2 * n
+    round_summary_length = summary_length - (summary_length % -10)
+
+    # Get summary. May choose between round_summary_length and summary_length
+    SUMMARY = summarization(current_text, num_sentences=round_summary_length)
+    logs.success('Нашли интересные предложения. Пригодятся!')
+    progress.progress(25)
+
     for sentence in workflow:
         sentence.lemmatize_sentence()
 
@@ -159,7 +176,8 @@ def main_workflow(
                                      target_words_automatic_mode=tw_mode_automatic_mode,
                                      target_minimum=target_minimum,
                                      user_target_words=USER_TARGET_WORDS,
-                                     frequency_dict=FREQ_DICT)
+                                     frequency_dict=FREQ_DICT,
+                                     summary=SUMMARY)
         progress.progress(int(30 + (j * (30 / len(workflow)))))
     progress.progress(60)
     DUPLICATE_TARGET_WORDS = defaultdict(list)
@@ -217,7 +235,12 @@ def main_workflow(
         NUMBER_TASKS = 10
     else:
         NUMBER_TASKS = len(RESULT_TASKS)
-
+    RESULT_TASKS_in_summary = filter(lambda task: task.in_summary, RESULT_TASKS)
+    RESULT_TASTS_not_in_summary = filter(lambda task: not task.in_summary, RESULT_TASKS)
+    if len(RESULT_TASKS_in_summary) >= NUMBER_TASKS:
+        RESULT_TASKS = RESULT_TASKS_in_summary
+    else:
+        RESULT_TASKS = RESULT_TASKS_in_summary + sample(RESULT_TASTS_not_in_summary, NUMBER_TASKS - len(RESULT_TASKS_in_summary))
     RESULT_TASKS = sorted(RESULT_TASKS, key=lambda t: (t.sentence_number, t.position_in_sentence))
 
     for task in RESULT_TASKS:
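One caveat in the new task-selection step: `filter()` returns a lazy iterator, so `len(...)` and `+` on `RESULT_TASKS_in_summary` would raise `TypeError` at runtime (note also the `RESULT_TASTS_not_in_summary` spelling). A working equivalent, assuming `sample` is `random.sample`, materializes both groups first:

    from random import sample

    in_summary = [task for task in RESULT_TASKS if task.in_summary]
    not_in_summary = [task for task in RESULT_TASKS if not task.in_summary]
    if len(in_summary) >= NUMBER_TASKS:
        RESULT_TASKS = in_summary
    else:
        # Top up with randomly sampled tasks that fell outside the summary
        RESULT_TASKS = in_summary + sample(not_in_summary, NUMBER_TASKS - len(in_summary))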
utilities_language_w2v/esp_sentence_w2v.py
CHANGED
@@ -47,7 +47,7 @@ class SENTENCE:
             self.sentence_phrases.append(self.sentence_lemma_pos[i][1])
             previous_was_phrase = False
 
-    def search_target_words_automatically(self, model, target_minimum: set, frequency_dict: dict = None):
+    def search_target_words_automatically(self, model, target_minimum: set, frequency_dict: dict = None, summary: list = None):
         for token in self.sentence_phrases:
             if isinstance(token, list):  # if token is a phrase
                 original_token1 = token[1]['original_token1']
@@ -76,7 +76,8 @@ class SENTENCE:
                     'tags': tags,
                     'position_in_sentence': self.original.find(original_token1.text),
                     'not_named_entity': not_ner,
-                    'frequency_in_text': 0
+                    'frequency_in_text': 0,
+                    'in_summary': self.original in summary
                 }
                 self.target_words.append(target_word)
             else:  # if token is just a spacy.nlp token
@@ -94,10 +95,11 @@ class SENTENCE:
                     'position_in_sentence': self.original.find(token.text),
                     'not_named_entity': True if token.ent_type == 0 else False,
                     'frequency_in_text': frequency_dict.get(token.lemma_, 1),
+                    'in_summary': self.original in summary
                 }
                 self.target_words.append(target_word)
 
-    def search_user_target_words(self, model, user_target_words: set = None, frequency_dict: dict = None):
+    def search_user_target_words(self, model, user_target_words: set = None, frequency_dict: dict = None, summary: list = None):
         for _utw in user_target_words:
             if _utw in self.original:
                 parse_utw = nlp(_utw)
@@ -132,7 +134,8 @@ class SENTENCE:
                     'tags': user_target_word_tags,
                     'position_in_sentence': self.original.find(_utw),
                     'not_named_entity': not_ner,
-                    'frequency_in_text': frequency_dict.get(user_target_word_lemma, 1)
+                    'frequency_in_text': frequency_dict.get(user_target_word_lemma, 1),
+                    'in_summary': self.original in summary
                 }
                 if not (model.has_index_for(user_target_word_lemma)
                         or model.has_index_for(f'{user_target_word_lemma}_{user_target_word_pos}')):
@@ -140,14 +143,14 @@ class SENTENCE:
                 else:
                     self.target_words.append(target_word)
 
-    def search_target_words(self, model, target_words_automatic_mode: bool, target_minimum,
+    def search_target_words(self, model, target_words_automatic_mode: bool, target_minimum, summary: list = None,
                             user_target_words: set = None, frequency_dict: dict = None):
         if target_words_automatic_mode:
             self.search_target_words_automatically(model=model, target_minimum=target_minimum,
-                                                   frequency_dict=frequency_dict)
+                                                   frequency_dict=frequency_dict, summary=summary)
         else:
             self.search_user_target_words(model=model, user_target_words=user_target_words,
-                                          frequency_dict=frequency_dict)
+                                          frequency_dict=frequency_dict, summary=summary)
 
     def attach_distractors_to_target_word(self, model, global_distractors, distractor_minimum, level_name,
                                           max_frequency, logs, progress):
utilities_option_menu/__pycache__/option_menu.cpython-310.pyc
CHANGED
Binary files a/utilities_option_menu/__pycache__/option_menu.cpython-310.pyc and b/utilities_option_menu/__pycache__/option_menu.cpython-310.pyc differ

utilities_ui/__pycache__/custom_download_button.cpython-310.pyc
CHANGED
Binary files a/utilities_ui/__pycache__/custom_download_button.cpython-310.pyc and b/utilities_ui/__pycache__/custom_download_button.cpython-310.pyc differ