a-v-bely committed · Commit 41e198b
1 Parent(s): 08d25e9
Update backend
- utilities_database/user_database_widgets.py +2 -2
- utilities_language_bert/esp_main_workflow_bert.py +31 -34
- utilities_language_bert/esp_sentence_bert.py +4 -4
- utilities_language_general/esp_constants.py +7 -3
- utilities_language_general/esp_utils.py +4 -6
- utilities_language_w2v/esp_main_workflow_w2v.py +24 -27
- utilities_language_w2v/esp_sentence_w2v.py +11 -10
- utilities_ui/custom_download_button.py +1 -1
utilities_database/user_database_widgets.py CHANGED
@@ -123,7 +123,7 @@ class LogIn:
                                   email_sign_up=email_sign_up)

         user_name_sign_up = st.text_input("Имя пользователя *",
-                                          placeholder='Введите имя пользователя')
+                                          placeholder='Введите имя пользователя (латинские буквы и символы)')
         unique_user_name_check = db_utils.check_unique_usr(user_log_in_database=db,
                                                            user_name_sign_up=user_name_sign_up)

@@ -206,7 +206,7 @@ class LogIn:
         new_passwd = st.text_input("Новый пароль", placeholder='Введите новый пароль',
                                    type='password')

-        new_passwd_1 = st.text_input("Повторите новый пароль", placeholder='
+        new_passwd_1 = st.text_input("Повторите новый пароль", placeholder='Повторите пароль',
                                      type='password')

         reset_passwd_submit_button = st.form_submit_button(label='Изменить пароль')
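For context, both tweaked inputs sit inside a Streamlit form. A minimal sketch of the reset-password block as the diff suggests it looks (the st.form wrapper and its key are assumptions; the widgets and labels come from the diff itself):

import streamlit as st

with st.form('reset_password'):  # form key is hypothetical
    new_passwd = st.text_input('Новый пароль', placeholder='Введите новый пароль',
                               type='password')
    new_passwd_1 = st.text_input('Повторите новый пароль', placeholder='Повторите пароль',
                                 type='password')
    reset_passwd_submit_button = st.form_submit_button(label='Изменить пароль')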
utilities_language_bert/esp_main_workflow_bert.py CHANGED
@@ -20,9 +20,8 @@ def main_workflow(
         file: UploadedFile or None,
         text: str,
         logs: ST_WIDGETS,
-        logs_d: ST_WIDGETS,
         progress: st_progress,
-
+        progress_d: st_progress,
         level: str,
         tw_mode_automatic_mode: str,
         target_words: str,
@@ -37,26 +36,26 @@
     :param file: user's file to generate tasks in
     :param text: user's text input to generate tasks in
     :param logs: widget to output logs to
-    :param logs_d: show how many distractors already processed
     :param progress: progress bar
-    :param
+    :param progress_d: sentences progress bar
     :param target_words: how target words are chosen: by user or automatically
     :param tw_mode_automatic_mode:
     :param level: user's specification of CEFR level of text
     :param num_distractors: how many distractors does the user want the task to contain
     :param save_name: user specifies name to save file in cloud
-    :param global_bad_target_words:
+    :param global_bad_target_words: global bad target words
     :return: Dictionary with output data: filename, amount_mode, text_with_gaps, tasks_as_list, correct_answers,
              student_out, teacher_out, total_out, original_text
     """
     # Clear bad target_words each time
-    global_bad_target_words
+    if global_bad_target_words:
+        global_bad_target_words = []

     # Define main global variables
-    logs.write()
     GLOBAL_DISTRACTORS = set()
     MAX_FREQUENCY = 0

+    logs.update(label='Загружаем языковые модели и другие данные', state='running')
     mask_filler = load_bert()

     # Get input text
@@ -84,11 +83,12 @@

     # Text preprocessing
     original_text = current_text
-    current_text = current_text.replace('.', '. ').replace('. . .', '...')
-
-
+    current_text = (current_text.replace('.', '. ').replace('. . .', '...')
+                    .replace(' ', ' ').replace('…', '...').replace('…', '...')
+                    .replace('—', '-').replace('\u2014', '-').replace('—', '-')
+                    .replace('-\n', '').replace('\n', '%^&*'))
     current_text_sentences = [sent.text.strip() for sent in esp_constants.nlp(current_text).sents]
-    logs.
+    logs.update(label='Получили Ваш текст!', state='running')
     progress.progress(10)

     # Compute frequency dict
@@ -100,7 +100,7 @@
         if j < len(FREQ_DICT) * _frequency_barrier_percent:
             MAX_FREQUENCY = tp[1]
     MAX_FREQUENCY = 3 if MAX_FREQUENCY < 3 else MAX_FREQUENCY
-    logs.
+    logs.update(label="Посчитали немного статистики!", state='running')
     progress.progress(15)

     # Choose necessary language minimum according to user's input
@@ -134,7 +134,7 @@
     # Start generation process
     workflow = [SENTENCE(original=sent.strip(), n_sentence=num, max_num_distractors=num_distractors)
                 for num, sent in enumerate(current_text_sentences)]
-    logs.
+    logs.update(label="Запускаем процесс генерации заданий!", state='running')
     progress.progress(20)

     for sentence in workflow:
@@ -142,7 +142,7 @@

     for sentence in workflow:
         sentence.bind_phrases()
-    logs.
+    logs.update(label="Подготовили предложения для дальнейшей работы!", state='running')
     progress.progress(30)

     for j, sentence in enumerate(workflow):
@@ -151,7 +151,7 @@
                                      user_target_words=USER_TARGET_WORDS,
                                      frequency_dict=FREQ_DICT)
         progress.progress(int(30 + (j * (20 / len(workflow)))))
-
+    progress.progress(50)
     DUPLICATE_TARGET_WORDS = defaultdict(list)
     for sentence in workflow:
         for target_word in sentence.target_words:
@@ -164,8 +164,8 @@
             if target_word not in RESULT_TW:
                 global_bad_target_words.append(target_word['original_text'])
                 sentence.target_words.remove(target_word)
-
-    logs.
+    progress.progress(55)
+    logs.update(label='Выбрали слова-пропуски!', state='running')

     for sentence in workflow:
         for i, target_word in enumerate(sentence.target_words):
@@ -176,7 +176,7 @@

     for sentence in workflow:
         sentence.filter_target_words(target_words_automatic_mode=tw_mode_automatic_mode)
-
+    progress.progress(60)

     RESULT_TASKS = []
     for sentence in workflow:
@@ -189,21 +189,21 @@
                                    global_distractors=GLOBAL_DISTRACTORS,
                                    distractor_minimum=distractor_minimum,
                                    max_frequency=MAX_FREQUENCY)
-
-
-
-
-
-    logs.
+        progress_d.progress(num / len(RESULT_TASKS))
+        logs.update(label=f'Обработали {num}/{len(RESULT_TASKS)} целевых слов!', state='running')
+    logs.update(label=f'Обработали {len(RESULT_TASKS)}/{len(RESULT_TASKS)} целевых слов!', state='running')
+    progress_d.progress(100)
+    progress.progress(70)
+    logs.update(label='Подобрали неправильные варианты!', state='running')

     for task in RESULT_TASKS:
         task.inflect_distractors()
-
-    logs.
+    progress.progress(80)
+    logs.update(label='Просклоняли и проспрягали неправильные варианты!', state='running')

     for task in RESULT_TASKS:
         task.sample_distractors(num_distractors=num_distractors)
-
+    progress.progress(85)
     RESULT_TASKS = list(filter(lambda t: not t.bad_target_word, RESULT_TASKS))

     for task in RESULT_TASKS[::-1]:
@@ -226,8 +226,8 @@

     for task in RESULT_TASKS:
         task.compile_task(max_num_distractors=num_distractors)
-
-    logs.
+    progress.progress(90)
+    logs.update(label='Отобрали лучшие задания!', state='running')

     TEXT_WITH_GAPS = []
     VARIANTS = []
@@ -241,9 +241,6 @@
         TEXT_WITH_GAPS.append(sentence)
     del RESULT_TASKS

-    logs.success('Сейчас все будет готово!')
-    progress_s.progress(90)
-
     TEXT_WITH_GAPS = ' '.join([sentence for sentence in TEXT_WITH_GAPS]).replace('%^&*', '\n')
     PREPARED_TASKS = prepare_tasks(VARIANTS)
     STUDENT_OUT = f'{TEXT_WITH_GAPS}\n\n{"=" * 70}\n\n{PREPARED_TASKS["TASKS_STUDENT"]}'
@@ -251,8 +248,8 @@
                   f'{PREPARED_TASKS["KEYS_ONLY"]}'
     TOTAL_OUT = f'{original_text}\n\n{"$" * 70}\n\n{STUDENT_OUT}\n\n{"=" * 70}\n\n{PREPARED_TASKS["TASKS_TEACHER"]}' \
                 f'\n\n{"$" * 70}\n\n{PREPARED_TASKS["KEYS_ONLY"]}'
-    logs.
-
+    logs.update(label='Сейчас все будет готово!', state='running')
+    progress.progress(95)
     save_name = save_name if save_name != '' else f'{str(datetime.datetime.now())[:-7]}_{original_text[:20]}'
     out = {
         'name': save_name,
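The replaced logs.write()/logs.success() calls give way to logs.update(label=..., state=...), which matches the API of Streamlit's st.status container, so the workflow now appears to receive such a container rather than a plain placeholder. A minimal sketch of how a caller could wire the new signature, assuming exactly that (names here are illustrative, not the app's own page code):

import streamlit as st

logs = st.status('Генерируем задания...', state='running')  # exposes .update(label=..., state=...)
progress = st.progress(0)      # overall pipeline progress (ints 0-100)
progress_d = st.progress(0.0)  # per-target-word progress (floats 0.0-1.0)

logs.update(label='Получили Ваш текст!', state='running')
progress.progress(10)
# ... main_workflow(..., logs=logs, progress=progress, progress_d=progress_d) ...
logs.update(label='Готово!', state='complete')
progress.progress(100)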
utilities_language_bert/esp_sentence_bert.py CHANGED
@@ -208,7 +208,7 @@ class TASK:
             self.bad_target_word = True
             self.distractors = None
         else:
-            self.distractors = [d[0] for i, d in enumerate(distractors_sentence) if i <
+            self.distractors = [d[0] for i, d in enumerate(distractors_sentence) if i < 30]
         self.distractors_number = len(distractors_sentence) if distractors_sentence is not None else 0

     def inflect_distractors(self):
@@ -238,8 +238,7 @@ class TASK:
     def sample_distractors(self, num_distractors):
         if not self.bad_target_word:
             num_distractors = min(self.distractors_number, num_distractors) if num_distractors >= 4 else num_distractors
-            self.inflected_distractors = sample(self.inflected_distractors
-                                                num_distractors)
+            self.inflected_distractors = sample(self.inflected_distractors, num_distractors)

     def compile_task(self, max_num_distractors):
         len_distractors = len(self.inflected_distractors)
@@ -248,7 +247,8 @@ class TASK:
         letters = (f'({letter})' for letter in string.ascii_lowercase[:len_variants + 1])
         try:
             distractors = sample(self.inflected_distractors, len_variants) + [self.original_text, ]
-        except ValueError:
+        except ValueError as e:
+            print(f'{e}\n{len_distractors=}\n{len_variants=}')
             distractors = self.inflected_distractors + [self.original_text, ]
         tmp_vars = [f'{item[0]} {item[1].replace("_", " ")}'.lower()
                     for item in zip(letters, sorted(distractors, key=lambda _: random()))]
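The widened except branch exists because random.sample raises ValueError when asked for more items than the population holds; the new print records the sizes before the code falls back to using all distractors. A self-contained illustration of that failure mode (toy data, not the app's):

from random import sample

inflected_distractors = ['casa', 'perro']
len_variants = 5
try:
    distractors = sample(inflected_distractors, len_variants)  # 5 > 2 -> ValueError
except ValueError as e:
    print(f'{e}\n{len_variants=}')  # "Sample larger than population or is negative"
    distractors = inflected_distractors + ['original', ]       # fallback, as in compile_task
print(distractors)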
utilities_language_general/esp_constants.py CHANGED
@@ -7,19 +7,23 @@ from transformers import pipeline

 @st.cache_resource
 def load_w2v(model_path):
-
+    with st.spinner('Загружаю языковую модель'):
+        _w2v_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
     return _w2v_model


 @st.cache_resource
 def load_spacy():
-
+    with st.spinner('Загружаю морфо-синтаксический парсер'):
+        _nlp = spacy.load('es_core_news_lg')
     return _nlp


 @st.cache_resource
 def load_bert():
-
+    with st.spinner('Загружаю языковую модель'):
+        _pipeline = pipeline(task="fill-mask", model="a-v-white/bert-base-spanish-wwm-cased-finetuned-literature-pro")
+    return _pipeline


 nlp = load_spacy()
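All three loaders follow the same pattern: @st.cache_resource runs the body once per process and reuses the returned object across reruns and sessions, while st.spinner shows its message only during that first real load. A reduced sketch of the pattern with a stand-in payload (not one of the app's actual models):

import streamlit as st

@st.cache_resource
def load_expensive_model():
    with st.spinner('Загружаю модель...'):
        _model = {'weights': list(range(1_000_000))}  # stand-in for gensim/spacy/transformers
    return _model

model = load_expensive_model()  # instant on every rerun after the first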
utilities_language_general/esp_utils.py CHANGED
@@ -132,7 +132,6 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
         condition = ((distractor_pos == pos
                       or (distractor_pos in ('VERB', 'ADJ', 'phrase') and pos in ('VERB', 'ADJ', 'phrase')))
                      and distractor_lemma != lemma
-                     and len(distractors) < 100
                      and distractor_similarity < SIMILARITY_VALUES[level_name]
                      and candidate_gender == gender
                      and length_ratio <= max_length_ratio
@@ -160,7 +159,6 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
                               and pos in ('phrase', 'VERB', 'AUX', 'SCONJ', 'ADP')))
                          and candidate[0] != lemma
                          and distractor_lemma != lemma
-                         and len(distractors) < 100
                          and distractor_similarity < SIMILARITY_VALUES[level_name]
                          and distractor_lemma not in global_distractors)
             if condition:
@@ -173,10 +171,10 @@ def get_distractors_from_model(model, lemma: str, pos: str, gender: str or None,
             distractors.append((candidate[0], distractor_similarity))
             global_distractors.add(distractor_lemma)
     max_num_distractors = min(4, max_num_distractors) if max_num_distractors >= 4 else max_num_distractors
-    if len(distractors)
-        return distractors
-    else:
+    if len(distractors) < max_num_distractors:
         return None
+    else:
+        return distractors


 def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: str, pos: str, gender: str or None,
@@ -207,7 +205,7 @@ def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: str, pos: str, gender: str or None,
         if (((distractor_pos == pos)
              or (pos in ('VERB', 'ADJ', 'phrase') and distractor_pos in ('VERB', 'ADJ', 'phrase')))
                 and distractor_lemma != lemma
-                and (len(_distractors) < max_num_distractors+
+                and (len(_distractors) < max_num_distractors+100)
                 and (distractor_similarity < SIMILARITY_VALUES_bert[level_name])
                 and (candidate_gender == gender)
                 and (length_ratio <= max_length_ratio)  # May be changed if case of phrases
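The net effect of these hunks: the per-candidate len(distractors) < 100 cap is dropped from the filters, and instead the whole candidate set is rejected at the end when fewer than max_num_distractors survive. The control flow, reduced to a skeleton (the stand-in filter below replaces the real similarity/POS/gender conditions, which are much richer):

def pick_distractors(candidates, max_num_distractors):
    distractors = [c for c in candidates if c is not None]  # stand-in for the real filters
    max_num_distractors = min(4, max_num_distractors) if max_num_distractors >= 4 else max_num_distractors
    if len(distractors) < max_num_distractors:
        return None  # caller treats the target word as bad
    else:
        return distractors

print(pick_distractors(['uno', None, 'dos'], 4))              # None: only 2 survive
print(pick_distractors(['uno', 'dos', 'tres', 'cuatro'], 4))  # all 4 returned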
utilities_language_w2v/esp_main_workflow_w2v.py CHANGED
@@ -22,10 +22,8 @@ def main_workflow(
         file: UploadedFile or None,
         text: str,
         logs: ST_WIDGETS,
-        logs_d: ST_WIDGETS,
         progress: st_progress,
         progress_d: st_progress,
-        progress_s: st_progress,
         level: str,
         tw_mode_automatic_mode: str,
         target_words: str,
@@ -41,10 +39,8 @@
     :param file: user's file to generate tasks in
     :param text: user's text input to generate tasks in
     :param logs: widget to output logs to
-    :param logs_d: show how many distractors already processed
     :param progress: progress bar
     :param progress_d: distractors progress bar
-    :param progress_s: sentences progress bar
     :param target_words: how target words are chosen: by user or automatically
     :param tw_mode_automatic_mode:
     :param level: user's specification of CEFR level of text
@@ -55,11 +51,12 @@
     :return: Dictionary with output data: filename, amount_mode, text_with_gaps, tasks_as_list, correct_answers,
              student_out, teacher_out, total_out, original_text
     """
+
     # Clear bad target_words each time
-    global_bad_target_words
+    if global_bad_target_words:
+        global_bad_target_words = []

     # Define main global variables
-    logs.write()
     GLOBAL_DISTRACTORS = set()
     MAX_FREQUENCY = 0

@@ -88,11 +85,12 @@

     # Text preprocessing
     original_text = current_text
-    current_text = current_text.replace('.', '. ').replace('. . .', '...')
-
-
+    current_text = (current_text.replace('.', '. ').replace('. . .', '...')
+                    .replace(' ', ' ').replace('…', '...').replace('…', '...')
+                    .replace('—', '-').replace('\u2014', '-').replace('—', '-')
+                    .replace('-\n', '').replace('\n', '%^&*'))
     current_text_sentences = [sent.text.strip() for sent in esp_constants.nlp(current_text).sents]
-    logs.
+    logs.update(label='Получили Ваш текст!', state='running')
     progress.progress(10)

     # Compute frequency dict
@@ -104,7 +102,7 @@
         if j < len(FREQ_DICT) * _frequency_barrier_percent:
             MAX_FREQUENCY = tp[1]
     MAX_FREQUENCY = 3 if MAX_FREQUENCY < 3 else MAX_FREQUENCY
-    logs.
+    logs.update(label="Посчитали немного статистики!", state='running')
     progress.progress(15)

     # Choose necessary language minimum according to user's input
@@ -136,7 +134,7 @@
         st.stop()

     # Define which model is used for distractor generation
-
+    logs.update(label='Загружаем языковые модели и другие данные', state='running')
     if model_name == 'Модель-1':
         mask_filler = load_w2v(w2v_model_1_path)
     else:
@@ -145,7 +143,7 @@
     # Start generation process
     workflow = [SENTENCE(original=sent.strip(), n_sentence=num, max_num_distractors=num_distractors)
                 for num, sent in enumerate(current_text_sentences)]
-    logs.
+    logs.update(label="Запускаем процесс генерации заданий!", state='running')
     progress.progress(20)

     for sentence in workflow:
@@ -153,7 +151,7 @@

     for sentence in workflow:
         sentence.bind_phrases()
-    logs.
+    logs.update(label="Подготовили предложения для дальнейшей работы!", state='running')
     progress.progress(30)

     for j, sentence in enumerate(workflow):
@@ -163,7 +161,7 @@
                                      user_target_words=USER_TARGET_WORDS,
                                      frequency_dict=FREQ_DICT)
         progress.progress(int(30 + (j * (30 / len(workflow)))))
-
+    progress.progress(60)
     DUPLICATE_TARGET_WORDS = defaultdict(list)
     for sentence in workflow:
         for target_word in sentence.target_words:
@@ -176,8 +174,8 @@
             if target_word not in RESULT_TW:
                 global_bad_target_words.append(target_word['original_text'])
                 sentence.target_words.remove(target_word)
-
-    logs.
+    progress.progress(65)
+    logs.update(label='Выбрали слова-пропуски!', state='running')

     for sentence in workflow:
         sentence.attach_distractors_to_target_word(model=mask_filler,
@@ -185,22 +183,21 @@
                                                    distractor_minimum=distractor_minimum,
                                                    level_name=level,
                                                    max_frequency=MAX_FREQUENCY,
-                                                   progress=progress_d
-
-
-    logs.success('Подобрали неправильные варианты!')
+                                                   logs=logs, progress=progress_d)
+    progress.progress(70)
+    logs.update(label='Подобрали неправильные варианты!', state='running')
     for sentence in workflow:
         sentence.inflect_distractors()
-
-    logs.
+    progress.progress(80)
+    logs.update(label='Просклоняли и проспрягали неправильные варианты!', state='running')

     for sentence in workflow:
         sentence.filter_target_words(target_words_automatic_mode=tw_mode_automatic_mode)

     for sentence in workflow:
         sentence.sample_distractors(num_distractors=num_distractors)
-
-    logs.
+    progress.progress(90)
+    logs.update(label='Отобрали лучшие задания!', state='running')

     RESULT_TASKS = []
     for sentence in workflow:
@@ -244,8 +241,8 @@
                   f'{PREPARED_TASKS["KEYS_ONLY"]}'
     TOTAL_OUT = f'{original_text}\n\n{"$" * 70}\n\n{STUDENT_OUT}\n\n{"=" * 70}\n\n{PREPARED_TASKS["TASKS_TEACHER"]}' \
                 f'\n\n{"$" * 70}\n\n{PREPARED_TASKS["KEYS_ONLY"]}'
-    logs.
-
+    logs.update(label='Сейчас все будет готово!', state='running')
+    progress.progress(90)
     save_name = save_name if save_name != '' else f'{str(datetime.datetime.now())[:-7]}_{original_text[:20]}'
     out = {
         'name': save_name,
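One detail worth tracing in this workflow: the sentence loop maps its index onto the 30-60% band of the overall bar via int(30 + (j * (30 / len(workflow)))), and the new progress.progress(60) after the loop snaps the bar to the end of that band. A quick check of the arithmetic:

workflow_len = 4
for j in range(workflow_len):
    print(j, int(30 + (j * (30 / workflow_len))))
# 0 30
# 1 37
# 2 45
# 3 52
# the loop never reaches 60 on its own, hence the explicit progress.progress(60)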
utilities_language_w2v/esp_sentence_w2v.py CHANGED
@@ -150,7 +150,7 @@ class SENTENCE:
                                      frequency_dict=frequency_dict)

     def attach_distractors_to_target_word(self, model, global_distractors, distractor_minimum, level_name,
-                                          max_frequency,
+                                          max_frequency, logs, progress):
         n_target_words = len(self.target_words)
         bad_target_words = []
         for i, target_word in enumerate(self.target_words):
@@ -165,13 +165,14 @@
             target_word['distractors'] = distractors
             target_word['distractors_number'] = len(distractors) if distractors is not None else 0
             progress.progress(i / n_target_words)
-            logs.
+            logs.update(label=f'Обработали {i}/{n_target_words} слов в {self.n_sentence + 1}-м предложении',
+                        state='running')
+        progress.progress(100)
         for btw in bad_target_words:
             BAD_USER_TARGET_WORDS.append(btw['original_text'])
             self.target_words.remove(btw)
-
-
-                    f'Обработали {n_target_words}/{n_target_words} слов в {self.n_sentence + 1}-м предложении')
+        logs.update(label=f'Обработали {n_target_words}/{n_target_words} слов в {self.n_sentence + 1}-м предложении',
+                    state='running')

     def inflect_distractors(self):
         bad_target_words = []
@@ -184,7 +185,7 @@
                     inflected = inflect(lemma=distractor_lemma, target_pos=target_word['pos'],
                                         target_tags=target_word['tags'])
                 else:
-                    continue
+                    continue  # TODO
             else:
                 inflected = inflect(lemma=distractor_lemma, target_pos=target_word['pos'],
                                     target_tags=target_word['tags'])
@@ -217,8 +218,7 @@
             len_inflected_distractors = len(target_word['inflected_distractors'])
             num_distractors = min(len_inflected_distractors, num_distractors) \
                 if num_distractors >= 4 else num_distractors
-            target_word['inflected_distractors'] = sample(target_word['inflected_distractors']
-                                                          len_inflected_distractors, 10)], num_distractors)
+            target_word['inflected_distractors'] = sample(target_word['inflected_distractors'], num_distractors)


 class TASK:
@@ -240,12 +240,13 @@

     def compile_task(self, max_num_distractors):
         len_distractors = len(self.inflected_distractors)
-        len_variants = min(len_distractors, max_num_distractors) if max_num_distractors
+        len_variants = min(len_distractors, max_num_distractors) if max_num_distractors >= 4 \
             else max_num_distractors
         letters = (f'({letter})' for letter in string.ascii_lowercase[:len_variants + 1])
         try:
             distractors = sample(self.inflected_distractors, len_variants) + [self.original_text, ]
-        except ValueError:
+        except ValueError as e:
+            print(f'{e}\n{len_distractors=}\n{len_variants=}')
             distractors = self.inflected_distractors + [self.original_text, ]
         self.variants.append(
             (self.original_text, [f'{item[0]} {item[1].replace("_", " ").lower()}'.lower()
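The new debug print(f'{e}\n{len_distractors=}\n{len_variants=}') relies on the f-string "=" specifier (Python 3.8+), which renders both the expression and its value:

len_distractors, len_variants = 2, 5
print(f'{len_distractors=}\n{len_variants=}')
# len_distractors=2
# len_variants=5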
utilities_ui/custom_download_button.py CHANGED
@@ -4,7 +4,7 @@ import uuid
 import base64
 import streamlit as st
 from typing import Optional, Union
-from streamlit.elements.button import DownloadButtonDataType
+from streamlit.elements.widgets.button import DownloadButtonDataType

 DownloadButtonDataType = Union[DownloadButtonDataType, "pd.DataFrame", "Styler"]
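This one-line change tracks Streamlit's internal reorganization: newer releases moved the button module under streamlit.elements.widgets. Since both paths are private internals, a version-tolerant fallback (an assumption, not what the commit does) would try both:

try:
    from streamlit.elements.widgets.button import DownloadButtonDataType  # newer Streamlit
except ImportError:
    from streamlit.elements.button import DownloadButtonDataType          # older Streamlit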