togokah committed · Commit 9efc4ef · Parent: 015d17f

Prepare for experiment and add morphology to bert
pages/2_👨🏫_Начало_работы.py
CHANGED
@@ -128,13 +128,13 @@ if st.session_state.get('-LOGGED_IN_BOOL-'):
         key='-TARGET_WORDS_MODE-', horizontal=True)
     DISTRACTOR_MODEL = DISTRACTOR_MODEL_COL.radio(
         label='**Модель для выбора неправильных вариантов**',
-        options=['
+        options=['Модель-1', 'Модель-2'],
         key='-DISTRACTOR_MODEL_MODE-', horizontal=True)
     CEFR_NUM_DISTRACTORS_COL, UTW_COL = FORM.columns([2, 2])
     with CEFR_NUM_DISTRACTORS_COL:
         CEFR_TEXT_LEVEL = custom_select_box(
             'Укажите уровень по CEFR:',
-            ['A1', 'A2', 'B1', 'B2', 'C1', 'C2'],
+            ['Без уровня', 'A1', 'A2', 'B1', 'B2', 'C1', 'C2'],
            no_selection_label='-Выберите языковой уровень-')
     st.session_state['-CEFR_TEXT_LEVEL-'] = CEFR_TEXT_LEVEL
     NUMBER_DISTRACTORS = CEFR_NUM_DISTRACTORS_COL.number_input(
@@ -186,7 +186,7 @@
     PROGRESS_BAR_S = st.progress(0)

     # Start generation process. Everything happens inside main_workflow func
-    if DISTRACTOR_MODEL == '
+    if DISTRACTOR_MODEL == 'Модель-2':
         from utilities_language_bert.esp_main_workflow_bert import main_workflow
         __TASK_DATA__ = main_workflow(
             file=UPLOAD_FILE,
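For context, a minimal self-contained sketch (not code from this commit) of the pattern this page relies on: st.radio returns the selected label string, and the 'Модель-2' label routes generation to the BERT workflow. The assumption that 'Модель-1' falls back to the w2v workflow is mine; only the 'Модель-2' branch is visible in the hunk above.

# Hypothetical dispatch sketch; labels taken from the diff, routing assumed.
import streamlit as st

model = st.radio('Модель для выбора неправильных вариантов',
                 options=['Модель-1', 'Модель-2'], horizontal=True)
if model == 'Модель-2':
    st.write('BERT workflow')  # esp_main_workflow_bert.main_workflow in the app
else:
    st.write('w2v workflow')   # esp_main_workflow_w2v.main_workflow, presumably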
utilities_language_bert/esp_main_workflow_bert.py
CHANGED
@@ -122,7 +122,7 @@ def main_workflow(
     elif level == 'C2':
         target_minimum = esp_constants.c2_target_set
         distractor_minimum = esp_constants.c2_distractor_set
-    elif level == '
+    elif level == 'Без уровня':
         target_minimum = None
         distractor_minimum = None
     else:
@@ -150,8 +150,8 @@
             target_minimum=target_minimum,
             user_target_words=USER_TARGET_WORDS,
             frequency_dict=FREQ_DICT)
-        progress.progress(int(30 + (j * (
-    progress_s.progress(
+        progress.progress(int(30 + (j * (20 / len(workflow)))))
+    progress_s.progress(50)
     DUPLICATE_TARGET_WORDS = defaultdict(list)
     for sentence in workflow:
         for target_word in sentence.target_words:
@@ -164,7 +164,7 @@
             if target_word not in RESULT_TW:
                 global_bad_target_words.append(target_word['original_text'])
                 sentence.target_words.remove(target_word)
-    progress_s.progress(
+    progress_s.progress(55)
     logs.success('Выбрали слова-пропуски!')

     for sentence in workflow:
@@ -176,7 +176,7 @@

     for sentence in workflow:
         sentence.filter_target_words(target_words_automatic_mode=tw_mode_automatic_mode)
-    progress_s.progress(
+    progress_s.progress(60)

     RESULT_TASKS = []
     for sentence in workflow:
@@ -193,9 +193,14 @@
             f'Обработали {num}/{len(RESULT_TASKS)} целевых слов!')
     logs_d.success(
         f'Обработали {len(RESULT_TASKS)}/{len(RESULT_TASKS)} целевых слов!')
+    progress_s.progress(65)
+    logs.success('Подобрали неправильные варианты!')

+    for task in RESULT_TASKS:
+        task.inflect_distractors()
     progress_s.progress(70)
-    logs.success('
+    logs.success('Просклоняли и проспрягали неправильные варианты!')
+
     for task in RESULT_TASKS:
         task.sample_distractors(num_distractors=num_distractors)
     progress_s.progress(75)
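The new progress arithmetic is easy to sanity-check: for sentence index j in 0..len(workflow)-1, int(30 + (j * (20 / len(workflow)))) sweeps the first bar from 30 toward 50, after which the second bar (progress_s) is set to 50. A worked example, assuming j enumerates the sentences in workflow (the loop header is outside the hunk):

# Stand-in for 8 parsed sentences; the bar climbs 30 -> ~50 in even steps.
workflow = list(range(8))
for j in range(len(workflow)):
    print(int(30 + (j * (20 / len(workflow)))))  # 30, 32, 35, 37, 40, 42, 45, 47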
utilities_language_bert/esp_sentence_bert.py
CHANGED
@@ -2,6 +2,7 @@ import string
 from random import random
 from random import sample
 from utilities_language_general.esp_constants import nlp
+from utilities_language_general.morphology import inflect
 from utilities_language_general.esp_constants import PHRASES
 from utilities_language_general.esp_utils import check_token_bert
 from utilities_language_general.esp_utils import fix_irregular_lemma
@@ -169,7 +170,9 @@ class TASK:
         self.distractors = None
         self.distractors_number = 0
         self.bad_target_word = False
+        self.inflected_distractors = None
         self.pos = task_data['pos']
+        self.tags = task_data['tags']
         self.lemma = task_data['lemma']
         self.gender = task_data['gender']
         self.max_num_distractors = max_num_distractors
@@ -208,20 +211,40 @@
         self.distractors = [d[0] for i, d in enumerate(distractors_sentence) if i < 15]
         self.distractors_number = len(distractors_sentence) if distractors_sentence is not None else 0

+    def inflect_distractors(self):
+        inflected_distractors = []
+        for distractor_lemma, distractor_similarity in self.distractors:
+            if distractor_lemma.count('_') > 1:
+                if distractor_lemma.startswith('haber_'):
+                    distractor_lemma = distractor_lemma.split('_')[-2]
+                    inflected = inflect(lemma=distractor_lemma, target_pos=self.pos, target_tags=self.tags)
+                else:
+                    continue
+            else:
+                inflected = inflect(lemma=distractor_lemma, target_pos=self.pos, target_tags=self.tags)
+            if inflected is not None:
+                inflected_distractors.append(inflected)
+        num_distractors = min(4, self.max_num_distractors) if self.max_num_distractors >= 4 \
+            else self.max_num_distractors
+        if len(inflected_distractors) < num_distractors:
+            self.bad_target_word = True
+        else:
+            self.inflected_distractors = inflected_distractors
+
     def sample_distractors(self, num_distractors):
         if not self.bad_target_word:
             num_distractors = min(self.distractors_number, num_distractors) if num_distractors >= 4 else num_distractors
             self.distractors = sample(self.distractors[:min(self.distractors_number, 10)], num_distractors)

     def compile_task(self, max_num_distractors):
-        len_distractors = len(self.distractors)
+        len_distractors = len(self.inflected_distractors)
         len_variants = min(len_distractors, max_num_distractors) if max_num_distractors > 4 \
             else max_num_distractors
         letters = (f'({letter})' for letter in string.ascii_lowercase[:len_variants + 1])
         try:
-            distractors = sample(self.distractors, len_variants) + [self.original_text, ]
+            distractors = sample(self.inflected_distractors, len_variants) + [self.original_text, ]
         except ValueError:
-            distractors = self.distractors + [self.original_text, ]
+            distractors = self.inflected_distractors + [self.original_text, ]
         tmp_vars = [f'{item[0]} {item[1].replace("_", " ")}'
                     for item in zip(letters, sorted(distractors, key=lambda _: random()))]
         self.variants.append((self.original_text, tmp_vars))
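The imported utilities_language_general.morphology.inflect is not shown in this commit. From its call sites in inflect_distractors, the assumed contract is: given a distractor lemma plus the target word's POS and morphological tags, return the inflected surface form, or None on failure. A hypothetical stub with that shape:

# Hypothetical stub (the real module is not in this diff). The caller only
# needs "a string or None" back and drops distractors that return None.
def inflect(lemma, target_pos, target_tags):
    if not lemma or not lemma.isalpha():
        return None   # inflection failed; inflect_distractors skips this one
    return lemma      # a real implementation would generate the inflected form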
utilities_language_general/esp_constants.py
CHANGED
@@ -23,9 +23,7 @@ def load_bert():


 nlp = load_spacy()
-
-all_model_path = r'ALL_annot_all_pos_spell_g_h_new_phrases_s300_cw10_mc100_w4_negative5-075_mean_e10_shr.bin.gz'
-lit_model_path = r'LITERATURA_annot_all_pos_spell_g_h_phrases_s300_cw10_mc50_w4_negative_5-075_mean_e20_shr.bin.gz'
+w2v_model_path = r'ALL_annot_all_pos_spell_g_h_new_phrases_s300_cw10_mc100_w4_negative5-075_mean_e10_shr.bin.gz'

 # Upload minimums
 a1_path, a1_target_set = r'lexical_minimums/A1_MINIMUM.txt', set()
utilities_language_general/esp_utils.py
CHANGED
@@ -190,23 +190,22 @@ def get_distractors_from_model_bert(model, text_with_masked_task: str, lemma: st
                                     targets=list(distractor_minimum))]
         else:
             bert_candidates = [token for token in model(text_with_masked_task, top_k=max_num_distractors + 100)]
-
+        candidates = []
         for candidate in bert_candidates:
             if isinstance(candidate, list):
                 bert_candidates = candidate
                 continue
             if candidate['token_str'].isalpha():
                 candidate_morph = nlp(candidate['token_str'])[0]
-
-                candidate['score']))
+                candidates.append((f"{candidate_morph.lemma_}_{candidate_morph.pos_}", candidate['score']))
     except KeyError:
         return None
-    for candidate_distractor in
+    for candidate_distractor in candidates:
         if '_' in candidate_distractor[0]:
-            distractor_lemma,
+            distractor_lemma, distractor_pos = candidate_distractor[0].split('_')
         else:
-
-
+            candidate_morph = nlp(candidate_distractor[0])[0]
+            distractor_lemma, distractor_pos = candidate_morph.lemma_, candidate_morph.pos_
         distractor_similarity = candidate_distractor[1]
         candidate_gender = get_tags(distractor_lemma).get('Gender')
         length_ratio = abs(len(lemma) - len(distractor_lemma))
@@ -222,10 +221,10 @@
                     / ((len(lemma) + len(distractor_lemma)) / 2) > min_edit_distance_ratio)):
                 if distractor_minimum is not None:
                     if distractor_lemma in distractor_minimum:
-                        _distractors.append((
+                        _distractors.append((distractor_lemma, candidate_distractor[1]))
                         global_distractors.add(distractor_lemma)
                 else:
-                    _distractors.append((
+                    _distractors.append((distractor_lemma, candidate_distractor[1]))
         num_distractors = min(4, max_num_distractors) if max_num_distractors >= 4 else max_num_distractors
         if len(_distractors) < num_distractors:
             return None
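Candidates are now encoded as 'lemma_POS' strings before the second loop, which is why a plain split('_') recovers both parts for single-word lemmas; multiword lemmas joined by underscores produce more than two parts, the case that inflect_distractors special-cases with count('_') > 1. A small illustration (the example values are hypothetical, not repo data):

encoded = 'casa_NOUN'                 # single-word lemma: one underscore
lemma, pos = encoded.split('_')       # ('casa', 'NOUN')

compound = 'haber_comido_VERB'        # hypothetical compound lemma
print(compound.count('_') > 1)        # True -> special-cased in inflect_distractors
print(compound.split('_')[-2])        # 'comido', the content word before the POS tag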
utilities_language_w2v/esp_main_workflow_w2v.py
CHANGED
@@ -11,9 +11,7 @@ from utilities_language_general.esp_constants import load_w2v
 from utilities_language_general.esp_utils import prepare_tasks
 from streamlit.runtime.uploaded_file_manager import UploadedFile
 import utilities_language_general.esp_constants as esp_constants
-from utilities_language_general.esp_constants import all_model_path
-from utilities_language_general.esp_constants import lit_model_path
-from utilities_language_general.esp_constants import news_model_path
+from utilities_language_general.esp_constants import w2v_model_path
 from utilities_language_general.esp_utils import prepare_target_words
 from utilities_language_general.esp_utils import compute_frequency_dict
 from utilities_language_general.esp_constants import BAD_USER_TARGET_WORDS
@@ -65,12 +63,7 @@ def main_workflow(
     MAX_FREQUENCY = 0

     # Define which model is used for distractor generation
-
-        mask_filler = load_w2v(lit_model_path)
-    elif distractor_model == 'Новости':
-        mask_filler = load_w2v(news_model_path)
-    else:
-        mask_filler = load_w2v(all_model_path)
+    mask_filler = load_w2v(w2v_model_path)

     # Get input text
     if file is not None:
@@ -136,7 +129,7 @@
     elif level == 'C2':
         target_minimum = esp_constants.c2_target_set
         distractor_minimum = esp_constants.c2_distractor_set
-    elif level == '
+    elif level == 'Без уровня':
         target_minimum = None
         distractor_minimum = None
     else: