reverting
Browse files- handler.py +20 -139
handler.py
CHANGED
@@ -3,9 +3,6 @@ from scipy.special import softmax
|
|
3 |
import numpy as np
|
4 |
import weakref
|
5 |
import re
|
6 |
-
import nltk
|
7 |
-
from nltk.corpus import stopwords
|
8 |
-
nltk.download('stopwords')
|
9 |
|
10 |
from utils import clean_str, clean_str_nopunct
|
11 |
import torch
|
@@ -13,7 +10,7 @@ from utils import MultiHeadModel, BertInputBuilder, get_num_words, MATH_PREFIXES
|
|
13 |
|
14 |
import transformers
|
15 |
from transformers import BertTokenizer, BertForSequenceClassification
|
16 |
-
|
17 |
|
18 |
transformers.logging.set_verbosity_debug()
|
19 |
|
@@ -33,15 +30,9 @@ class Utterance:
|
|
33 |
self.endtime = endtime
|
34 |
self.transcript = weakref.ref(transcript) if transcript else None
|
35 |
self.props = kwargs
|
36 |
-
self.role = None
|
37 |
-
self.word_count = self.get_num_words()
|
38 |
-
self.timestamp = [starttime, endtime]
|
39 |
-
self.unit_measure = None
|
40 |
-
self.aggregate_unit_measure = endtime
|
41 |
self.num_math_terms = None
|
42 |
self.math_terms = None
|
43 |
|
44 |
-
# moments
|
45 |
self.uptake = None
|
46 |
self.reasoning = None
|
47 |
self.question = None
|
@@ -71,20 +62,6 @@ class Utterance:
|
|
71 |
**self.props
|
72 |
}
|
73 |
|
74 |
-
def to_talk_timeline_dict(self):
    """Return this utterance as a dict for the talk-timeline payload.

    Keys are camelCase. Every flag under ``'moments'`` is the truthiness
    of the matching attribute (``reasoning``, ``question``, ``uptake``,
    ``focusing_question``), so ``None`` and ``0`` both come out as
    ``False``. All other values are passed through unchanged.
    """
    moments = {
        'reasoning': bool(self.reasoning),
        'questioning': bool(self.question),
        'uptake': bool(self.uptake),
        'focusingQuestion': bool(self.focusing_question),
    }
    return {
        'speaker': self.speaker,
        'text': self.text,
        'role': self.role,
        'timestamp': self.timestamp,
        'moments': moments,
        'unitMeasure': self.unit_measure,
        'aggregateUnitMeasure': self.aggregate_unit_measure,
        'wordCount': self.word_count,
        'numMathTerms': self.num_math_terms,
        'mathTerms': self.math_terms,
    }
|
87 |
-
|
88 |
def __repr__(self):
|
89 |
return f"Utterance(speaker='{self.speaker}'," \
|
90 |
f"text='{self.text}', uid={self.uid}," \
|
@@ -114,86 +91,6 @@ class Transcript:
|
|
114 |
def length(self):
    """Return how many items are currently in ``self.utterances``."""
    utterance_count = len(self.utterances)
    return utterance_count
|
116 |
|
117 |
-
def update_utterance_roles(self, uptake_speaker):
    """Label every utterance in ``self.utterances`` as teacher or student.

    An utterance whose ``speaker`` equals ``uptake_speaker`` gets the role
    ``'teacher'``; every other utterance gets ``'student'``. The roles are
    written in place and nothing is returned.
    """
    for utterance in self.utterances:
        is_teacher = utterance.speaker == uptake_speaker
        utterance.role = 'teacher' if is_teacher else 'student'
|
123 |
-
|
124 |
-
def get_talk_distribution_and_length(self, uptake_speaker):
    """Summarise how the talk is split between teacher and student.

    Utterances whose ``speaker`` equals ``uptake_speaker`` count as the
    teacher's; all others count as the student's. As a side effect each
    utterance's ``role`` is set to ``'teacher'`` or ``'student'``.

    Args:
        uptake_speaker: Speaker label that identifies the teacher.

    Returns:
        A pair ``(distribution, average_length)``:

        * ``distribution`` maps ``'teacher'`` / ``'student'`` to the
          rounded percentage of all words spoken by that role.
        * ``average_length`` maps the same keys to the mean number of
          words per utterance for that role.

        ``(None, None)`` is returned when ``uptake_speaker`` is None, so a
        caller that unpacks the result into two names keeps working.
        Percentages are 0 when the transcript contains no words, and an
        average is 0.0 for a role that has no utterances.
    """
    if uptake_speaker is None:
        return None, None

    words = {'teacher': 0, 'student': 0}
    turns = {'teacher': 0, 'student': 0}
    for utt in self.utterances:
        role = 'teacher' if utt.speaker == uptake_speaker else 'student'
        utt.role = role
        words[role] += utt.get_num_words()
        turns[role] += 1

    total_words = words['teacher'] + words['student']
    if total_words:
        teacher_percentage = round((words['teacher'] / total_words) * 100)
        # Derive the student share from the teacher share so the two
        # always add up to exactly 100 after rounding.
        student_percentage = 100 - teacher_percentage
    else:
        # No words at all: avoid dividing by zero and report an empty split.
        teacher_percentage = student_percentage = 0

    average_length = {
        role: (words[role] / turns[role]) if turns[role] else 0.0
        for role in ('teacher', 'student')
    }
    distribution = {'teacher': teacher_percentage,
                    'student': student_percentage}
    return distribution, average_length
|
146 |
-
|
147 |
-
def get_word_cloud_dicts(self):
    """Build the word-frequency lists used for the word clouds.

    Each utterance's cleaned text (punctuation removed) is split on single
    spaces. English stop words and empty tokens are skipped. Words are
    counted separately for utterances whose ``role`` is ``'teacher'`` and
    for all other utterances; teacher words are additionally counted when
    the utterance's ``uptake`` equals 1.

    Returns:
        A pair ``(common, uptake)`` of lists of
        ``{'text', 'value', 'category'}`` dicts, each sorted by ``value``
        in descending order and truncated to 50 entries. ``common`` mixes
        teacher and student entries; ``uptake`` holds teacher words from
        uptake utterances only.
    """
    # A set makes the per-word stop-word check O(1) instead of a list scan.
    stop_words = set(stopwords.words('english'))
    teacher_counts = {}
    student_counts = {}
    uptake_counts = {}

    for utt in self.utterances:
        is_teacher = utt.role == 'teacher'
        has_uptake = utt.uptake == 1
        for word in utt.get_clean_text(remove_punct=True).split(' '):
            # split(' ') yields '' for consecutive or edge spaces; an
            # empty string is not a word and must not reach the cloud.
            if not word or word in stop_words:
                continue
            if is_teacher:
                teacher_counts[word] = teacher_counts.get(word, 0) + 1
                if has_uptake:
                    uptake_counts[word] = uptake_counts.get(word, 0) + 1
            else:
                student_counts[word] = student_counts.get(word, 0) + 1

    def as_entries(counts, category):
        return [{'text': word, 'value': count, 'category': category}
                for word, count in counts.items()]

    def top_fifty(entries):
        # sorted() is stable, so ties keep their first-seen order.
        ranked = sorted(entries, key=lambda entry: entry['value'],
                        reverse=True)
        return ranked[:50]

    common = (as_entries(teacher_counts, 'teacher')
              + as_entries(student_counts, 'student'))
    uptake = as_entries(uptake_counts, 'teacher')
    return top_fifty(common), top_fifty(uptake)
|
183 |
-
|
184 |
-
def get_talk_timeline(self):
    """Return one talk-timeline dict per utterance.

    The list follows the order of ``self.utterances``; each element is
    whatever that utterance's ``to_talk_timeline_dict()`` returns.
    """
    timeline = []
    for utt in self.utterances:
        timeline.append(utt.to_talk_timeline_dict())
    return timeline
|
186 |
-
|
187 |
-
def calculate_aggregate_word_count(self):
    """Fill in per-utterance and running word counts where missing.

    If every utterance already has a ``unit_measure`` nothing is changed.
    If at least one ``unit_measure`` is None, all utterances are
    recomputed: ``unit_measure`` becomes the utterance's word count and
    ``aggregate_unit_measure`` becomes the running total of word counts up
    to and including that utterance.
    """
    if all(utt.unit_measure is not None for utt in self.utterances):
        return

    running_total = 0
    for utt in self.utterances:
        # Count once per utterance and reuse the value for both fields.
        num_words = utt.get_num_words()
        running_total += num_words
        utt.unit_measure = num_words
        utt.aggregate_unit_measure = running_total
|
195 |
-
|
196 |
-
|
197 |
def to_dict(self):
|
198 |
return {
|
199 |
'utterances': [utterance.to_dict() for utterance in self.utterances],
|
@@ -321,6 +218,8 @@ class UptakeModel:
|
|
321 |
return_pooler_output=False)
|
322 |
return output
|
323 |
|
|
|
|
|
324 |
class FocusingQuestionModel:
|
325 |
def __init__(self, device, tokenizer, input_builder, max_length=128, path=FOCUSING_QUESTION_MODEL):
|
326 |
print("Loading models...")
|
@@ -355,7 +254,8 @@ class FocusingQuestionModel:
|
|
355 |
output = self.model(input_ids=instance["input_ids"],
|
356 |
attention_mask=instance["attention_mask"],
|
357 |
token_type_ids=instance["token_type_ids"])
|
358 |
-
return output
|
|
|
359 |
|
360 |
def load_math_terms():
|
361 |
math_terms = []
|
@@ -365,29 +265,23 @@ def load_math_terms():
|
|
365 |
math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
|
366 |
math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
|
367 |
else:
|
368 |
-
|
369 |
-
|
370 |
return math_terms, math_terms_dict
|
371 |
|
372 |
def run_math_density(transcript):
|
373 |
math_terms, math_terms_dict = load_math_terms()
|
374 |
-
|
375 |
-
|
376 |
text = utt.get_clean_text(remove_punct=False)
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
match_list.append(math_terms_dict[term])
|
386 |
-
# Update matched positions
|
387 |
-
matched_positions.update((match.start(), match.end()) for match in matches)
|
388 |
-
num_matches += len(matches)
|
389 |
-
utt.num_math_terms = num_matches
|
390 |
-
utt.math_terms = match_list
|
391 |
|
392 |
class EndpointHandler():
|
393 |
def __init__(self, path="."):
|
@@ -419,13 +313,13 @@ class EndpointHandler():
|
|
419 |
transcript.add_utterance(Utterance(**utt))
|
420 |
|
421 |
print("Running inference on %d examples..." % transcript.length())
|
422 |
-
|
423 |
# Uptake
|
424 |
uptake_model = UptakeModel(
|
425 |
self.device, self.tokenizer, self.input_builder)
|
426 |
-
uptake_speaker = params.pop("uptake_speaker", None)
|
427 |
uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
|
428 |
uptake_speaker=uptake_speaker)
|
|
|
429 |
# Reasoning
|
430 |
reasoning_model = ReasoningModel(
|
431 |
self.device, self.tokenizer, self.input_builder)
|
@@ -443,17 +337,4 @@ class EndpointHandler():
|
|
443 |
|
444 |
run_math_density(transcript)
|
445 |
|
446 |
-
transcript.
|
447 |
-
transcript.calculate_aggregate_word_count()
|
448 |
-
return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None}
|
449 |
-
talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
|
450 |
-
return_dict['talkDistribution'] = talk_dist
|
451 |
-
return_dict['talkLength'] = talk_len
|
452 |
-
talk_moments = transcript.get_talk_timeline()
|
453 |
-
return_dict['talkMoments'] = talk_moments
|
454 |
-
word_cloud, uptake_word_cloud = transcript.get_word_cloud_dicts()
|
455 |
-
return_dict['commonTopWords'] = word_cloud
|
456 |
-
return_dict['uptakeTopwords'] = uptake_word_cloud
|
457 |
-
|
458 |
-
|
459 |
-
return return_dict
|
|
|
3 |
import numpy as np
|
4 |
import weakref
|
5 |
import re
|
|
|
|
|
|
|
6 |
|
7 |
from utils import clean_str, clean_str_nopunct
|
8 |
import torch
|
|
|
10 |
|
11 |
import transformers
|
12 |
from transformers import BertTokenizer, BertForSequenceClassification
|
13 |
+
|
14 |
|
15 |
transformers.logging.set_verbosity_debug()
|
16 |
|
|
|
30 |
self.endtime = endtime
|
31 |
self.transcript = weakref.ref(transcript) if transcript else None
|
32 |
self.props = kwargs
|
|
|
|
|
|
|
|
|
|
|
33 |
self.num_math_terms = None
|
34 |
self.math_terms = None
|
35 |
|
|
|
36 |
self.uptake = None
|
37 |
self.reasoning = None
|
38 |
self.question = None
|
|
|
62 |
**self.props
|
63 |
}
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
def __repr__(self):
|
66 |
return f"Utterance(speaker='{self.speaker}'," \
|
67 |
f"text='{self.text}', uid={self.uid}," \
|
|
|
91 |
def length(self):
    """Return how many items are currently in ``self.utterances``."""
    utterance_count = len(self.utterances)
    return utterance_count
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
def to_dict(self):
|
95 |
return {
|
96 |
'utterances': [utterance.to_dict() for utterance in self.utterances],
|
|
|
218 |
return_pooler_output=False)
|
219 |
return output
|
220 |
|
221 |
+
|
222 |
+
|
223 |
class FocusingQuestionModel:
|
224 |
def __init__(self, device, tokenizer, input_builder, max_length=128, path=FOCUSING_QUESTION_MODEL):
|
225 |
print("Loading models...")
|
|
|
254 |
output = self.model(input_ids=instance["input_ids"],
|
255 |
attention_mask=instance["attention_mask"],
|
256 |
token_type_ids=instance["token_type_ids"])
|
257 |
+
return output
|
258 |
+
|
259 |
|
260 |
def load_math_terms():
|
261 |
math_terms = []
|
|
|
265 |
math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
|
266 |
math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
|
267 |
else:
|
268 |
+
math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
|
269 |
+
math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
|
270 |
return math_terms, math_terms_dict
|
271 |
|
272 |
def run_math_density(transcript):
    """Annotate every utterance of *transcript* with its math-term usage.

    For each utterance the cleaned text (punctuation kept) is searched
    with every pattern returned by ``load_math_terms()``. Two attributes
    are then set on the utterance:

    * ``num_math_terms``: total number of pattern matches found.
    * ``math_terms``: the distinct matched terms, in the order their
      patterns are listed, so the output is the same on every run.

    Nothing is returned; the utterances are updated in place.
    """
    math_terms, math_terms_dict = load_math_terms()
    # Compile each pattern once up front instead of handing the pattern
    # string to the re module again for every utterance.
    compiled_terms = [(pattern, re.compile(pattern)) for pattern in math_terms]

    for utt in transcript.utterances:
        text = utt.get_clean_text(remove_punct=False)
        found_math_terms = []
        num_math_terms = 0
        for pattern, regex in compiled_terms:
            # NOTE(review): the visible patterns consume the non-letter
            # character on each side of a term, so back-to-back
            # occurrences separated by one character ("sum sum") appear
            # to count once. Confirm against load_math_terms whether
            # that is intended.
            count = len(regex.findall(text))
            if count > 0:
                term = math_terms_dict[pattern]
                if term not in found_math_terms:
                    found_math_terms.append(term)
                num_math_terms += count
        utt.num_math_terms = num_math_terms
        utt.math_terms = found_math_terms
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
|
286 |
class EndpointHandler():
|
287 |
def __init__(self, path="."):
|
|
|
313 |
transcript.add_utterance(Utterance(**utt))
|
314 |
|
315 |
print("Running inference on %d examples..." % transcript.length())
|
316 |
+
uptake_speaker = params.pop("uptake_speaker", None)
|
317 |
# Uptake
|
318 |
uptake_model = UptakeModel(
|
319 |
self.device, self.tokenizer, self.input_builder)
|
|
|
320 |
uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
|
321 |
uptake_speaker=uptake_speaker)
|
322 |
+
|
323 |
# Reasoning
|
324 |
reasoning_model = ReasoningModel(
|
325 |
self.device, self.tokenizer, self.input_builder)
|
|
|
337 |
|
338 |
run_math_density(transcript)
|
339 |
|
340 |
+
return transcript.to_dict()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|