hylee committed · f8d71c4
Parent: 8d1633a

add new output features

handler.py CHANGED (+126 -36)
@@ -9,7 +9,6 @@ from utils import MultiHeadModel, BertInputBuilder, get_num_words
 
 import transformers
 from transformers import BertTokenizer, BertForSequenceClassification
-import psutil
 from transformers.utils import logging
 from edu_toolkit import language_analysis
 
@@ -30,7 +29,13 @@ class Utterance:
         self.endtime = endtime
         self.transcript = weakref.ref(transcript) if transcript else None
         self.props = kwargs
+        self.role = None
+        self.word_count = self.get_num_words(text)
+        self.timestamp = [starttime, endtime]
+        self.unit_measure = endtime - starttime
+        self.aggregate_unit_measure = endtime
 
+        # moments
         self.uptake = None
         self.reasoning = None
         self.question = None
@@ -56,6 +61,18 @@ class Utterance:
             **self.props
         }
 
+    def to_talk_timeline_dict(self):
+        return {
+            'speaker': self.speaker,
+            'text': self.text,
+            'role': self.role,
+            'timestamp': self.timestamp,
+            'moments': {'reasoning': self.reasoning, 'questioning': self.question, 'uptake': self.uptake},
+            'unitMeasure': self.unit_measure,
+            'aggregateUnitMeasure': self.aggregate_unit_measure,
+            'wordCount': self.word_count
+        }
+
     def __repr__(self):
         return f"Utterance(speaker='{self.speaker}'," \
             f"text='{self.text}', uid={self.uid}," \
@@ -85,6 +102,56 @@ class Transcript:
     def length(self):
         return len(self.utterances)
 
+    def update_utterance_roles(self, uptake_speaker):
+        for utt in self.utterances:
+            if utt.speaker == uptake_speaker:
+                utt.role = 'teacher'
+            else:
+                utt.role = 'student'
+
+    def get_talk_distribution_and_length(self, uptake_speaker):
+        if uptake_speaker is None:
+            return None
+        teacher_words = 0
+        student_words = 0
+        for utt in self.utterances:
+            if utt.speaker == uptake_speaker:
+                utt.role = 'teacher'
+                teacher_words += utt.get_num_words()
+            else:
+                utt.role = 'student'
+                student_words += utt.get_num_words()
+        teacher_percentage = round(
+            (teacher_words / (teacher_words + student_words)) * 100)
+        student_percentage = 100 - teacher_percentage
+        return {'talk_distribution': {'teacher': teacher_percentage, 'student': student_percentage}}, {'talk_length': {'teacher': teacher_words, 'student': student_words}}
+
+    def get_word_cloud_dicts(self):
+        teacher_dict = {}
+        student_dict = {}
+        for utt in self.utterances:
+            words = utt.get_clean_text(remove_punct=True).split(' ')
+            for word in words:
+                if utt.role == 'teacher':
+                    if word not in teacher_dict:
+                        teacher_dict[word] = 0
+                    teacher_dict[word] += 1
+                else:
+                    if word not in student_dict:
+                        student_dict[word] = 0
+                    student_dict[word] += 1
+        dict_list = []
+        for word in teacher_dict.keys():
+            dict_list.append(
+                {'text': word, 'value': teacher_dict[word], 'category': 'teacher'})
+        for word in student_dict.keys():
+            dict_list.append(
+                {'text': word, 'value': student_dict[word], 'category': 'student'})
+        return dict_list
+
+    def get_talk_timeline(self):
+        return [utterance.to_talk_timeline_dict() for utterance in self.utterances]
+
     def to_dict(self):
         return {
             'utterances': [utterance.to_dict() for utterance in self.utterances],
@@ -243,27 +310,30 @@ class EndpointHandler():
             transcript.add_utterance(Utterance(**utt))
 
         print("Running inference on %d examples..." % transcript.length())
-        cpu_percent = psutil.cpu_percent()
+        # cpu_percent = psutil.cpu_percent()
         logging.set_verbosity_info()
-        logger = logging.get_logger("transformers")
-        logger.info(f"CPU Usage before models loaded: {cpu_percent}%")
-        mem_info = psutil.virtual_memory()
-        used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
-        total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
-        logger.info(
-            f"Used Memory before models loaded: {used_mem:.2f} GB, Total RAM: {total_mem:.2f} GB")
+        # logger = logging.get_logger("transformers")
+        # logger.info(f"CPU Usage before models loaded: {cpu_percent}%")
+        # mem_info = psutil.virtual_memory()
+        # used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
+        # total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
+        # logger.info(
+        #     f"Used Memory before models loaded: {used_mem:.2f} GB, Total RAM: {total_mem:.2f} GB")
+
         # Uptake
         uptake_model = UptakeModel(
             self.device, self.tokenizer, self.input_builder)
+        uptake_speaker = params.pop("uptake_speaker", None)
         uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
-                                   uptake_speaker=params.pop("uptake_speaker", None))
-        cpu_percent = psutil.cpu_percent()
-        mem_info = psutil.virtual_memory()
-        used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
-        total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
-        logger.info(
-            f"Used Memory after model 1 loaded: {used_mem:.2f} GB, Total Mem: {total_mem:.2f} GB")
-        logger.info(f"CPU Usage after model 1 loaded: {cpu_percent}%")
+                                   uptake_speaker=uptake_speaker)
+
+        # cpu_percent = psutil.cpu_percent()
+        # mem_info = psutil.virtual_memory()
+        # used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
+        # total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
+        # logger.info(
+        #     f"Used Memory after model 1 loaded: {used_mem:.2f} GB, Total Mem: {total_mem:.2f} GB")
+        # logger.info(f"CPU Usage after model 1 loaded: {cpu_percent}%")
         # del uptake_model
         # cpu_percent = psutil.cpu_percent()
         # mem_info = psutil.virtual_memory()
@@ -275,16 +345,16 @@ class EndpointHandler():
         reasoning_model = ReasoningModel(
             self.device, self.tokenizer, self.input_builder)
         reasoning_model.run_inference(transcript)
-        cpu_percent = psutil.cpu_percent()
-        mem_info = psutil.virtual_memory()
-        used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
-        total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
-        logger.info(
-            f"Used Memory after model 2 loaded: {used_mem:.2f} GB, Total Mem: {total_mem:.2f} GB")
-        logger.info(f"CPU Usage after model 2 loaded: {cpu_percent}%")
-        # print(f"CPU Usage after model 2 loaded: {cpu_percent}%")
-        # del reasoning_model
-        cpu_percent = psutil.cpu_percent()
+        # cpu_percent = psutil.cpu_percent()
+        # mem_info = psutil.virtual_memory()
+        # used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
+        # total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
+        # logger.info(
+        #     f"Used Memory after model 2 loaded: {used_mem:.2f} GB, Total Mem: {total_mem:.2f} GB")
+        # logger.info(f"CPU Usage after model 2 loaded: {cpu_percent}%")
+        # # print(f"CPU Usage after model 2 loaded: {cpu_percent}%")
+        # # del reasoning_model
+        # cpu_percent = psutil.cpu_percent()
         # mem_info = psutil.virtual_memory()
         # used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
         # total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
@@ -295,20 +365,40 @@ class EndpointHandler():
         question_model = QuestionModel(
             self.device, self.tokenizer, self.input_builder)
         question_model.run_inference(transcript)
-        cpu_percent = psutil.cpu_percent()
-        logger.info(f"CPU Usage after model 3 loaded: {cpu_percent}%")
-        mem_info = psutil.virtual_memory()
-        used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
-        total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
-        logger.info(
-            f"Used Memory after model 3 loaded: {used_mem:.2f} GB, Total Mem: {total_mem:.2f} GB")
+        # cpu_percent = psutil.cpu_percent()
+        # logger.info(f"CPU Usage after model 3 loaded: {cpu_percent}%")
+        # mem_info = psutil.virtual_memory()
+        # used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
+        # total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
+        # logger.info(
+        #     f"Used Memory after model 3 loaded: {used_mem:.2f} GB, Total Mem: {total_mem:.2f} GB")
         # print(f"CPU Usage after model 3 loaded: {cpu_percent}%")
         # del question_model
-        cpu_percent = psutil.cpu_percent()
+        # cpu_percent = psutil.cpu_percent()
         # logger.info(f"CPU Usage after model 3 deleted: {cpu_percent}%")
         # mem_info = psutil.virtual_memory()
         # used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
         # total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
         # logger.info(f"Used Memory after model 3 deleted: {used_mem:.2f} GB, Total Mem: {total_mem:.2f} GB")
         # print(f"CPU Usage after model 3 deleted: {cpu_percent}%")
-        return transcript.to_dict()
+        transcript.update_utterance_roles(uptake_speaker)
+        talk_dist, talk_len = transcript.get_talk_distribution_and_length(
+            uptake_speaker)
+        talk_timeline = transcript.get_talk_timeline()
+        word_cloud = transcript.get_word_cloud_dicts()
+
+        return transcript.to_dict(), talk_dist, talk_len, talk_timeline, word_cloud
+
+
+# {
+#     "inputs": [
+#         {"uid": "1", "speaker": "Alice", "text": "How much is the fish?" },
+#         {"uid": "2", "speaker": "Bob", "text": "I do not know about the fish. Because you put a long side and it’s a long side. What do you think." },
+#         {"uid": "3", "speaker": "Alice", "text": "OK, thank you Bob." }
+#     ],
+#     "parameters": {
+#         "uptake_min_num_words": 5,
+#         "uptake_speaker": "Bob",
+#         "filename": "sample.csv"
+#     }
+# }
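For reference, here is a minimal sketch of how a client might consume the new five-part return value, assuming the standard Hugging Face custom-handler convention (an EndpointHandler constructed once, then called with an {"inputs": ..., "parameters": ...} payload, as in the commented sample above). The constructor argument and the call signature are assumptions for illustration; this diff does not show them.

    from handler import EndpointHandler

    # Hypothetical model path; the constructor is not shown in this diff.
    handler = EndpointHandler(path=".")

    payload = {
        "inputs": [
            {"uid": "1", "speaker": "Alice", "text": "How much is the fish?"},
            {"uid": "2", "speaker": "Bob", "text": "I do not know about the fish. Because you put a long side and it’s a long side. What do you think."},
            {"uid": "3", "speaker": "Alice", "text": "OK, thank you Bob."}
        ],
        "parameters": {
            "uptake_min_num_words": 5,
            "uptake_speaker": "Bob",
            "filename": "sample.csv"
        }
    }

    # After this commit the handler returns five values, so callers must
    # unpack all of them.
    transcript_dict, talk_dist, talk_len, talk_timeline, word_cloud = handler(payload)

    print(talk_dist)         # {'talk_distribution': {'teacher': ..., 'student': ...}}
    print(talk_len)          # {'talk_length': {'teacher': ..., 'student': ...}}
    print(talk_timeline[0])  # per-utterance dict with 'moments', 'unitMeasure', 'wordCount', ...
    print(word_cloud[:3])    # [{'text': ..., 'value': ..., 'category': 'teacher' or 'student'}, ...]

Note that get_talk_distribution_and_length returns None when uptake_speaker is missing, so the tuple unpacking above assumes the payload supplies uptake_speaker.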