hylee committed on
Commit f8d71c4 · 1 Parent(s): 8d1633a

add new output features

Files changed (1)
  1. handler.py +126 -36
handler.py CHANGED
@@ -9,7 +9,6 @@ from utils import MultiHeadModel, BertInputBuilder, get_num_words
 
 import transformers
 from transformers import BertTokenizer, BertForSequenceClassification
-import psutil
 from transformers.utils import logging
 from edu_toolkit import language_analysis
 
@@ -30,7 +29,13 @@ class Utterance:
         self.endtime = endtime
         self.transcript = weakref.ref(transcript) if transcript else None
         self.props = kwargs
+        self.role = None
+        self.word_count = get_num_words(text)
+        self.timestamp = [starttime, endtime]
+        self.unit_measure = endtime - starttime
+        self.aggregate_unit_measure = endtime
 
+        # moments
         self.uptake = None
         self.reasoning = None
         self.question = None
@@ -56,6 +61,18 @@ class Utterance:
             **self.props
         }
 
+    def to_talk_timeline_dict(self):
+        return {
+            'speaker': self.speaker,
+            'text': self.text,
+            'role': self.role,
+            'timestamp': self.timestamp,
+            'moments': {'reasoning': self.reasoning, 'questioning': self.question, 'uptake': self.uptake},
+            'unitMeasure': self.unit_measure,
+            'aggregateUnitMeasure': self.aggregate_unit_measure,
+            'wordCount': self.word_count
+        }
+
     def __repr__(self):
         return f"Utterance(speaker='{self.speaker}'," \
             f"text='{self.text}', uid={self.uid}," \
@@ -85,6 +102,56 @@ class Transcript:
     def length(self):
         return len(self.utterances)
 
+    def update_utterance_roles(self, uptake_speaker):
+        for utt in self.utterances:
+            if utt.speaker == uptake_speaker:
+                utt.role = 'teacher'
+            else:
+                utt.role = 'student'
+
+    def get_talk_distribution_and_length(self, uptake_speaker):
+        if uptake_speaker is None:
+            return None
+        teacher_words = 0
+        student_words = 0
+        for utt in self.utterances:
+            if utt.speaker == uptake_speaker:
+                utt.role = 'teacher'
+                teacher_words += utt.get_num_words()
+            else:
+                utt.role = 'student'
+                student_words += utt.get_num_words()
+        teacher_percentage = round(
+            (teacher_words / (teacher_words + student_words)) * 100)
+        student_percentage = 100 - teacher_percentage
+        return {'talk_distribution': {'teacher': teacher_percentage, 'student': student_percentage}}, {'talk_length': {'teacher': teacher_words, 'student': student_words}}
+
+    def get_word_cloud_dicts(self):
+        teacher_dict = {}
+        student_dict = {}
+        for utt in self.utterances:
+            words = utt.get_clean_text(remove_punct=True).split(' ')
+            for word in words:
+                if utt.role == 'teacher':
+                    if word not in teacher_dict:
+                        teacher_dict[word] = 0
+                    teacher_dict[word] += 1
+                else:
+                    if word not in student_dict:
+                        student_dict[word] = 0
+                    student_dict[word] += 1
+        dict_list = []
+        for word in teacher_dict.keys():
+            dict_list.append(
+                {'text': word, 'value': teacher_dict[word], 'category': 'teacher'})
+        for word in student_dict.keys():
+            dict_list.append(
+                {'text': word, 'value': student_dict[word], 'category': 'student'})
+        return dict_list
+
+    def get_talk_timeline(self):
+        return [utterance.to_talk_timeline_dict() for utterance in self.utterances]
+
     def to_dict(self):
         return {
             'utterances': [utterance.to_dict() for utterance in self.utterances],
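Reviewer note: the talk statistics hinge on get_talk_distribution_and_length, which splits word counts by whether the speaker matches uptake_speaker. A self-contained sketch of that logic on a toy transcript; the Utt class and free function below are hypothetical stand-ins mirroring only the fields the new methods touch:

# Hypothetical stand-ins; only the fields used by the new methods are modeled.
class Utt:
    def __init__(self, speaker, text):
        self.speaker, self.text, self.role = speaker, text, None

    def get_num_words(self):
        return len(self.text.split())  # stand-in for utils.get_num_words

def talk_distribution_and_length(utterances, uptake_speaker):
    teacher_words = student_words = 0
    for utt in utterances:
        if utt.speaker == uptake_speaker:
            utt.role = 'teacher'
            teacher_words += utt.get_num_words()
        else:
            utt.role = 'student'
            student_words += utt.get_num_words()
    teacher_pct = round(teacher_words / (teacher_words + student_words) * 100)
    return ({'talk_distribution': {'teacher': teacher_pct, 'student': 100 - teacher_pct}},
            {'talk_length': {'teacher': teacher_words, 'student': student_words}})

utts = [Utt('Alice', 'How much is the fish?'),
        Utt('Bob', 'I do not know about the fish.')]
dist, length = talk_distribution_and_length(utts, uptake_speaker='Bob')
print(dist)    # {'talk_distribution': {'teacher': 58, 'student': 42}}
print(length)  # {'talk_length': {'teacher': 7, 'student': 5}}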
@@ -243,27 +310,30 @@ class EndpointHandler():
             transcript.add_utterance(Utterance(**utt))
 
         print("Running inference on %d examples..." % transcript.length())
-        cpu_percent = psutil.cpu_percent()
+        # cpu_percent = psutil.cpu_percent()
         logging.set_verbosity_info()
-        logger = logging.get_logger("transformers")
-        logger.info(f"CPU Usage before models loaded: {cpu_percent}%")
-        mem_info = psutil.virtual_memory()
-        used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
-        total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
-        logger.info(
-            f"Used Memory before models loaded: {used_mem:.2f} GB, Total RAM: {total_mem:.2f} GB")
+        # logger = logging.get_logger("transformers")
+        # logger.info(f"CPU Usage before models loaded: {cpu_percent}%")
+        # mem_info = psutil.virtual_memory()
+        # used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
+        # total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
+        # logger.info(
+        #     f"Used Memory before models loaded: {used_mem:.2f} GB, Total RAM: {total_mem:.2f} GB")
+
         # Uptake
         uptake_model = UptakeModel(
             self.device, self.tokenizer, self.input_builder)
+        uptake_speaker = params.pop("uptake_speaker", None)
         uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
-                                   uptake_speaker=params.pop("uptake_speaker", None))
-        cpu_percent = psutil.cpu_percent()
-        mem_info = psutil.virtual_memory()
-        used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
-        total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
-        logger.info(
-            f"Used Memory after model 1 loaded: {used_mem:.2f} GB, Total Mem: {total_mem:.2f} GB")
-        logger.info(f"CPU Usage after model 1 loaded: {cpu_percent}%")
+                                   uptake_speaker=uptake_speaker)
+
+        # cpu_percent = psutil.cpu_percent()
+        # mem_info = psutil.virtual_memory()
+        # used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
+        # total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
+        # logger.info(
+        #     f"Used Memory after model 1 loaded: {used_mem:.2f} GB, Total Mem: {total_mem:.2f} GB")
+        # logger.info(f"CPU Usage after model 1 loaded: {cpu_percent}%")
         # del uptake_model
         # cpu_percent = psutil.cpu_percent()
         # mem_info = psutil.virtual_memory()
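Reviewer note on the run_inference change: dict.pop removes the key, so the old inline params.pop("uptake_speaker", None) consumed the value at the call site. Hoisting it into a local keeps it available for the role assignment and talk statistics at the end of __call__. A minimal illustration of the Python semantics involved:

params = {'uptake_speaker': 'Bob', 'uptake_min_num_words': 5}
speaker = params.pop('uptake_speaker', None)  # 'Bob'; the key is now gone
print(params.pop('uptake_speaker', None))     # None: a second pop cannot see it
print(speaker)                                # 'Bob' remains available for later use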
@@ -275,16 +345,16 @@ class EndpointHandler():
         reasoning_model = ReasoningModel(
             self.device, self.tokenizer, self.input_builder)
         reasoning_model.run_inference(transcript)
-        cpu_percent = psutil.cpu_percent()
-        mem_info = psutil.virtual_memory()
-        used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
-        total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
-        logger.info(
-            f"Used Memory after model 2 loaded: {used_mem:.2f} GB, Total Mem: {total_mem:.2f} GB")
-        logger.info(f"CPU Usage after model 2 loaded: {cpu_percent}%")
-        # print(f"CPU Usage after model 2 loaded: {cpu_percent}%")
-        # del reasoning_model
-        cpu_percent = psutil.cpu_percent()
+        # cpu_percent = psutil.cpu_percent()
+        # mem_info = psutil.virtual_memory()
+        # used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
+        # total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
+        # logger.info(
+        #     f"Used Memory after model 2 loaded: {used_mem:.2f} GB, Total Mem: {total_mem:.2f} GB")
+        # logger.info(f"CPU Usage after model 2 loaded: {cpu_percent}%")
+        # # print(f"CPU Usage after model 2 loaded: {cpu_percent}%")
+        # # del reasoning_model
+        # cpu_percent = psutil.cpu_percent()
         # mem_info = psutil.virtual_memory()
         # used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
         # total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
@@ -295,20 +365,40 @@ class EndpointHandler():
         question_model = QuestionModel(
             self.device, self.tokenizer, self.input_builder)
         question_model.run_inference(transcript)
-        cpu_percent = psutil.cpu_percent()
-        logger.info(f"CPU Usage after model 3 loaded: {cpu_percent}%")
-        mem_info = psutil.virtual_memory()
-        used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
-        total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
-        logger.info(
-            f"Used Memory after model 3 loaded: {used_mem:.2f} GB, Total Mem: {total_mem:.2f} GB")
+        # cpu_percent = psutil.cpu_percent()
+        # logger.info(f"CPU Usage after model 3 loaded: {cpu_percent}%")
+        # mem_info = psutil.virtual_memory()
+        # used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
+        # total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
+        # logger.info(
+        #     f"Used Memory after model 3 loaded: {used_mem:.2f} GB, Total Mem: {total_mem:.2f} GB")
         # print(f"CPU Usage after model 3 loaded: {cpu_percent}%")
         # del question_model
-        cpu_percent = psutil.cpu_percent()
+        # cpu_percent = psutil.cpu_percent()
         # logger.info(f"CPU Usage after model 3 deleted: {cpu_percent}%")
         # mem_info = psutil.virtual_memory()
         # used_mem = mem_info.used / (1024 ** 3)  # Convert to gigabytes
         # total_mem = mem_info.total / (1024 ** 3)  # Convert to gigabytes
         # logger.info(f"Used Memory after model 3 deleted: {used_mem:.2f} GB, Total Mem: {total_mem:.2f} GB")
         # print(f"CPU Usage after model 3 deleted: {cpu_percent}%")
-        return transcript.to_dict()
+        transcript.update_utterance_roles(uptake_speaker)
+        talk_dist, talk_len = transcript.get_talk_distribution_and_length(
+            uptake_speaker)
+        talk_timeline = transcript.get_talk_timeline()
+        word_cloud = transcript.get_word_cloud_dicts()
+
+        return transcript.to_dict(), talk_dist, talk_len, talk_timeline, word_cloud
+
+
+    # {
+    #     "inputs": [
+    #         {"uid": "1", "speaker": "Alice", "text": "How much is the fish?"},
+    #         {"uid": "2", "speaker": "Bob", "text": "I do not know about the fish. Because you put a long side and it’s a long side. What do you think."},
+    #         {"uid": "3", "speaker": "Alice", "text": "OK, thank you Bob."}
+    #     ],
+    #     "parameters": {
+    #         "uptake_min_num_words": 5,
+    #         "uptake_speaker": "Bob",
+    #         "filename": "sample.csv"
+    #     }
+    # }