hylee committed
Commit 4a18cf3 · 1 Parent(s): c2f7754

update with del

Files changed (1):
  1. handler.py  +42 -27
handler.py CHANGED
@@ -36,8 +36,11 @@ class Utterance:
         self.role = None
         self.word_count = self.get_num_words()
         self.timestamp = [starttime, endtime]
-        self.unit_measure = None
-        self.aggregate_unit_measure = endtime
+        if starttime is not None and endtime is not None:
+            self.unit_measure = endtime - starttime
+        else:
+            self.unit_measure = None
+        self.aggregate_unit_measure = endtime
         self.num_math_terms = None
         self.math_terms = None
 
@@ -75,6 +78,7 @@ class Utterance:
         return{
             'speaker': self.speaker,
             'text': self.text,
+            'uid': self.uid,
             'role': self.role,
             'timestamp': self.timestamp,
             'moments': {'reasoning': True if self.reasoning else False, 'questioning': True if self.question else False, 'uptake': True if self.uptake else False, 'focusingQuestion': True if self.focusing_question else False},
@@ -137,20 +141,21 @@ class Transcript:
                 utt.role = 'student'
                 student_words += utt.get_num_words()
                 student_utt_count += 1
-        teacher_percentage = round(
-            (teacher_words / (teacher_words + student_words)) * 100)
-        student_percentage = 100 - teacher_percentage
-        avg_teacher_length = teacher_words / teacher_utt_count
-        avg_student_length = student_words / student_utt_count
+        if teacher_words + student_words > 0:
+            teacher_percentage = round(
+                (teacher_words / (teacher_words + student_words)) * 100)
+            student_percentage = 100 - teacher_percentage
+        else:
+            teacher_percentage = student_percentage = 0
+        avg_teacher_length = teacher_words / teacher_utt_count if teacher_utt_count > 0 else 0
+        avg_student_length = student_words / student_utt_count if student_utt_count > 0 else 0
         return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
 
-    def get_word_cloud_dicts(self):
+    def get_word_clouds(self):
         teacher_dict = {}
         student_dict = {}
         uptake_teacher_dict = {}
         stop_words = stopwords.words('english')
-        # stopwords = nltk.corpus.stopwords.word('english')
-        # print("stopwords: ", stopwords)
         for utt in self.utterances:
             words = (utt.get_clean_text(remove_punct=True)).split(' ')
             for word in words:
@@ -227,7 +232,7 @@ class QuestionModel:
                                                                max_length=self.max_length,
                                                                input_str=True)
                     output = self.get_prediction(instance)
-                    print(output)
+                    # print(output)
                     utt.question = np.argmax(
                         output["is_question_logits"][0].tolist())
 
@@ -255,11 +260,11 @@ class ReasoningModel:
         self.model = BertForSequenceClassification.from_pretrained(path)
         self.model.to(self.device)
 
-    def run_inference(self, transcript, min_num_words=8):
+    def run_inference(self, transcript, min_num_words=8, uptake_speaker=None):
         self.model.eval()
         with torch.no_grad():
             for i, utt in enumerate(transcript.utterances):
-                if utt.get_num_words() >= min_num_words:
+                if utt.get_num_words() >= min_num_words and utt.speaker != uptake_speaker:
                     instance = self.input_builder.build_inputs([], utt.text,
                                                                max_length=self.max_length,
                                                                input_str=True)
@@ -372,6 +377,7 @@ def load_math_terms():
 def run_math_density(transcript):
     math_terms, math_terms_dict = load_math_terms()
     sorted_terms = sorted(math_terms, key=len, reverse=True)
+    math_word_cloud = {}
     for i, utt in enumerate(transcript.utterances):
         text = utt.get_clean_text(remove_punct=False)
         num_matches = 0
@@ -382,12 +388,21 @@ def run_math_density(transcript):
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
             if len(matches) > 0:
+                if math_terms_dict[term] not in math_word_cloud:
+                    math_word_cloud[math_terms_dict[term]] = 0
+                math_word_cloud[math_terms_dict[term]] += len(matches)
                 match_list.append(math_terms_dict[term])
                 # Update matched positions
                 matched_positions.update((match.start(), match.end()) for match in matches)
                 num_matches += len(matches)
         utt.num_math_terms = num_matches
         utt.math_terms = match_list
+    dict_list = []
+    for word in math_word_cloud.keys():
+        dict_list.append(
+            {'text': word, 'value': math_word_cloud[word]})
+    sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
+    return sorted_dict_list[:50]
 
 class EndpointHandler():
     def __init__(self, path="."):
@@ -410,10 +425,6 @@ class EndpointHandler():
         utterances = data.pop("inputs", data)
         params = data.pop("parameters", None)
 
-        print("EXAMPLES")
-        for utt in utterances[:3]:
-            print("speaker %s: %s" % (utt["speaker"], utt["text"]))
-
         transcript = Transcript(filename=params.pop("filename", None))
         for utt in utterances:
             transcript.add_utterance(Utterance(**utt))
@@ -426,34 +437,38 @@ class EndpointHandler():
         uptake_speaker = params.pop("uptake_speaker", None)
         uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
                                    uptake_speaker=uptake_speaker)
+        del uptake_model
+
         # Reasoning
         reasoning_model = ReasoningModel(
             self.device, self.tokenizer, self.input_builder)
-        reasoning_model.run_inference(transcript)
-
+        reasoning_model.run_inference(transcript, uptake_speaker=uptake_speaker)
+        del reasoning_model
+
         # Question
         question_model = QuestionModel(
             self.device, self.tokenizer, self.input_builder)
         question_model.run_inference(transcript)
-
+        del question_model
+
         # Focusing Question
         focusing_question_model = FocusingQuestionModel(
             self.device, self.tokenizer, self.input_builder)
         focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
-
-        run_math_density(transcript)
-
+        del focusing_question_model
+
+        math_cloud = run_math_density(transcript)
         transcript.update_utterance_roles(uptake_speaker)
         transcript.calculate_aggregate_word_count()
-        return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None}
+        return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None, 'mathTopWords': None}
         talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
         return_dict['talkDistribution'] = talk_dist
         return_dict['talkLength'] = talk_len
         talk_moments = transcript.get_talk_timeline()
         return_dict['talkMoments'] = talk_moments
-        word_cloud, uptake_word_cloud = transcript.get_word_cloud_dicts()
+        word_cloud, uptake_word_cloud = transcript.get_word_clouds()
         return_dict['commonTopWords'] = word_cloud
-        return_dict['uptakeTopwords'] = uptake_word_cloud
-
+        return_dict['uptakeTopWords'] = uptake_word_cloud
+        return_dict['mathTopWords'] = math_cloud
 
         return return_dict
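Note: the `del` calls added in this commit drop the last reference to each model as soon as its pass over the transcript finishes, so the weights can be reclaimed before the next model is constructed. A minimal sketch of the same pattern in isolation; the helper name and the explicit `gc.collect()` / `torch.cuda.empty_cache()` calls are illustrative assumptions, not part of this commit:

import gc

import torch


def run_stage_and_release(model_cls, device, tokenizer, input_builder, transcript, **kwargs):
    # Hypothetical helper: build one model, run its inference pass, then release it
    # before the next stage allocates its own weights.
    model = model_cls(device, tokenizer, input_builder)
    model.run_inference(transcript, **kwargs)
    del model                      # drop the last reference so the weights can be freed
    gc.collect()                   # optional: collect immediately instead of waiting
    if torch.cuda.is_available():
        torch.cuda.empty_cache()   # optional: hand cached GPU memory back to the driver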
 
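For context, a hedged sketch of how the updated handler might be exercised locally. The payload keys (`inputs`, `parameters`, `filename`, `uptake_speaker`, `uptake_min_num_words`) and the new `uptakeTopWords` / `mathTopWords` result keys come from the diff above; the `__call__` entry point and the exact utterance fields are assumptions based on the usual custom-handler convention:

from handler import EndpointHandler

handler = EndpointHandler(path=".")

payload = {
    "inputs": [
        # Utterance fields inferred from the diff (speaker, text, uid, starttime, endtime).
        {"speaker": "teacher", "text": "What is three times four?", "uid": "u1",
         "starttime": 0.0, "endtime": 3.2},
        {"speaker": "student", "text": "Twelve.", "uid": "u2",
         "starttime": 3.2, "endtime": 4.0},
    ],
    "parameters": {
        "filename": "demo_transcript.json",
        "uptake_speaker": "teacher",
        "uptake_min_num_words": 5,
    },
}

result = handler(payload)
# After this commit the result carries an extra math word cloud alongside the other keys:
# talkDistribution, talkLength, talkMoments, commonTopWords, uptakeTopWords, mathTopWords
print(result["mathTopWords"][:5])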