hylee719 committed on
Commit
ce5d993
·
1 Parent(s): 8967406

add math word cloud data

Browse files
Files changed (1) hide show
  1. handler.py +16 -9
handler.py CHANGED
@@ -145,13 +145,11 @@ class Transcript:
145
  avg_student_length = student_words / student_utt_count
146
  return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
147
 
148
- def get_word_cloud_dicts(self):
149
  teacher_dict = {}
150
  student_dict = {}
151
  uptake_teacher_dict = {}
152
  stop_words = stopwords.words('english')
153
- # stopwords = nltk.corpus.stopwords.word('english')
154
- # print("stopwords: ", stopwords)
155
  for utt in self.utterances:
156
  words = (utt.get_clean_text(remove_punct=True)).split(' ')
157
  for word in words:
@@ -370,9 +368,10 @@ def load_math_terms():
370
  math_terms_dict[term] = term
371
  return math_terms, math_terms_dict
372
 
373
- def run_math_density(transcript):
374
  math_terms, math_terms_dict = load_math_terms()
375
  sorted_terms = sorted(math_terms, key=len, reverse=True)
 
376
  for i, utt in enumerate(transcript.utterances):
377
  text = utt.get_clean_text(remove_punct=False)
378
  num_matches = 0
@@ -383,12 +382,21 @@ def run_math_density(transcript):
383
  # Filter out matches that share positions with longer terms
384
  matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
385
  if len(matches) > 0:
 
 
 
386
  match_list.append(math_terms_dict[term])
387
  # Update matched positions
388
  matched_positions.update((match.start(), match.end()) for match in matches)
389
  num_matches += len(matches)
390
  utt.num_math_terms = num_matches
391
  utt.math_terms = match_list
 
 
 
 
 
 
392
 
393
  class EndpointHandler():
394
  def __init__(self, path="."):
@@ -442,19 +450,18 @@ class EndpointHandler():
442
  self.device, self.tokenizer, self.input_builder)
443
  focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
444
 
445
- run_math_density(transcript)
446
-
447
  transcript.update_utterance_roles(uptake_speaker)
448
  transcript.calculate_aggregate_word_count()
449
- return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None}
450
  talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
451
  return_dict['talkDistribution'] = talk_dist
452
  return_dict['talkLength'] = talk_len
453
  talk_moments = transcript.get_talk_timeline()
454
  return_dict['talkMoments'] = talk_moments
455
- word_cloud, uptake_word_cloud = transcript.get_word_cloud_dicts()
456
  return_dict['commonTopWords'] = word_cloud
457
  return_dict['uptakeTopWords'] = uptake_word_cloud
458
-
459
 
460
  return return_dict
 
145
  avg_student_length = student_words / student_utt_count
146
  return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
147
 
148
+ def get_uptake_and_speaker_word_cloud_dicts(self):
149
  teacher_dict = {}
150
  student_dict = {}
151
  uptake_teacher_dict = {}
152
  stop_words = stopwords.words('english')
 
 
153
  for utt in self.utterances:
154
  words = (utt.get_clean_text(remove_punct=True)).split(' ')
155
  for word in words:
 
368
  math_terms_dict[term] = term
369
  return math_terms, math_terms_dict
370
 
371
+ def run_math_density(transcript, uptake_speaker=None):
372
  math_terms, math_terms_dict = load_math_terms()
373
  sorted_terms = sorted(math_terms, key=len, reverse=True)
374
+ math_word_cloud = {}
375
  for i, utt in enumerate(transcript.utterances):
376
  text = utt.get_clean_text(remove_punct=False)
377
  num_matches = 0
 
382
  # Filter out matches that share positions with longer terms
383
  matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
384
  if len(matches) > 0:
385
+ if math_terms_dict[term] not in math_word_cloud:
386
+ math_word_cloud[math_terms_dict[term]] = (0, 'teacher' if utt.speaker == uptake_speaker else 'student')
387
+ math_word_cloud[math_terms_dict[term]][0] += len(matches)
388
  match_list.append(math_terms_dict[term])
389
  # Update matched positions
390
  matched_positions.update((match.start(), match.end()) for match in matches)
391
  num_matches += len(matches)
392
  utt.num_math_terms = num_matches
393
  utt.math_terms = match_list
394
+ dict_list = []
395
+ for word in math_word_cloud.keys():
396
+ dict_list.append(
397
+ {'text': word, 'value': math_word_cloud[word][0], 'category': math_word_cloud[word][1]})
398
+ sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
399
+ return sorted_dict_list[:50]
400
 
401
  class EndpointHandler():
402
  def __init__(self, path="."):
 
450
  self.device, self.tokenizer, self.input_builder)
451
  focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
452
 
453
+ math_cloud = run_math_density(transcript, uptake_speaker=uptake_speaker)
 
454
  transcript.update_utterance_roles(uptake_speaker)
455
  transcript.calculate_aggregate_word_count()
456
+ return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None, 'mathTopWords': None}
457
  talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
458
  return_dict['talkDistribution'] = talk_dist
459
  return_dict['talkLength'] = talk_len
460
  talk_moments = transcript.get_talk_timeline()
461
  return_dict['talkMoments'] = talk_moments
462
+ word_cloud, uptake_word_cloud = transcript.get_uptake_and_speaker_word_cloud_dicts(math_cloud_dict)
463
  return_dict['commonTopWords'] = word_cloud
464
  return_dict['uptakeTopWords'] = uptake_word_cloud
465
+ return_dict['mathTopWords'] = math_cloud
466
 
467
  return return_dict