add math word cloud data
handler.py CHANGED  (+16, -9)
@@ -145,13 +145,11 @@ class Transcript:
         avg_student_length = student_words / student_utt_count
         return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
 
-    def
+    def get_uptake_and_speaker_word_cloud_dicts(self):
         teacher_dict = {}
         student_dict = {}
         uptake_teacher_dict = {}
         stop_words = stopwords.words('english')
-        # stopwords = nltk.corpus.stopwords.word('english')
-        # print("stopwords: ", stopwords)
         for utt in self.utterances:
             words = (utt.get_clean_text(remove_punct=True)).split(' ')
             for word in words:
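The hunk above only includes the first few lines of the renamed get_uptake_and_speaker_word_cloud_dicts; the rest of the method body sits outside the diff context. For orientation, below is a minimal sketch of the kind of per-speaker, stopword-filtered counting those dictionaries point at. It is an assumption-based illustration, not the repository's implementation: the helper name, the top_n cap, and the tokenization details are invented, and it presumes the NLTK stopwords corpus has been downloaded (nltk.download('stopwords')).

from collections import Counter
from nltk.corpus import stopwords

def speaker_word_counts(utterances, teacher_speaker, top_n=50):
    # Hypothetical helper: count non-stopword tokens separately for the
    # teacher (speaker == teacher_speaker) and for students, and return
    # word-cloud style entries sorted by frequency.
    stop_words = set(stopwords.words('english'))
    teacher_counts, student_counts = Counter(), Counter()
    for utt in utterances:
        # Assumes the same utterance interface used in handler.py:
        # .speaker and .get_clean_text(remove_punct=True)
        words = utt.get_clean_text(remove_punct=True).split(' ')
        target = teacher_counts if utt.speaker == teacher_speaker else student_counts
        for word in words:
            if word and word.lower() not in stop_words:
                target[word.lower()] += 1
    teacher_cloud = [{'text': w, 'value': c} for w, c in teacher_counts.most_common(top_n)]
    student_cloud = [{'text': w, 'value': c} for w, c in student_counts.most_common(top_n)]
    return teacher_cloud, student_cloud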
@@ -370,9 +368,10 @@ def load_math_terms():
         math_terms_dict[term] = term
     return math_terms, math_terms_dict
 
-def run_math_density(transcript):
+def run_math_density(transcript, uptake_speaker=None):
     math_terms, math_terms_dict = load_math_terms()
     sorted_terms = sorted(math_terms, key=len, reverse=True)
+    math_word_cloud = {}
     for i, utt in enumerate(transcript.utterances):
         text = utt.get_clean_text(remove_punct=False)
         num_matches = 0
@@ -383,12 +382,21 @@ def run_math_density(transcript):
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
             if len(matches) > 0:
+                if math_terms_dict[term] not in math_word_cloud:
+                    math_word_cloud[math_terms_dict[term]] = (0, 'teacher' if utt.speaker == uptake_speaker else 'student')
+                math_word_cloud[math_terms_dict[term]][0] += len(matches)
                 match_list.append(math_terms_dict[term])
             # Update matched positions
             matched_positions.update((match.start(), match.end()) for match in matches)
             num_matches += len(matches)
         utt.num_math_terms = num_matches
         utt.math_terms = match_list
+    dict_list = []
+    for word in math_word_cloud.keys():
+        dict_list.append(
+            {'text': word, 'value': math_word_cloud[word][0], 'category': math_word_cloud[word][1]})
+    sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
+    return sorted_dict_list[:50]
 
 class EndpointHandler():
     def __init__(self, path="."):
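run_math_density now accumulates a math_word_cloud keyed by the canonical term from math_terms_dict, tags each term with 'teacher' or 'student' based on the speaker of the utterance where it first appears, and returns at most the 50 highest-count entries as {'text', 'value', 'category'} dicts. The sketch below restates that counting idea in a self-contained form using collections.Counter and a simple word-boundary regex; it deliberately skips the load_math_terms()/matched_positions overlap filtering, and the term list and function name are illustrative assumptions. It keeps the counts in a Counter, a mutable container, so the in-place increment is always well defined.

import re
from collections import Counter

# Illustrative term list; in handler.py the terms come from load_math_terms().
MATH_TERMS = ["fraction", "numerator", "denominator", "angle", "sum"]

def math_term_cloud(utterances, teacher_speaker, top_n=50):
    # Count occurrences of each math term across all utterances and tag the
    # term with the role of the speaker who first used it, mirroring the
    # 'teacher'/'student' category field produced by run_math_density.
    counts = Counter()
    category = {}
    for utt in utterances:
        text = utt.get_clean_text(remove_punct=False).lower()
        for term in sorted(MATH_TERMS, key=len, reverse=True):
            matches = re.findall(rf"\b{re.escape(term)}\b", text)
            if matches:
                counts[term] += len(matches)
                category.setdefault(
                    term, 'teacher' if utt.speaker == teacher_speaker else 'student')
    # Keep the top_n highest-count terms, highest first.
    return [{'text': t, 'value': c, 'category': category[t]}
            for t, c in counts.most_common(top_n)]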
@@ -442,19 +450,18 @@ class EndpointHandler():
                                            self.device, self.tokenizer, self.input_builder)
         focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
 
-        run_math_density(transcript)
-
+        math_cloud = run_math_density(transcript, uptake_speaker=uptake_speaker)
         transcript.update_utterance_roles(uptake_speaker)
         transcript.calculate_aggregate_word_count()
-        return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None}
+        return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None, 'mathTopWords': None}
         talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
         return_dict['talkDistribution'] = talk_dist
         return_dict['talkLength'] = talk_len
         talk_moments = transcript.get_talk_timeline()
         return_dict['talkMoments'] = talk_moments
-        word_cloud, uptake_word_cloud = transcript.
+        word_cloud, uptake_word_cloud = transcript.get_uptake_and_speaker_word_cloud_dicts(math_cloud_dict)
         return_dict['commonTopWords'] = word_cloud
         return_dict['uptakeTopWords'] = uptake_word_cloud
-
+        return_dict['mathTopWords'] = math_cloud
 
         return return_dict
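End to end, the handler now computes math_cloud before the word-cloud step and exposes it as mathTopWords in the response. As an illustration only (the values are made up; the keys follow the return_dict built above), a caller might receive something shaped like this:

# Illustrative response shape after this commit (all values are made up):
response = {
    'talkDistribution': {'teacher': 62.5, 'student': 37.5},   # percentages
    'talkLength': {'teacher': 14.2, 'student': 6.8},          # avg words per utterance
    'talkMoments': [...],       # timeline entries from get_talk_timeline()
    'commonTopWords': [...],    # speaker word-cloud entries
    'uptakeTopWords': [...],    # uptake word-cloud entries
    'mathTopWords': [           # new: up to 50 entries from run_math_density()
        {'text': 'fraction', 'value': 12, 'category': 'teacher'},
        {'text': 'denominator', 'value': 7, 'category': 'student'},
    ],
}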