hylee719
/

transcript-analysis-testing

Model card Files and versions Community

hylee719 committed on Nov 29, 2023

Commit

ce5d993

1 Parent(s): 8967406

add math word cloud data

Browse files

Files changed (1) hide show

handler.py +16 -9

handler.py CHANGED Viewed

@@ -145,13 +145,11 @@ class Transcript:
         avg_student_length = student_words / student_utt_count
         return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
-    def get_word_cloud_dicts(self):
         teacher_dict = {}
         student_dict = {}
         uptake_teacher_dict = {}
         stop_words = stopwords.words('english')
-        # stopwords = nltk.corpus.stopwords.word('english')
-        # print("stopwords: ", stopwords)
         for utt in self.utterances:
             words = (utt.get_clean_text(remove_punct=True)).split(' ')
             for word in words:
@@ -370,9 +368,10 @@ def load_math_terms():
             math_terms_dict[term] = term
     return math_terms, math_terms_dict
-def run_math_density(transcript):
     math_terms, math_terms_dict = load_math_terms()
     sorted_terms = sorted(math_terms, key=len, reverse=True)
     for i, utt in enumerate(transcript.utterances):
         text = utt.get_clean_text(remove_punct=False)
         num_matches = 0
@@ -383,12 +382,21 @@ def run_math_density(transcript):
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
             if len(matches) > 0:
                 match_list.append(math_terms_dict[term])
             # Update matched positions
             matched_positions.update((match.start(), match.end()) for match in matches)
             num_matches += len(matches)
         utt.num_math_terms = num_matches
         utt.math_terms = match_list
 class EndpointHandler():
     def __init__(self, path="."):
@@ -442,19 +450,18 @@ class EndpointHandler():
             self.device, self.tokenizer, self.input_builder)
         focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
-        run_math_density(transcript)
         transcript.update_utterance_roles(uptake_speaker)
         transcript.calculate_aggregate_word_count()
-        return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None}
         talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
         return_dict['talkDistribution'] = talk_dist
         return_dict['talkLength'] = talk_len
         talk_moments = transcript.get_talk_timeline()
         return_dict['talkMoments'] = talk_moments
-        word_cloud, uptake_word_cloud = transcript.get_word_cloud_dicts()
         return_dict['commonTopWords'] = word_cloud
         return_dict['uptakeTopWords'] = uptake_word_cloud
         return return_dict

         avg_student_length = student_words / student_utt_count
         return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
+    def get_uptake_and_speaker_word_cloud_dicts(self):
         teacher_dict = {}
         student_dict = {}
         uptake_teacher_dict = {}
         stop_words = stopwords.words('english')
         for utt in self.utterances:
             words = (utt.get_clean_text(remove_punct=True)).split(' ')
             for word in words:
             math_terms_dict[term] = term
     return math_terms, math_terms_dict
+def run_math_density(transcript, uptake_speaker=None):
     math_terms, math_terms_dict = load_math_terms()
     sorted_terms = sorted(math_terms, key=len, reverse=True)
+    math_word_cloud = {}
     for i, utt in enumerate(transcript.utterances):
         text = utt.get_clean_text(remove_punct=False)
         num_matches = 0
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
             if len(matches) > 0:
+                if math_terms_dict[term] not in math_word_cloud:
+                    math_word_cloud[math_terms_dict[term]] = (0, 'teacher' if utt.speaker == uptake_speaker else 'student')
+                math_word_cloud[math_terms_dict[term]][0] += len(matches)
                 match_list.append(math_terms_dict[term])
             # Update matched positions
             matched_positions.update((match.start(), match.end()) for match in matches)
             num_matches += len(matches)
         utt.num_math_terms = num_matches
         utt.math_terms = match_list
+        dict_list = []
+        for word in math_word_cloud.keys():
+            dict_list.append(
+                {'text': word, 'value': math_word_cloud[word][0], 'category': math_word_cloud[word][1]})
+        sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
+        return sorted_dict_list[:50]
 class EndpointHandler():
     def __init__(self, path="."):
             self.device, self.tokenizer, self.input_builder)
         focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
+        math_cloud = run_math_density(transcript, uptake_speaker=uptake_speaker)
         transcript.update_utterance_roles(uptake_speaker)
         transcript.calculate_aggregate_word_count()
+        return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None, 'mathTopWords': None}
         talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
         return_dict['talkDistribution'] = talk_dist
         return_dict['talkLength'] = talk_len
         talk_moments = transcript.get_talk_timeline()
         return_dict['talkMoments'] = talk_moments
+        word_cloud, uptake_word_cloud = transcript.get_uptake_and_speaker_word_cloud_dicts(math_cloud_dict)
         return_dict['commonTopWords'] = word_cloud
         return_dict['uptakeTopWords'] = uptake_word_cloud
+        return_dict['mathTopWords'] = math_cloud
         return return_dict