math-words/fix-word-matching
#2
by
ikarasz
- opened
- handler.py +17 -17
handler.py
CHANGED
@@ -374,20 +374,20 @@ class FocusingQuestionModel:
|
|
374 |
return output
|
375 |
|
376 |
def load_math_terms():
|
377 |
-
|
378 |
math_terms_dict = {}
|
379 |
for term in MATH_WORDS:
|
380 |
if term in MATH_PREFIXES:
|
381 |
-
math_terms_dict[
|
382 |
-
|
383 |
else:
|
384 |
-
|
385 |
-
math_terms_dict[term] = term
|
386 |
-
return
|
387 |
|
388 |
def run_math_density(transcript):
|
389 |
-
|
390 |
-
|
391 |
teacher_math_word_cloud = {}
|
392 |
student_math_word_cloud = {}
|
393 |
for i, utt in enumerate(transcript.utterances):
|
@@ -395,21 +395,21 @@ def run_math_density(transcript):
|
|
395 |
num_matches = 0
|
396 |
matched_positions = set()
|
397 |
match_list = []
|
398 |
-
for
|
399 |
-
matches = list(re.finditer(
|
400 |
# Filter out matches that share positions with longer terms
|
401 |
matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
|
402 |
# matched_text = [match.group(0) for match in matches]
|
403 |
if len(matches) > 0:
|
404 |
if utt.role == "teacher":
|
405 |
-
if math_terms_dict[
|
406 |
-
teacher_math_word_cloud[math_terms_dict[
|
407 |
-
teacher_math_word_cloud[math_terms_dict[
|
408 |
else:
|
409 |
-
if math_terms_dict[
|
410 |
-
student_math_word_cloud[math_terms_dict[
|
411 |
-
student_math_word_cloud[math_terms_dict[
|
412 |
-
match_list.append(math_terms_dict[
|
413 |
# Update matched positions
|
414 |
matched_positions.update((match.start(), match.end()) for match in matches)
|
415 |
num_matches += len(matches)
|
|
|
374 |
return output
|
375 |
|
376 |
def load_math_terms():
    """Build one word-boundary regex string per entry of MATH_WORDS.

    An entry that is also present in MATH_PREFIXES gets an optional
    ``s``/``es``/``d``/``ed`` suffix group before the closing ``\\b``;
    every other entry is wrapped in ``\\b`` markers only.

    Returns:
        A ``(math_regexes, math_terms_dict)`` tuple. ``math_regexes`` is the
        list of pattern strings in MATH_WORDS iteration order, and
        ``math_terms_dict`` maps each pattern string back to the term it was
        built from.
    """
    # NOTE(review): the term is interpolated into the pattern unescaped;
    # confirm MATH_WORDS holds no regex metacharacters (or that this is
    # intentional) before relying on literal matching.
    math_terms_dict = {}
    math_regexes = []
    for term in MATH_WORDS:
        pattern = (
            rf"\b{term}(s|es|d|ed)?\b"
            if term in MATH_PREFIXES
            else rf"\b{term}\b"
        )
        # Record the pattern in both containers so callers can iterate the
        # list and look the source term up through the dict.
        math_regexes.append(pattern)
        math_terms_dict[pattern] = term
    return math_regexes, math_terms_dict
|
387 |
|
388 |
def run_math_density(transcript):
|
389 |
+
math_regexes, math_terms_dict = load_math_terms()
|
390 |
+
sorted_regexes = sorted(math_regexes, key=len, reverse=True)
|
391 |
teacher_math_word_cloud = {}
|
392 |
student_math_word_cloud = {}
|
393 |
for i, utt in enumerate(transcript.utterances):
|
|
|
395 |
num_matches = 0
|
396 |
matched_positions = set()
|
397 |
match_list = []
|
398 |
+
for regex in sorted_regexes:
|
399 |
+
matches = list(re.finditer(regex, text, re.IGNORECASE))
|
400 |
# Filter out matches that share positions with longer terms
|
401 |
matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
|
402 |
# matched_text = [match.group(0) for match in matches]
|
403 |
if len(matches) > 0:
|
404 |
if utt.role == "teacher":
|
405 |
+
if math_terms_dict[regex] not in teacher_math_word_cloud:
|
406 |
+
teacher_math_word_cloud[math_terms_dict[regex]] = 0
|
407 |
+
teacher_math_word_cloud[math_terms_dict[regex]] += len(matches)
|
408 |
else:
|
409 |
+
if math_terms_dict[regex] not in student_math_word_cloud:
|
410 |
+
student_math_word_cloud[math_terms_dict[regex]] = 0
|
411 |
+
student_math_word_cloud[math_terms_dict[regex]] += len(matches)
|
412 |
+
match_list.append(math_terms_dict[regex])
|
413 |
# Update matched positions
|
414 |
matched_positions.update((match.start(), match.end()) for match in matches)
|
415 |
num_matches += len(matches)
|