math-words/fix-word-matching

#2
Files changed (1)
  1. handler.py +17 -17
handler.py CHANGED
@@ -374,20 +374,20 @@ class FocusingQuestionModel:
374
  return output
375
 
376
  def load_math_terms():
377
- math_terms = []
378
  math_terms_dict = {}
379
  for term in MATH_WORDS:
380
  if term in MATH_PREFIXES:
381
- math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es|d|ed)?([^a-zA-Z]|$)"] = term
382
- math_terms.append(f"(^|[^a-zA-Z]){term}(s|es|d|ed)?([^a-zA-Z]|$)")
383
  else:
384
- math_terms.append(term)
385
- math_terms_dict[term] = term
386
- return math_terms, math_terms_dict
387
 
388
  def run_math_density(transcript):
389
- math_terms, math_terms_dict = load_math_terms()
390
- sorted_terms = sorted(math_terms, key=len, reverse=True)
391
  teacher_math_word_cloud = {}
392
  student_math_word_cloud = {}
393
  for i, utt in enumerate(transcript.utterances):
@@ -395,21 +395,21 @@ def run_math_density(transcript):
395
  num_matches = 0
396
  matched_positions = set()
397
  match_list = []
398
- for term in sorted_terms:
399
- matches = list(re.finditer(term, text, re.IGNORECASE))
400
  # Filter out matches that share positions with longer terms
401
  matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
402
  # matched_text = [match.group(0) for match in matches]
403
  if len(matches) > 0:
404
  if utt.role == "teacher":
405
- if math_terms_dict[term] not in teacher_math_word_cloud:
406
- teacher_math_word_cloud[math_terms_dict[term]] = 0
407
- teacher_math_word_cloud[math_terms_dict[term]] += len(matches)
408
  else:
409
- if math_terms_dict[term] not in student_math_word_cloud:
410
- student_math_word_cloud[math_terms_dict[term]] = 0
411
- student_math_word_cloud[math_terms_dict[term]] += len(matches)
412
- match_list.append(math_terms_dict[term])
413
  # Update matched positions
414
  matched_positions.update((match.start(), match.end()) for match in matches)
415
  num_matches += len(matches)
 
374
  return output
375
 
def load_math_terms():
    r"""Build the regex patterns used to match math vocabulary.

    Each entry of ``MATH_WORDS`` is wrapped in ``\b`` word boundaries.
    Entries that are also in ``MATH_PREFIXES`` additionally accept an
    optional ``s`` / ``es`` / ``d`` / ``ed`` suffix before the closing
    boundary.

    Returns:
        A ``(patterns, pattern_to_term)`` tuple. ``patterns`` is a list with
        one pattern string per entry of ``MATH_WORDS``, in iteration order.
        ``pattern_to_term`` maps each pattern string back to the term it was
        built from.
    """
    patterns = []
    pattern_to_term = {}
    for word in MATH_WORDS:
        # Only prefix-style terms tolerate an inflection suffix.
        suffix = "(s|es|d|ed)?" if word in MATH_PREFIXES else ""
        # NOTE(review): the term is interpolated unescaped, so any regex
        # metacharacters it contains stay active - confirm this is intended.
        pattern = rf"\b{word}{suffix}\b"
        patterns.append(pattern)
        pattern_to_term[pattern] = word
    return patterns, pattern_to_term
387
 
388
  def run_math_density(transcript):
389
+ math_regexes, math_terms_dict = load_math_terms()
390
+ sorted_regexes = sorted(math_regexes, key=len, reverse=True)
391
  teacher_math_word_cloud = {}
392
  student_math_word_cloud = {}
393
  for i, utt in enumerate(transcript.utterances):
 
395
  num_matches = 0
396
  matched_positions = set()
397
  match_list = []
398
+ for regex in sorted_regexes:
399
+ matches = list(re.finditer(regex, text, re.IGNORECASE))
400
  # Filter out matches that share positions with longer terms
401
  matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
402
  # matched_text = [match.group(0) for match in matches]
403
  if len(matches) > 0:
404
  if utt.role == "teacher":
405
+ if math_terms_dict[regex] not in teacher_math_word_cloud:
406
+ teacher_math_word_cloud[math_terms_dict[regex]] = 0
407
+ teacher_math_word_cloud[math_terms_dict[regex]] += len(matches)
408
  else:
409
+ if math_terms_dict[regex] not in student_math_word_cloud:
410
+ student_math_word_cloud[math_terms_dict[regex]] = 0
411
+ student_math_word_cloud[math_terms_dict[regex]] += len(matches)
412
+ match_list.append(math_terms_dict[regex])
413
  # Update matched positions
414
  matched_positions.update((match.start(), match.end()) for match in matches)
415
  num_matches += len(matches)