hylee committed on
Commit
c3bae84
·
1 Parent(s): 454b944

revise math term checking

Browse files
Files changed (1) hide show
  1. handler.py +69 -21
handler.py CHANGED
@@ -361,35 +361,83 @@ def load_math_terms():
361
  math_terms = []
362
  math_terms_dict = {}
363
  for term in MATH_WORDS:
364
- math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
365
- math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
366
- # if term in MATH_PREFIXES:
367
- # math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
368
- # math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
 
 
 
369
  # else:
370
  # math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
371
  # math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
372
- logging.set_verbosity_info()
373
- logger = logging.get_logger("transformers")
374
- logger.info(f"maths terms values: {math_terms_dict.values()}")
 
375
  return math_terms, math_terms_dict
376
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  def run_math_density(transcript):
378
  math_terms, math_terms_dict = load_math_terms()
379
- for i, utt in enumerate(transcript.utterances):
380
- found_math_terms = set()
381
  text = utt.get_clean_text(remove_punct=False)
382
- logging.set_verbosity_info()
383
- logger = logging.get_logger("transformers")
384
- logger.info(f"clean text in math density: {text}")
385
- num_math_terms = 0
386
- for term in math_terms:
387
- count = len(re.findall(term, text))
388
- if count > 0:
389
- found_math_terms.add(math_terms_dict[term])
390
- num_math_terms += count
391
- utt.num_math_terms = num_math_terms
392
- utt.math_terms = list(found_math_terms)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
 
394
  class EndpointHandler():
395
  def __init__(self, path="."):
 
361
  math_terms = []
362
  math_terms_dict = {}
363
  for term in MATH_WORDS:
364
+ # math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
365
+ # math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
366
+ if term in MATH_PREFIXES:
367
+ math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
368
+ math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
369
+ else:
370
+ math_terms.append(term)
371
+ math_terms_dict[term] = term
372
  # else:
373
  # math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
374
  # math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
375
+ # logging.set_verbosity_info()
376
+ # logger = logging.get_logger("transformers")
377
+ # logger.info(f"maths terms values: {math_terms_dict.values()}")
378
+ # return math_terms, math_terms_dict
379
  return math_terms, math_terms_dict
380
 
381
+ # def run_math_density(transcript):
382
+ # math_terms, math_terms_dict = load_math_terms()
383
+ # for i, utt in enumerate(transcript.utterances):
384
+ # found_math_terms = set()
385
+ # text = utt.get_clean_text(remove_punct=False)
386
+ # logging.set_verbosity_info()
387
+ # logger = logging.get_logger("transformers")
388
+ # # logger.info(f"clean text in math density: {text}")
389
+ # num_math_terms = 0
390
+ # for term in math_terms:
391
+ # count = len(re.findall(term, text))
392
+ # if count > 0:
393
+ # found_math_terms.add(math_terms_dict[term])
394
+ # num_math_terms += count
395
+ # utt.num_math_terms = num_math_terms
396
+ # utt.math_terms = list(found_math_terms)
397
+
398
# Boundary wrappers that load_math_terms() puts around prefix-style terms.
_BOUNDARY_PREFIX = "(^|[^a-zA-Z])"
_BOUNDARY_SUFFIX = "([^a-zA-Z]|$)"


def _term_span(match, has_boundaries):
    """Return the (start, end) span of the term itself inside *match*."""
    if has_boundaries:
        # Group 1 is the leading boundary and the last group is the trailing
        # boundary; the term lies strictly between them.
        return match.end(1), match.start(match.re.groups)
    return match.span()


def run_math_density(transcript):
    """Annotate every utterance of *transcript* with the math terms it contains.

    For each utterance in ``transcript.utterances`` this sets:

    * ``utt.num_math_terms`` -- number of non-overlapping term occurrences.
    * ``utt.math_terms`` -- list of the matched terms (values of the dict
      returned by ``load_math_terms()``), each listed once, longest first.

    Patterns are tried from the longest underlying term to the shortest, and
    an occurrence is ignored when it overlaps text already claimed by an
    earlier (longer) term. Matching is case-insensitive.
    """
    math_terms, math_terms_dict = load_math_terms()

    # Rank by the length of the term, not of the regex: the boundary wrapper
    # adds a fixed number of characters that would otherwise push every
    # wrapped pattern ahead of longer plain terms.
    ordered = sorted(
        math_terms,
        key=lambda pattern: len(math_terms_dict.get(pattern, pattern)),
        reverse=True,
    )

    # Compile once per call instead of once per utterance.
    compiled = [
        (
            pattern,
            re.compile(pattern, re.IGNORECASE),
            pattern.startswith(_BOUNDARY_PREFIX)
            and pattern.endswith(_BOUNDARY_SUFFIX),
        )
        for pattern in ordered
    ]

    for utt in transcript.utterances:
        text = utt.get_clean_text(remove_punct=False)
        claimed = []  # (start, end) spans already attributed to a term
        found = []
        for pattern, regex, has_boundaries in compiled:
            spans = [_term_span(m, has_boundaries) for m in regex.finditer(text)]
            # Keep a span only if it does not intersect any claimed span.
            # Boundary characters are excluded from spans, so neighbouring
            # terms separated by a single space do not collide.
            fresh = [
                (start, end)
                for start, end in spans
                if not any(start < c_end and c_start < end
                           for c_start, c_end in claimed)
            ]
            if fresh:
                found.append(math_terms_dict[pattern])
                claimed.extend(fresh)
        utt.num_math_terms = len(claimed)
        utt.math_terms = found
422
+
423
+
424
+
425
+ # def gloss_check_vec(s):
426
+ # gloss =
427
+ # # Sort glossary terms by length in descending order
428
+ # sorted_gloss = sorted(gloss, key=len, reverse=True)
429
+
430
+ # # Create a logical vector indicating whether each term in 'gloss' is found in 's'
431
+ # gloss_found_dict = {}
432
+ # for g in sorted_gloss:
433
+ # if re.search(re.escape(g), s, re.IGNORECASE):
434
+ # gloss_found_dict[g] = True
435
+ # else:
436
+ # gloss_found_dict[g] = False
437
+
438
+ # # Return the resulting logical vector
439
+ # return gloss_found_dict
440
+
441
 
442
  class EndpointHandler():
443
  def __init__(self, path="."):