hylee committed on
Commit
c2f7754
·
1 Parent(s): c3bae84
Files changed (1) hide show
  1. handler.py +0 -50
handler.py CHANGED
@@ -361,40 +361,14 @@ def load_math_terms():
361
  math_terms = []
362
  math_terms_dict = {}
363
  for term in MATH_WORDS:
364
- # math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
365
- # math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
366
  if term in MATH_PREFIXES:
367
  math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
368
  math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
369
  else:
370
  math_terms.append(term)
371
  math_terms_dict[term] = term
372
- # else:
373
- # math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
374
- # math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
375
- # logging.set_verbosity_info()
376
- # logger = logging.get_logger("transformers")
377
- # logger.info(f"maths terms values: {math_terms_dict.values()}")
378
- # return math_terms, math_terms_dict
379
  return math_terms, math_terms_dict
380
 
381
- # def run_math_density(transcript):
382
- # math_terms, math_terms_dict = load_math_terms()
383
- # for i, utt in enumerate(transcript.utterances):
384
- # found_math_terms = set()
385
- # text = utt.get_clean_text(remove_punct=False)
386
- # logging.set_verbosity_info()
387
- # logger = logging.get_logger("transformers")
388
- # # logger.info(f"clean text in math density: {text}")
389
- # num_math_terms = 0
390
- # for term in math_terms:
391
- # count = len(re.findall(term, text))
392
- # if count > 0:
393
- # found_math_terms.add(math_terms_dict[term])
394
- # num_math_terms += count
395
- # utt.num_math_terms = num_math_terms
396
- # utt.math_terms = list(found_math_terms)
397
-
398
  def run_math_density(transcript):
399
  math_terms, math_terms_dict = load_math_terms()
400
  sorted_terms = sorted(math_terms, key=len, reverse=True)
@@ -404,41 +378,17 @@ def run_math_density(transcript):
404
  matched_positions = set()
405
  match_list = []
406
  for term in sorted_terms:
407
- # Use re.finditer to find all non-overlapping match objects
408
  matches = list(re.finditer(term, text, re.IGNORECASE))
409
- # count = len(re.findall(term, input_string))
410
- # print('term: ', term)
411
- # print("count with findall: ", count)
412
  # Filter out matches that share positions with longer terms
413
  matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
414
  if len(matches) > 0:
415
  match_list.append(math_terms_dict[term])
416
  # Update matched positions
417
  matched_positions.update((match.start(), match.end()) for match in matches)
418
- # Count the number of matches
419
  num_matches += len(matches)
420
  utt.num_math_terms = num_matches
421
  utt.math_terms = match_list
422
 
423
-
424
-
425
- # def gloss_check_vec(s):
426
- # gloss =
427
- # # Sort glossary terms by length in descending order
428
- # sorted_gloss = sorted(gloss, key=len, reverse=True)
429
-
430
- # # Create a logical vector indicating whether each term in 'gloss' is found in 's'
431
- # gloss_found_dict = {}
432
- # for g in sorted_gloss:
433
- # if re.search(re.escape(g), s, re.IGNORECASE):
434
- # gloss_found_dict[g] = True
435
- # else:
436
- # gloss_found_dict[g] = False
437
-
438
- # # Return the resulting logical vector
439
- # return gloss_found_dict
440
-
441
-
442
  class EndpointHandler():
443
  def __init__(self, path="."):
444
  print("Loading models...")
 
361
  math_terms = []
362
  math_terms_dict = {}
363
  for term in MATH_WORDS:
 
 
364
  if term in MATH_PREFIXES:
365
  math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
366
  math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
367
  else:
368
  math_terms.append(term)
369
  math_terms_dict[term] = term
 
 
 
 
 
 
 
370
  return math_terms, math_terms_dict
371
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
  def run_math_density(transcript):
373
  math_terms, math_terms_dict = load_math_terms()
374
  sorted_terms = sorted(math_terms, key=len, reverse=True)
 
378
  matched_positions = set()
379
  match_list = []
380
  for term in sorted_terms:
 
381
  matches = list(re.finditer(term, text, re.IGNORECASE))
 
 
 
382
  # Filter out matches that share positions with longer terms
383
  matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
384
  if len(matches) > 0:
385
  match_list.append(math_terms_dict[term])
386
  # Update matched positions
387
  matched_positions.update((match.start(), match.end()) for match in matches)
 
388
  num_matches += len(matches)
389
  utt.num_math_terms = num_matches
390
  utt.math_terms = match_list
391
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
  class EndpointHandler():
393
  def __init__(self, path="."):
394
  print("Loading models...")