hylee
committed on
Commit
·
c3bae84
1
Parent(s):
454b944
revise math term checking
Browse files- handler.py +69 -21
handler.py
CHANGED
@@ -361,35 +361,83 @@ def load_math_terms():
|
|
361 |
math_terms = []
|
362 |
math_terms_dict = {}
|
363 |
for term in MATH_WORDS:
|
364 |
-
math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
|
365 |
-
math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
|
366 |
-
|
367 |
-
|
368 |
-
|
|
|
|
|
|
|
369 |
# else:
|
370 |
# math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
|
371 |
# math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
|
372 |
-
|
373 |
-
|
374 |
-
|
|
|
375 |
return math_terms, math_terms_dict
|
376 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
377 |
def run_math_density(transcript):
|
378 |
math_terms, math_terms_dict = load_math_terms()
|
379 |
-
|
380 |
-
|
381 |
text = utt.get_clean_text(remove_punct=False)
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
393 |
|
394 |
class EndpointHandler():
|
395 |
def __init__(self, path="."):
|
|
|
361 |
math_terms = []
|
362 |
math_terms_dict = {}
|
363 |
for term in MATH_WORDS:
|
364 |
+
# math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
|
365 |
+
# math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
|
366 |
+
if term in MATH_PREFIXES:
|
367 |
+
math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
|
368 |
+
math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
|
369 |
+
else:
|
370 |
+
math_terms.append(term)
|
371 |
+
math_terms_dict[term] = term
|
372 |
# else:
|
373 |
# math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
|
374 |
# math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
|
375 |
+
# logging.set_verbosity_info()
|
376 |
+
# logger = logging.get_logger("transformers")
|
377 |
+
# logger.info(f"maths terms values: {math_terms_dict.values()}")
|
378 |
+
# return math_terms, math_terms_dict
|
379 |
return math_terms, math_terms_dict
|
380 |
|
381 |
+
# def run_math_density(transcript):
|
382 |
+
# math_terms, math_terms_dict = load_math_terms()
|
383 |
+
# for i, utt in enumerate(transcript.utterances):
|
384 |
+
# found_math_terms = set()
|
385 |
+
# text = utt.get_clean_text(remove_punct=False)
|
386 |
+
# logging.set_verbosity_info()
|
387 |
+
# logger = logging.get_logger("transformers")
|
388 |
+
# # logger.info(f"clean text in math density: {text}")
|
389 |
+
# num_math_terms = 0
|
390 |
+
# for term in math_terms:
|
391 |
+
# count = len(re.findall(term, text))
|
392 |
+
# if count > 0:
|
393 |
+
# found_math_terms.add(math_terms_dict[term])
|
394 |
+
# num_math_terms += count
|
395 |
+
# utt.num_math_terms = num_math_terms
|
396 |
+
# utt.math_terms = list(found_math_terms)
|
397 |
+
|
398 |
def run_math_density(transcript):
    """Count math-term occurrences in every utterance of ``transcript``.

    The regex patterns come from ``load_math_terms()``.  They are tried in
    order of decreasing term length so that a longer term claims its text
    before a shorter term that starts inside the same span.  Matching is
    case-insensitive.

    Args:
        transcript: object exposing ``utterances``; each utterance must
            provide ``get_clean_text(remove_punct=False)``.

    Returns:
        None.  Every utterance is updated in place: ``num_math_terms``
        receives the number of accepted matches and ``math_terms`` the list
        of terms that produced at least one accepted match.
    """
    math_terms, math_terms_dict = load_math_terms()

    # Rank by the length of the underlying term, not of the regex string:
    # some patterns are wrapped in boundary groups, which would otherwise
    # push them ahead of every plain term regardless of the term's length.
    ordered_patterns = sorted(
        math_terms,
        key=lambda pattern: len(math_terms_dict[pattern]),
        reverse=True,
    )
    # Compile once up front instead of re-resolving each pattern per utterance.
    compiled_terms = [
        (re.compile(pattern, re.IGNORECASE), math_terms_dict[pattern])
        for pattern in ordered_patterns
    ]

    for utt in transcript.utterances:
        text = utt.get_clean_text(remove_punct=False)
        claimed_spans = []   # (start, end) of matches already accepted
        found_terms = []     # terms with at least one accepted match
        num_matches = 0

        for regex, term in compiled_terms:
            # Drop a match whose start falls inside a span claimed by a
            # previously processed (longer) term.
            # NOTE(review): boundary-wrapped patterns also consume the
            # neighbouring non-letter character, so their span is one
            # character wider on each side than the term itself; confirm
            # this does not suppress directly adjacent terms.
            accepted = [
                (found.start(), found.end())
                for found in regex.finditer(text)
                if not any(lo <= found.start() < hi for lo, hi in claimed_spans)
            ]
            if not accepted:
                continue
            found_terms.append(term)
            claimed_spans.extend(accepted)
            num_matches += len(accepted)

        utt.num_math_terms = num_matches
        utt.math_terms = found_terms
|
422 |
+
|
423 |
+
|
424 |
+
|
425 |
+
# def gloss_check_vec(s):
|
426 |
+
# gloss =
|
427 |
+
# # Sort glossary terms by length in descending order
|
428 |
+
# sorted_gloss = sorted(gloss, key=len, reverse=True)
|
429 |
+
|
430 |
+
# # Create a logical vector indicating whether each term in 'gloss' is found in 's'
|
431 |
+
# gloss_found_dict = {}
|
432 |
+
# for g in sorted_gloss:
|
433 |
+
# if re.search(re.escape(g), s, re.IGNORECASE):
|
434 |
+
# gloss_found_dict[g] = True
|
435 |
+
# else:
|
436 |
+
# gloss_found_dict[g] = False
|
437 |
+
|
438 |
+
# # Return the resulting logical vector
|
439 |
+
# return gloss_found_dict
|
440 |
+
|
441 |
|
442 |
class EndpointHandler():
|
443 |
def __init__(self, path="."):
|