hylee719
/

transcript-analysis-testing

Model card Files Files and versions

xet

Community

hylee committed on Nov 16, 2023

Commit

c2f7754

1 Parent(s): c3bae84

clean up

Browse files

Files changed (1) hide show

handler.py +0 -50

handler.py CHANGED Viewed

@@ -361,40 +361,14 @@ def load_math_terms():
     math_terms = []
     math_terms_dict = {}
     for term in MATH_WORDS:
-        # math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
-        # math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
         if term in MATH_PREFIXES:
             math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
             math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
         else:
             math_terms.append(term)
             math_terms_dict[term] = term
-        # else:
-        #     math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
-        #     math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
-    #     logging.set_verbosity_info()
-    #     logger = logging.get_logger("transformers")
-    #     logger.info(f"maths terms values: {math_terms_dict.values()}")
-    # return math_terms, math_terms_dict
     return math_terms, math_terms_dict
-# def run_math_density(transcript):
-#     math_terms, math_terms_dict = load_math_terms()
-#     for i, utt in enumerate(transcript.utterances):
-#         found_math_terms = set()
-#         text = utt.get_clean_text(remove_punct=False)
-#         logging.set_verbosity_info()
-#         logger = logging.get_logger("transformers")
-#         # logger.info(f"clean text in math density: {text}")
-#         num_math_terms = 0
-#         for term in math_terms:
-#             count = len(re.findall(term, text))
-#             if count > 0:
-#                 found_math_terms.add(math_terms_dict[term])
-#             num_math_terms += count
-#         utt.num_math_terms = num_math_terms
-#         utt.math_terms = list(found_math_terms)
 def run_math_density(transcript):
     math_terms, math_terms_dict = load_math_terms()
     sorted_terms = sorted(math_terms, key=len, reverse=True)
@@ -404,41 +378,17 @@ def run_math_density(transcript):
         matched_positions = set()
         match_list = []
         for term in sorted_terms:
-            # Use re.finditer to find all non-overlapping match objects
             matches = list(re.finditer(term, text, re.IGNORECASE))
-            # count = len(re.findall(term, input_string))
-            # print('term: ', term)
-            # print("count with findall: ", count)
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
             if len(matches) > 0:
                 match_list.append(math_terms_dict[term])
             # Update matched positions
             matched_positions.update((match.start(), match.end()) for match in matches)
-            # Count the number of matches
             num_matches += len(matches)
         utt.num_math_terms = num_matches
         utt.math_terms = match_list
-# def gloss_check_vec(s):
-#     gloss =
-#     # Sort glossary terms by length in descending order
-#     sorted_gloss = sorted(gloss, key=len, reverse=True)
-#     # Create a logical vector indicating whether each term in 'gloss' is found in 's'
-#     gloss_found_dict = {}
-#     for g in sorted_gloss:
-#         if re.search(re.escape(g), s, re.IGNORECASE):
-#             gloss_found_dict[g] = True
-#         else:
-#             gloss_found_dict[g] = False
-#     # Return the resulting logical vector
-#     return gloss_found_dict
 class EndpointHandler():
     def __init__(self, path="."):
         print("Loading models...")

     math_terms = []
     math_terms_dict = {}
     for term in MATH_WORDS:
         if term in MATH_PREFIXES:
             math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
             math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
         else:
             math_terms.append(term)
             math_terms_dict[term] = term
     return math_terms, math_terms_dict
 def run_math_density(transcript):
     math_terms, math_terms_dict = load_math_terms()
     sorted_terms = sorted(math_terms, key=len, reverse=True)
         matched_positions = set()
         match_list = []
         for term in sorted_terms:
             matches = list(re.finditer(term, text, re.IGNORECASE))
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
             if len(matches) > 0:
                 match_list.append(math_terms_dict[term])
             # Update matched positions
             matched_positions.update((match.start(), match.end()) for match in matches)
             num_matches += len(matches)
         utt.num_math_terms = num_matches
         utt.math_terms = match_list
 class EndpointHandler():
     def __init__(self, path="."):
         print("Loading models...")