hylee719 committed on
Commit
4c48e8e
1 Parent(s): 0da9196

add math terms

Browse files
Files changed (1) hide show
  1. handler.py +32 -2
handler.py CHANGED
@@ -2,10 +2,11 @@ from typing import Dict, List, Any
2
  from scipy.special import softmax
3
  import numpy as np
4
  import weakref
 
5
 
6
  from utils import clean_str, clean_str_nopunct
7
  import torch
8
- from utils import MultiHeadModel, BertInputBuilder, get_num_words
9
 
10
  import transformers
11
  from transformers import BertTokenizer, BertForSequenceClassification
@@ -29,6 +30,8 @@ class Utterance:
29
  self.endtime = endtime
30
  self.transcript = weakref.ref(transcript) if transcript else None
31
  self.props = kwargs
 
 
32
 
33
  self.uptake = None
34
  self.reasoning = None
@@ -53,7 +56,9 @@ class Utterance:
53
  'uptake': self.uptake,
54
  'reasoning': self.reasoning,
55
  'question': self.question,
56
- 'focusingquestion': self.focusing_question,
 
 
57
  **self.props
58
  }
59
 
@@ -252,6 +257,29 @@ class FocusingQuestionModel:
252
  return output
253
 
254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  class EndpointHandler():
256
  def __init__(self, path="."):
257
  print("Loading models...")
@@ -304,4 +332,6 @@ class EndpointHandler():
304
  self.device, self.tokenizer, self.input_builder)
305
  focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
306
 
 
 
307
  return transcript.to_dict()
 
2
  from scipy.special import softmax
3
  import numpy as np
4
  import weakref
5
+ import re
6
 
7
  from utils import clean_str, clean_str_nopunct
8
  import torch
9
+ from utils import MultiHeadModel, BertInputBuilder, get_num_words, MATH_PREFIXES, MATH_WORDS
10
 
11
  import transformers
12
  from transformers import BertTokenizer, BertForSequenceClassification
 
30
  self.endtime = endtime
31
  self.transcript = weakref.ref(transcript) if transcript else None
32
  self.props = kwargs
33
+ self.num_math_terms = None
34
+ self.math_terms = None
35
 
36
  self.uptake = None
37
  self.reasoning = None
 
56
  'uptake': self.uptake,
57
  'reasoning': self.reasoning,
58
  'question': self.question,
59
+ 'focusingQuestion': self.focusing_question,
60
+ 'numMathTerms': self.num_math_terms,
61
+ 'mathTerms': self.math_terms,
62
  **self.props
63
  }
64
 
 
257
  return output
258
 
259
 
260
def _math_term_pattern(term):
    """Return the regex source that matches ``term`` as a standalone word.

    A term listed in ``MATH_PREFIXES`` may additionally carry a plural
    suffix (``s`` or ``es``).

    The word boundaries are zero-width lookarounds rather than consuming
    groups. With consuming groups, the single separator between two adjacent
    occurrences (e.g. ``"sum sum"``) is swallowed by the first match, so the
    second occurrence can never match and the count comes out too low.
    """
    # NOTE(review): ``term`` is interpolated as-is (not ``re.escape``d), as in
    # the previous implementation; confirm MATH_WORDS holds only plain words.
    suffix = "(?:s|es)?" if term in MATH_PREFIXES else ""
    return f"(?<![a-zA-Z]){term}{suffix}(?![a-zA-Z])"


def load_math_terms():
    """Build one regex pattern string per entry of ``MATH_WORDS``.

    Returns:
        A list of regex source strings, in the same order as ``MATH_WORDS``.
    """
    return [_math_term_pattern(term) for term in MATH_WORDS]


def run_math_density(transcript):
    """Annotate every utterance of ``transcript`` with its math-term usage.

    For each utterance in ``transcript.utterances`` this sets:

    * ``num_math_terms`` -- total number of math-term occurrences found in
      ``utt.get_clean_text(remove_punct=False)``.
    * ``math_terms`` -- the distinct terms from ``MATH_WORDS`` that occurred,
      in ``MATH_WORDS`` order. The vocabulary term itself is stored, not the
      regex used to find it.

    Args:
        transcript: object exposing an iterable ``utterances`` attribute whose
            items provide ``get_clean_text(remove_punct=...)``.

    Returns:
        None. The utterances are updated in place.
    """
    # Compile once up front instead of re-resolving every pattern for every
    # utterance; keep the originating term next to its compiled pattern so it
    # can be reported by name.
    compiled_terms = [
        (term, re.compile(_math_term_pattern(term))) for term in MATH_WORDS
    ]

    for utt in transcript.utterances:
        text = utt.get_clean_text(remove_punct=False)
        num_math_terms = 0
        # A dict is used as an insertion-ordered set so the reported list is
        # de-duplicated and deterministic.
        found_math_terms = {}
        for term, pattern in compiled_terms:
            count = len(pattern.findall(text))
            if count > 0:
                found_math_terms[term] = None
                num_math_terms += count
        utt.num_math_terms = num_math_terms
        utt.math_terms = list(found_math_terms)
282
+
283
  class EndpointHandler():
284
  def __init__(self, path="."):
285
  print("Loading models...")
 
332
  self.device, self.tokenizer, self.input_builder)
333
  focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
334
 
335
+ run_math_density(transcript)
336
+
337
  return transcript.to_dict()