idkash1 committed on
Commit
e24467d
·
verified ·
1 Parent(s): 41afd3c

Update src/DetectLM.py

Browse files
Files changed (1) hide show
  1. src/DetectLM.py +5 -4
src/DetectLM.py CHANGED
@@ -11,7 +11,7 @@ def truncae_to_max_no_tokens(text, max_no_tokens):
11
 
12
  class DetectLM(object):
13
  def __init__(self, sentence_detection_function, survival_function_per_length,
14
- min_len=4, max_len=100, HC_type="stbl",
15
  length_limit_policy='truncate', ignore_first_sentence=False):
16
  """
17
  Test for the presence of sentences of irregular origin as reflected by the
@@ -41,6 +41,7 @@ class DetectLM(object):
41
  self.length_limit_policy = length_limit_policy
42
  self.ignore_first_sentence = ignore_first_sentence
43
  self.HC_stbl = True if HC_type == 'stbl' else False
 
44
 
45
  def _logperp(self, sent: str, context=None) -> float:
46
  return float(self.sentence_detector(sent, context))
@@ -136,7 +137,7 @@ class DetectLM(object):
136
  def testHC(self, sentences: list) -> float:
137
  pvals = np.array(self.get_pvals(sentences)[1])
138
  mt = MultiTest(pvals, stbl=self.HC_stbl)
139
- return mt.hc(gamma=0.4)[0]
140
 
141
  def testFisher(self, sentences: list) -> dict:
142
  pvals = np.array(self.get_pvals(sentences)[1])
@@ -167,11 +168,11 @@ class DetectLM(object):
167
  fisher = (np.nan, np.nan)
168
  df['mask'] = pd.NA
169
  else:
170
- hc, hct = mt.hc(gamma=0.4)
171
  fisher = mt.fisher()
172
  df['mask'] = df['pvalue'] <= hct
173
  if dashboard:
174
- mt.hc_dashboard(gamma=0.4)
175
  return dict(sentences=df, HC=hc, fisher=fisher[0], fisher_pvalue=fisher[1])
176
 
177
  def __call__(self, lo_chunks: list, lo_contexts: list, dashboard=False) -> dict:
 
11
 
12
  class DetectLM(object):
13
  def __init__(self, sentence_detection_function, survival_function_per_length,
14
+ min_len=4, max_len=100, HC_type="stbl", gamma=0.15,
15
  length_limit_policy='truncate', ignore_first_sentence=False):
16
  """
17
  Test for the presence of sentences of irregular origin as reflected by the
 
41
  self.length_limit_policy = length_limit_policy
42
  self.ignore_first_sentence = ignore_first_sentence
43
  self.HC_stbl = True if HC_type == 'stbl' else False
44
+ self.gamma = gamma
45
 
46
  def _logperp(self, sent: str, context=None) -> float:
47
  return float(self.sentence_detector(sent, context))
 
137
  def testHC(self, sentences: list) -> float:
138
  pvals = np.array(self.get_pvals(sentences)[1])
139
  mt = MultiTest(pvals, stbl=self.HC_stbl)
140
+ return mt.hc(gamma=self.gamma)[0]
141
 
142
  def testFisher(self, sentences: list) -> dict:
143
  pvals = np.array(self.get_pvals(sentences)[1])
 
168
  fisher = (np.nan, np.nan)
169
  df['mask'] = pd.NA
170
  else:
171
+ hc, hct = mt.hc(gamma=self.gamma)
172
  fisher = mt.fisher()
173
  df['mask'] = df['pvalue'] <= hct
174
  if dashboard:
175
+ mt.hc_dashboard(gamma=self.gamma)
176
  return dict(sentences=df, HC=hc, fisher=fisher[0], fisher_pvalue=fisher[1])
177
 
178
  def __call__(self, lo_chunks: list, lo_contexts: list, dashboard=False) -> dict: