Update src/DetectLM.py
Browse files- src/DetectLM.py +5 -4
src/DetectLM.py
CHANGED
@@ -11,7 +11,7 @@ def truncae_to_max_no_tokens(text, max_no_tokens):
|
|
11 |
|
12 |
class DetectLM(object):
|
13 |
def __init__(self, sentence_detection_function, survival_function_per_length,
|
14 |
-
min_len=4, max_len=100, HC_type="stbl",
|
15 |
length_limit_policy='truncate', ignore_first_sentence=False):
|
16 |
"""
|
17 |
Test for the presence of sentences of irregular origin as reflected by the
|
@@ -41,6 +41,7 @@ class DetectLM(object):
|
|
41 |
self.length_limit_policy = length_limit_policy
|
42 |
self.ignore_first_sentence = ignore_first_sentence
|
43 |
self.HC_stbl = True if HC_type == 'stbl' else False
|
|
|
44 |
|
45 |
def _logperp(self, sent: str, context=None) -> float:
|
46 |
return float(self.sentence_detector(sent, context))
|
@@ -136,7 +137,7 @@ class DetectLM(object):
|
|
136 |
def testHC(self, sentences: list) -> float:
|
137 |
pvals = np.array(self.get_pvals(sentences)[1])
|
138 |
mt = MultiTest(pvals, stbl=self.HC_stbl)
|
139 |
-
return mt.hc(gamma=
|
140 |
|
141 |
def testFisher(self, sentences: list) -> dict:
|
142 |
pvals = np.array(self.get_pvals(sentences)[1])
|
@@ -167,11 +168,11 @@ class DetectLM(object):
|
|
167 |
fisher = (np.nan, np.nan)
|
168 |
df['mask'] = pd.NA
|
169 |
else:
|
170 |
-
hc, hct = mt.hc(gamma=
|
171 |
fisher = mt.fisher()
|
172 |
df['mask'] = df['pvalue'] <= hct
|
173 |
if dashboard:
|
174 |
-
mt.hc_dashboard(gamma=
|
175 |
return dict(sentences=df, HC=hc, fisher=fisher[0], fisher_pvalue=fisher[1])
|
176 |
|
177 |
def __call__(self, lo_chunks: list, lo_contexts: list, dashboard=False) -> dict:
|
|
|
11 |
|
12 |
class DetectLM(object):
|
13 |
def __init__(self, sentence_detection_function, survival_function_per_length,
|
14 |
+
min_len=4, max_len=100, HC_type="stbl", gamma=0.15,
|
15 |
length_limit_policy='truncate', ignore_first_sentence=False):
|
16 |
"""
|
17 |
Test for the presence of sentences of irregular origin as reflected by the
|
|
|
41 |
self.length_limit_policy = length_limit_policy
|
42 |
self.ignore_first_sentence = ignore_first_sentence
|
43 |
self.HC_stbl = True if HC_type == 'stbl' else False
|
44 |
+
self.gamma = gamma
|
45 |
|
46 |
def _logperp(self, sent: str, context=None) -> float:
|
47 |
return float(self.sentence_detector(sent, context))
|
|
|
137 |
def testHC(self, sentences: list) -> float:
|
138 |
pvals = np.array(self.get_pvals(sentences)[1])
|
139 |
mt = MultiTest(pvals, stbl=self.HC_stbl)
|
140 |
+
return mt.hc(gamma=self.gamma)[0]
|
141 |
|
142 |
def testFisher(self, sentences: list) -> dict:
|
143 |
pvals = np.array(self.get_pvals(sentences)[1])
|
|
|
168 |
fisher = (np.nan, np.nan)
|
169 |
df['mask'] = pd.NA
|
170 |
else:
|
171 |
+
hc, hct = mt.hc(gamma=self.gamma)
|
172 |
fisher = mt.fisher()
|
173 |
df['mask'] = df['pvalue'] <= hct
|
174 |
if dashboard:
|
175 |
+
mt.hc_dashboard(gamma=self.gamma)
|
176 |
return dict(sentences=df, HC=hc, fisher=fisher[0], fisher_pvalue=fisher[1])
|
177 |
|
178 |
def __call__(self, lo_chunks: list, lo_contexts: list, dashboard=False) -> dict:
|