Adding NumPy random number generator
Browse files - mc4/mc4.py +5 -4
mc4/mc4.py
CHANGED
@@ -7,6 +7,8 @@ import json
|
|
7 |
import datasets
|
8 |
import kenlm
|
9 |
import numpy as np
|
|
|
|
|
10 |
|
11 |
|
12 |
logger = datasets.logging.get_logger(__name__)
|
@@ -309,7 +311,6 @@ class Mc4(datasets.GeneratorBasedBuilder):
|
|
309 |
doc_length += length
|
310 |
return 10.0 ** (-doc_log_score / doc_length)
|
311 |
|
312 |
-
|
313 |
def _should_keep_doc_step(self, doc, factor=1, boundaries=None):
|
314 |
perplexity = self.get_perplexity(doc)
|
315 |
if boundaries is None:
|
@@ -323,7 +324,7 @@ class Mc4(datasets.GeneratorBasedBuilder):
|
|
323 |
elif perplexity >= boundaries[2]:
|
324 |
quartile_range = 100 * boundaries[2]
|
325 |
probability = factor / quartile_range
|
326 |
-
return
|
327 |
|
328 |
def _should_keep_doc_gaussian(self, doc, factor=0.4, boundaries=None):
|
329 |
perplexity = self.get_perplexity(doc)
|
@@ -332,10 +333,10 @@ class Mc4(datasets.GeneratorBasedBuilder):
|
|
332 |
else:
|
333 |
m = 662247.50212365
|
334 |
weighted_perplexity = factor * np.exp(-9/2*((perplexity-m)/m)**2)
|
335 |
-
return
|
336 |
|
337 |
def _should_keep_doc_random(self, doc, factor=None, boundaries=None):
|
338 |
-
return
|
339 |
|
340 |
def _info(self):
|
341 |
return datasets.DatasetInfo(
|
|
|
7 |
import datasets
|
8 |
import kenlm
|
9 |
import numpy as np
|
10 |
+
from numpy.random import default_rng
|
11 |
+
rng = default_rng()
|
12 |
|
13 |
|
14 |
logger = datasets.logging.get_logger(__name__)
|
|
|
311 |
doc_length += length
|
312 |
return 10.0 ** (-doc_log_score / doc_length)
|
313 |
|
|
|
314 |
def _should_keep_doc_step(self, doc, factor=1, boundaries=None):
|
315 |
perplexity = self.get_perplexity(doc)
|
316 |
if boundaries is None:
|
|
|
324 |
elif perplexity >= boundaries[2]:
|
325 |
quartile_range = 100 * boundaries[2]
|
326 |
probability = factor / quartile_range
|
327 |
+
return rng.uniform() < probability
|
328 |
|
329 |
def _should_keep_doc_gaussian(self, doc, factor=0.4, boundaries=None):
|
330 |
perplexity = self.get_perplexity(doc)
|
|
|
333 |
else:
|
334 |
m = 662247.50212365
|
335 |
weighted_perplexity = factor * np.exp(-9/2*((perplexity-m)/m)**2)
|
336 |
+
return rng.uniform() < weighted_perplexity
|
337 |
|
338 |
def _should_keep_doc_random(self, doc, factor=None, boundaries=None):
|
339 |
+
return rng.uniform() <= 0.5
|
340 |
|
341 |
def _info(self):
|
342 |
return datasets.DatasetInfo(
|