versae committed on
Commit
f562f06
·
1 Parent(s): f965ae3

Adding Numpy random number generator

Browse files
Files changed (1) hide show
  1. mc4/mc4.py +5 -4
mc4/mc4.py CHANGED
@@ -7,6 +7,8 @@ import json
7
  import datasets
8
  import kenlm
9
  import numpy as np
 
 
10
 
11
 
12
  logger = datasets.logging.get_logger(__name__)
@@ -309,7 +311,6 @@ class Mc4(datasets.GeneratorBasedBuilder):
309
  doc_length += length
310
  return 10.0 ** (-doc_log_score / doc_length)
311
 
312
-
313
  def _should_keep_doc_step(self, doc, factor=1, boundaries=None):
314
  perplexity = self.get_perplexity(doc)
315
  if boundaries is None:
@@ -323,7 +324,7 @@ class Mc4(datasets.GeneratorBasedBuilder):
323
  elif perplexity >= boundaries[2]:
324
  quartile_range = 100 * boundaries[2]
325
  probability = factor / quartile_range
326
- return np.random() < probability
327
 
328
  def _should_keep_doc_gaussian(self, doc, factor=0.4, boundaries=None):
329
  perplexity = self.get_perplexity(doc)
@@ -332,10 +333,10 @@ class Mc4(datasets.GeneratorBasedBuilder):
332
  else:
333
  m = 662247.50212365
334
  weighted_perplexity = factor * np.exp(-9/2*((perplexity-m)/m)**2)
335
- return np.random.uniform() < weighted_perplexity
336
 
337
  def _should_keep_doc_random(self, doc, factor=None, boundaries=None):
338
- return np.random() <= 0.5
339
 
340
  def _info(self):
341
  return datasets.DatasetInfo(
 
7
  import datasets
8
  import kenlm
9
  import numpy as np
10
+ from numpy.random import default_rng
11
+ rng = default_rng()
12
 
13
 
14
  logger = datasets.logging.get_logger(__name__)
 
311
  doc_length += length
312
  return 10.0 ** (-doc_log_score / doc_length)
313
 
 
314
  def _should_keep_doc_step(self, doc, factor=1, boundaries=None):
315
  perplexity = self.get_perplexity(doc)
316
  if boundaries is None:
 
324
  elif perplexity >= boundaries[2]:
325
  quartile_range = 100 * boundaries[2]
326
  probability = factor / quartile_range
327
+ return rng.uniform() < probability
328
 
329
  def _should_keep_doc_gaussian(self, doc, factor=0.4, boundaries=None):
330
  perplexity = self.get_perplexity(doc)
 
333
  else:
334
  m = 662247.50212365
335
  weighted_perplexity = factor * np.exp(-9/2*((perplexity-m)/m)**2)
336
+ return rng.uniform() < weighted_perplexity
337
 
338
  def _should_keep_doc_random(self, doc, factor=None, boundaries=None):
339
+ return rng.uniform() <= 0.5
340
 
341
  def _info(self):
342
  return datasets.DatasetInfo(