lsy641 committed on
Commit
577b530
·
1 Parent(s): 6563183
__pycache__/distinct.cpython-38.pyc CHANGED
Binary files a/__pycache__/distinct.cpython-38.pyc and b/__pycache__/distinct.cpython-38.pyc differ
 
distinct.py CHANGED
@@ -115,8 +115,9 @@ class distinct(evaluate.Measurement):
115
  def _download_and_prepare(self, dl_manager):
116
  """Optional: download external resources useful to compute the scores"""
117
 
118
- def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer=Tokenizer13a(), mode="Expectation-Adjusted-Distinct"):
119
  from nltk.util import ngrams
 
120
 
121
 
122
 
@@ -135,10 +136,10 @@ class distinct(evaluate.Measurement):
135
  if isinstance(dataForVocabCal, list) and len(dataForVocabCal) > 0 and isinstance(dataForVocabCal[0], str):
136
  vocab = set()
137
  for sentence in dataForVocabCal:
138
- if tokenizer == "white_space":
139
- vocab = vocab | set(sentence.split(" "))
140
- else:
141
- vocab = vocab | set(tokenizer.tokenize(sentence))
142
  vocab_size = len(vocab)
143
  else:
144
  raise TypeError("Argument dataForVocabCal should be a list of strings")
@@ -152,6 +153,7 @@ class distinct(evaluate.Measurement):
152
  for prediction in predictions:
153
  try:
154
  tokens = list(tokenizer.tokenize(prediction))
 
155
  tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
156
  tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
157
  except Exception as e:
 
115
  def _download_and_prepare(self, dl_manager):
116
  """Optional: download external resources useful to compute the scores"""
117
 
118
+ def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer="white_space", mode="Expectation-Adjusted-Distinct"):
119
  from nltk.util import ngrams
120
+ from nltk.tokenize import WhitespaceTokenizer
121
 
122
 
123
 
 
136
  if isinstance(dataForVocabCal, list) and len(dataForVocabCal) > 0 and isinstance(dataForVocabCal[0], str):
137
  vocab = set()
138
  for sentence in dataForVocabCal:
139
+ # if tokenizer == "white_space":
140
+ # vocab = vocab | set(sentence.split(" "))
141
+ # else:
142
+ vocab = vocab | set(tokenizer.tokenize(sentence))
143
  vocab_size = len(vocab)
144
  else:
145
  raise TypeError("Argument dataForVocabCal should be a list of strings")
 
153
  for prediction in predictions:
154
  try:
155
  tokens = list(tokenizer.tokenize(prediction))
156
+ print(tokens)
157
  tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
158
  tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
159
  except Exception as e:
tokenizer_13a.py CHANGED
@@ -98,3 +98,7 @@ class Tokenizer13a(BaseTokenizer):
98
  line = line.replace("&gt;", ">")
99
 
100
  return self._post_tokenizer(f" {line} ")
 
 
 
 
 
98
  line = line.replace("&gt;", ">")
99
 
100
  return self._post_tokenizer(f" {line} ")
101
+
102
@lru_cache(maxsize=2**16)
def tokenize(self, line):
    """Tokenize ``line`` by delegating to this object's ``__call__``.

    Exposes the tokenizer through a ``.tokenize(text)`` method so it can be
    used by code that expects that method name instead of calling the
    tokenizer object directly.

    Args:
        line: The input text to tokenize. Must be hashable, because the
            result is memoized by ``lru_cache``.

    Returns:
        Whatever ``__call__`` returns for ``line``.
    """
    # Invoke the instance directly: the bound call already supplies ``self``,
    # so passing it again raises TypeError. The result must also be returned,
    # otherwise callers always receive None.
    # NOTE(review): callers that wrap this result in ``list(...)`` will get
    # single characters if ``__call__`` returns a str -- confirm the intended
    # return type.
    return self(line)