distinct

Changed files:
- __pycache__/distinct.cpython-38.pyc (+0 -0)
- distinct.py (+7 -5)
- tokenizer_13a.py (+4 -0)
__pycache__/distinct.cpython-38.pyc
CHANGED

Binary files a/__pycache__/distinct.cpython-38.pyc and b/__pycache__/distinct.cpython-38.pyc differ
distinct.py
CHANGED
@@ -115,8 +115,9 @@ class distinct(evaluate.Measurement):
     def _download_and_prepare(self, dl_manager):
         """Optional: download external resources useful to compute the scores"""

-    def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer=
+    def _compute(self, predictions, dataForVocabCal=None, vocab_size=None, tokenizer="white_space", mode="Expectation-Adjusted-Distinct"):
         from nltk.util import ngrams
+        from nltk.tokenize import WhitespaceTokenizer



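The new `from nltk.tokenize import WhitespaceTokenizer` import, together with the unconditional `tokenizer.tokenize(...)` calls in the next hunk, suggests the string default `tokenizer="white_space"` is mapped to an NLTK tokenizer object earlier in `_compute` (that resolution is outside the hunks shown). A minimal sketch of that assumed mapping; the helper name `resolve_tokenizer` is illustrative, not from this repo:

    from nltk.tokenize import WhitespaceTokenizer

    def resolve_tokenizer(tokenizer):
        # Hypothetical helper: turn the "white_space" default into an object
        # exposing .tokenize(str), so the rest of _compute can treat the
        # default and user-supplied tokenizers uniformly.
        if tokenizer == "white_space":
            return WhitespaceTokenizer()
        return tokenizer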
@@ -135,10 +136,10 @@ class distinct(evaluate.Measurement):
         if isinstance(dataForVocabCal, list) and len(dataForVocabCal) > 0 and isinstance(dataForVocabCal[0], str):
             vocab = set()
             for sentence in dataForVocabCal:
-                if tokenizer == "white_space":
-                    vocab = vocab | set(sentence.split(" "))
-                else:
-                    vocab = vocab | set(tokenizer.tokenize(sentence))
+                # if tokenizer == "white_space":
+                #     vocab = vocab | set(sentence.split(" "))
+                # else:
+                vocab = vocab | set(tokenizer.tokenize(sentence))
             vocab_size = len(vocab)
         else:
             raise TypeError("Argument dataForVocabCal should be a list of strings")
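This hunk removes the string-based branch, so the vocabulary is always built through the tokenizer object; the resulting `vocab_size` feeds the `Expectation-Adjusted-Distinct` mode named in the new signature. The same set-union computation, run standalone on a made-up corpus:

    from nltk.tokenize import WhitespaceTokenizer

    dataForVocabCal = ["I am happy", "I am sad"]  # made-up example corpus
    tokenizer = WhitespaceTokenizer()

    vocab = set()
    for sentence in dataForVocabCal:
        vocab = vocab | set(tokenizer.tokenize(sentence))

    vocab_size = len(vocab)  # 4: {"I", "am", "happy", "sad"}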
@@ -152,6 +153,7 @@ class distinct(evaluate.Measurement):
         for prediction in predictions:
             try:
                 tokens = list(tokenizer.tokenize(prediction))
+                print(tokens)
                 tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
                 tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
             except Exception as e:
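For reference, `nltk.util.ngrams` with `pad_left=True` prepends n-1 copies of the pad symbol, so each prediction contributes exactly as many n-grams as it has tokens. A quick check of the call used above, on made-up tokens:

    from nltk.util import ngrams

    tokens = ["I", "am", "happy"]
    tokens_2grams = list(ngrams(tokens, 2, pad_left=True, left_pad_symbol='<s>'))
    # [('<s>', 'I'), ('I', 'am'), ('am', 'happy')] -- three bigrams for three tokens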
tokenizer_13a.py
CHANGED
@@ -98,3 +98,7 @@ class Tokenizer13a(BaseTokenizer):
         line = line.replace("&gt;", ">")

         return self._post_tokenizer(f" {line} ")
+
+    @lru_cache(maxsize=2**16)
+    def tokenize(self, line):
+        self.__call__(self, line)
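As committed, the cached wrapper is buggy: `self.__call__(self, line)` passes `self` twice, and the result is never returned, so `tokenize` always yields None; it also relies on `lru_cache` already being imported in this module. A sketch of what was presumably intended, with a placeholder `__call__` standing in for the real 13a logic:

    from functools import lru_cache

    class Tokenizer13a:  # stand-in; the real class derives from BaseTokenizer
        def __call__(self, line):
            return line.split()  # placeholder for the actual 13a tokenization

        @lru_cache(maxsize=2**16)
        def tokenize(self, line):
            # Delegate to __call__ and return the result; the committed
            # version passed `self` twice and dropped the return value.
            return self.__call__(line)

One caveat with this pattern: `lru_cache` on an instance method includes `self` in the cache key, so cached entries keep tokenizer instances alive for the lifetime of the cache.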