update distinct
distinct.py  +7 -7
CHANGED
@@ -146,27 +146,27 @@ class distinct(evaluate.Measurement):
 total_tokens = []
 total_tokens_2grams = []
 total_tokens_3grams = []
+
 for prediction in predictions:
     if tokenizer == "white_space":
         tokens = prediction.split(" ")
-        tokens_2grams = ngrams(prediction.split(" "), 2, left_pad_symbol='<s>')
-        tokens_3grams = ngrams(prediction.split(" "), 3, left_pad_symbol='<s>')
+        tokens_2grams = list(ngrams(prediction.split(" "), 2, pad_left=True, left_pad_symbol='<s>'))
+        tokens_3grams = list(ngrams(prediction.split(" "), 3, pad_left=True, left_pad_symbol='<s>'))
     else:
         try:
             tokens = list(tokenizer.tokenize(prediction))
-            tokens_2grams = ngrams(list(tokenizer.tokenize(prediction)), 2, left_pad_symbol='<s>')
-            tokens_3grams = ngrams(list(tokenizer.tokenize(prediction)), 3, left_pad_symbol='<s>')
-
+            tokens_2grams = list(ngrams(list(tokenizer.tokenize(prediction)), 2, pad_left=True, left_pad_symbol='<s>'))
+            tokens_3grams = list(ngrams(list(tokenizer.tokenize(prediction)), 3, pad_left=True, left_pad_symbol='<s>'))
         except Exception as e:
             raise e
-
+
     distinct_tokens = distinct_tokens | set(tokens)
     distinct_tokens_2grams = distinct_tokens_2grams | set(tokens_2grams)
     distinct_tokens_3grams = distinct_tokens_3grams | set(tokens_3grams)
     total_tokens.extend(tokens)
     total_tokens_2grams.extend(list(tokens_2grams))
     total_tokens_3grams.extend(list(tokens_3grams))
-
+
 Distinct_1 = len(distinct_tokens)/len(total_tokens)
 Distinct_2 = len(distinct_tokens_2grams)/len(total_tokens_2grams)
 Distinct_3 = len(distinct_tokens_3grams)/len(total_tokens_3grams)
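The net effect of the changed lines: the bare ngrams(...) generators become materialized lists, and pad_left=True is passed so that left_pad_symbol actually takes effect. A minimal sketch of the behavioral difference, assuming ngrams here is nltk.util.ngrams (the hunk does not show the import, so that part is an assumption):

# Sketch of the old vs. new ngrams calls, assuming ngrams is nltk.util.ngrams.
from nltk.util import ngrams

tokens = "the cat sat".split(" ")

# Old call: without pad_left=True the left_pad_symbol argument is ignored,
# and the return value is a generator, so it is empty after the first pass
# (set(...) consumes it before the later .extend(list(...))).
old = ngrams(tokens, 2, left_pad_symbol='<s>')
print(set(old))   # {('the', 'cat'), ('cat', 'sat')}
print(list(old))  # [] -- generator already exhausted

# New call: pad_left=True adds the ('<s>', 'the') boundary bigram, and
# list(...) materializes the n-grams so both set(...) and .extend(...)
# see the same data.
new = list(ngrams(tokens, 2, pad_left=True, left_pad_symbol='<s>'))
print(set(new))   # {('<s>', 'the'), ('the', 'cat'), ('cat', 'sat')}
print(list(new))  # [('<s>', 'the'), ('the', 'cat'), ('cat', 'sat')]

Materializing the n-grams is what keeps total_tokens_2grams and total_tokens_3grams from ending up empty after the set(...) calls, so the Distinct_2 and Distinct_3 denominators stay meaningful.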