add tokenizer config from perplexity metric. truncation breaks tests
syntaxgym.py CHANGED (+49 -11)
@@ -21,7 +21,7 @@ import datasets
 import evaluate
 import numpy as np
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizer
 
 from .prediction import Prediction
 
@@ -89,6 +89,46 @@ class SyntaxGymMetricResult(TypedDict):
     region_totals: List[Dict[Tuple[str, int], float]]
 
 
+def prepare_tokenizer(model, batch_size, add_start_token=True) -> Tuple[PreTrainedTokenizer, Dict]:
+    """
+    Load and prepare a tokenizer for SyntaxGym evaluation.
+
+    Returns:
+        tokenizer:
+        tokenizer_kwargs: suggested kwargs for any tokenizer calls
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model.name_or_path)
+
+    # if batch_size > 1 (which generally leads to padding being required), and
+    # if there is not an already assigned pad_token, assign an existing
+    # special token to also be the padding token
+    if tokenizer.pad_token is None and batch_size > 1:
+        existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
+        # check that the model already has at least one special token defined
+        assert (
+            len(existing_special_tokens) > 0
+        ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
+        # assign one of the special tokens to also be the pad token
+        tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})
+
+    if add_start_token:
+        # leave room for <BOS> token to be added:
+        assert (
+            tokenizer.bos_token is not None
+        ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
+        max_tokenized_len = model.config.max_length - 1
+    else:
+        max_tokenized_len = model.config.max_length
+
+    tokenizer_kwargs = {
+        "add_special_tokens": False,
+        "padding": True,
+        "truncation": True,
+        "max_length": max_tokenized_len
+    }
+    return tokenizer, tokenizer_kwargs
+
+
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class SyntaxGym(evaluate.EvaluationModule):
     """
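For orientation, here is a minimal sketch of how the new prepare_tokenizer helper behaves, assuming the function from the hunk above is in scope. The model name "gpt2" and the argument values are illustrative choices, not anything this commit prescribes.

# Illustrative only: "gpt2" is an arbitrary small causal LM, not mandated by the diff.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer, tokenizer_kwargs = prepare_tokenizer(model, batch_size=2, add_start_token=False)

# GPT-2 ships without a pad token, so with batch_size > 1 one of its existing
# special tokens (<|endoftext|>) is reused for padding.
assert tokenizer.pad_token is not None

# The suggested kwargs turn on padding and truncation; max_length is taken
# from model.config.max_length (minus one when add_start_token=True).
print(tokenizer_kwargs)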
@@ -110,7 +150,7 @@ class SyntaxGym(evaluate.EvaluationModule):
             codebase_urls=["https://github.com/cpllab/syntaxgym-core"],
         )
 
-    def _compute(self, suite, model_id, device=None) -> SyntaxGymMetricResult:
+    def _compute(self, suite, model_id, batch_size=8, add_start_token=False, device=None) -> SyntaxGymMetricResult:
         if device is not None:
             assert device in ["gpu", "cpu", "cuda"]
             if device == "gpu":
@@ -122,31 +162,31 @@ class SyntaxGym(evaluate.EvaluationModule):
         model = model.to(device)
         model.eval()
 
-        tokenizer =
-        # TODO copy from perplexity metric
-        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer, tokenizer_kwargs = prepare_tokenizer(model, batch_size, add_start_token)
 
         results = {"prediction_results": [], "region_totals": []}
         # TODO batch all items together
         for item in datasets.logging.tqdm(suite):
-            result_single = self._compute_single(item, tokenizer, model, device)
+            result_single = self._compute_single(item, tokenizer, tokenizer_kwargs,
+                                                 model, device)
 
             for k in ["prediction_results", "region_totals"]:
                 results[k].append(result_single[k])
 
         return results
 
-    def _compute_single(self, item, tokenizer, model, device):
+    def _compute_single(self, item, tokenizer, tokenizer_kwargs, model, device):
         tokenized = tokenizer(item["conditions"]["content"],
-                              padding=True,
                               return_tensors="pt",
-                              return_offsets_mapping=True)
+                              return_offsets_mapping=True,
+                              **tokenizer_kwargs).to(device)
 
         # input_ids: B * T
         input_ids = tokenized["input_ids"]
         assert input_ids.ndim == 2
 
         # Compute sentence level surprisals.
+        # TODO support sentences which exceed truncation length
         with torch.no_grad():
             # Pre-softmax predictive distribution B * T * V
             logits = model(input_ids).logits
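Continuing the sketch above (same model, tokenizer, and tokenizer_kwargs), this is roughly the tokenization and forward pass that _compute_single now performs. The condition sentences are invented stand-ins for what a real suite item supplies via item["conditions"]["content"].

import torch

# Invented stand-ins for a suite item's condition sentences.
conditions = ["The keys to the cabinet are on the table.",
              "The keys to the cabinet is on the table."]

tokenized = tokenizer(conditions,
                      return_tensors="pt",
                      return_offsets_mapping=True,
                      **tokenizer_kwargs)

input_ids = tokenized["input_ids"]      # B * T, padded to the longest condition
offsets = tokenized["offset_mapping"]   # B * T * 2 character spans per token
assert input_ids.ndim == 2

with torch.no_grad():
    logits = model(input_ids).logits    # pre-softmax scores, B * T * V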
@@ -164,8 +204,6 @@ class SyntaxGym(evaluate.EvaluationModule):
         # reindexed surprisals: B * (T - 1)
         surprisals = torch.gather(surps_shifted, 2, expected_ids.unsqueeze(2)) \
             .squeeze(2)
-        # This is the original, which works but not with multiple axes in expected_ids
-        # surprisals = surps_shifted[range(surps_shifted.shape[0]), expected_ids]
 
         # surprisals is now B * (T - 1)
 
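The comment removed in this hunk noted that the original fancy-indexing version did not handle multiple axes in expected_ids. The toy example below (shapes and values invented, not taken from the diff) shows the gather-based lookup doing that per-position selection across a batch, alongside an equivalent advanced-indexing form.

import torch

B, Tm1, V = 2, 5, 11                        # batch size, T - 1 positions, vocab size
surps_shifted = torch.randn(B, Tm1, V)      # stand-in for shifted per-position scores
expected_ids = torch.randint(V, (B, Tm1))   # observed next-token ids, B * (T - 1)

# Picks surps_shifted[b, t, expected_ids[b, t]] for every (b, t) pair, which the
# single-axis indexing in the removed comment cannot express once expected_ids
# carries a batch dimension.
surprisals = torch.gather(surps_shifted, 2, expected_ids.unsqueeze(2)).squeeze(2)
assert surprisals.shape == (B, Tm1)

# Equivalent advanced-indexing formulation, for comparison.
b_idx = torch.arange(B).unsqueeze(1)        # B * 1, broadcasts over positions
t_idx = torch.arange(Tm1).unsqueeze(0)      # 1 * (T - 1)
assert torch.equal(surprisals, surps_shifted[b_idx, t_idx, expected_ids])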