Jon Gauthier committed
Commit 8cca3d0 · 1 Parent(s): 8fe0b5d
refactor metric to support evaluating `all-2020` split
- syntaxgym.py  +37 −19
- test/test_syntaxgym.py  +18 −1
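With this change, a single compute() call can score a dataset that mixes items from many suites (such as the composite `all-2020` split), returning one result object per suite rather than a single flat result. A minimal usage sketch, mirroring the new test at the bottom of this diff; the evaluate.load path is an assumption, since the tests obtain the metric through a fixture:

    import datasets
    import evaluate

    # Assumption: the metric Space is loadable by its repo id.
    syntaxgym_metric = evaluate.load("cpllab/syntaxgym")

    # The composite dataset interleaves items from every suite; each item
    # carries a "suite_name" field that the metric now groups by.
    full_dataset = datasets.load_dataset("cpllab/syntaxgym")
    result = syntaxgym_metric.compute(dataset=full_dataset["test"], model_id="gpt2")

    # `result` maps suite name -> per-suite result holding prediction_results
    # and region_totals, one entry per item.
    print(result["number_prep"].prediction_results[:1])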
syntaxgym.py CHANGED
@@ -187,14 +187,25 @@ class SyntaxGym(evaluate.EvaluationModule):
 
         tokenizer, tokenizer_kwargs = prepare_tokenizer(model, batch_size, add_start_token)
 
-        # Flatten sentences, enforcing that sentences are always ordered by the same condition
-        condition_order = dataset[0]["conditions"]["condition_name"]
+        # Flatten sentences, enforcing that sentences are always ordered by the same condition
+        # within-suite.
+        condition_orders = {}
+        for item in dataset:
+            condition_orders[item["suite_name"]] = item["conditions"]["condition_name"]
+        # Flattened batch of sentences
         all_sentences = []
+        # Mapping from sentence back to originating suite
+        all_sentence_suites = []
+        # Mapping from item back to originating suite
+        all_item_suites = []
         for item in dataset:
-            for condition_name in condition_order:
+            for condition_name in condition_orders[item["suite_name"]]:
                 # Get idx of condition for this item.
                 condition_idx = item["conditions"]["condition_name"].index(condition_name)
+
                 all_sentences.append(item["conditions"]["content"][condition_idx])
+                all_sentence_suites.append(item["suite_name"])
+                all_item_suites.append(item["suite_name"])
 
         # Tokenize sentences and split into batches.
         all_tokenized_sentences = tokenizer(all_sentences, return_tensors="pt",
@@ -205,7 +216,7 @@ class SyntaxGym(evaluate.EvaluationModule):
         # Compute surprisal per-batch and combine into a single surprisal tensor.
         n_sentences, n_timesteps = all_tokenized_sentences["input_ids"].shape
         surprisals = torch.zeros(n_sentences, n_timesteps - 1).float().to(device)
-        for i, batch in enumerate(datasets.logging.tqdm(tokenized_batches)):
+        for i, batch in enumerate(datasets.logging.tqdm(tokenized_batches, desc="Computing surprisals", unit="batch")):
             batch = batch.to(device)
             with torch.no_grad():
                 # logits are B * T * V
@@ -219,22 +230,29 @@ class SyntaxGym(evaluate.EvaluationModule):
 
             surprisals[i * batch_size : (i + 1) * batch_size] = b_surprisals_gt
 
-        # Reshape to intuitive axes n_items * n_conditions * ...
-        surprisals = surprisals.reshape((len(dataset), len(condition_order), -1))
-        offset_mapping = all_tokenized_sentences["offset_mapping"] \
-            .reshape((len(dataset), len(condition_order), -1, 2))
-
-        # Now evaluate per-item.
+        # Aggregate results within-suite
         results = {}
-
-
-
-
-
-
-
-
-
+        all_sentence_suites = np.array(all_sentence_suites)
+        all_item_suites = np.array(all_item_suites)
+        for suite, condition_order in datasets.logging.tqdm(condition_orders.items(), unit="suite"):
+            suite_sentence_idxs = np.where(all_sentence_suites == suite)[0]
+            suite_item_idxs = np.where(all_item_suites == suite)[0]
+            suite_surprisals = surprisals[suite_sentence_idxs]
+
+            # Reshape to intuitive axes n_items * n_conditions * ...
+            suite_surprisals = suite_surprisals.reshape((len(suite_item_idxs), len(condition_order), -1))
+            suite_offset_mapping = all_tokenized_sentences["offset_mapping"][suite_sentence_idxs] \
+                .reshape((len(suite_item_idxs), len(condition_order), -1, 2))
+
+            # Evaluate per-item
+            suite_result = SyntaxGymMetricSuiteResult(suite, [], [])
+            suite_items = datasets.logging.tqdm([dataset[idx] for idx in suite_item_idxs], unit="item")
+            for item, item_surprisals, item_offset_mapping in zip(suite_items, suite_surprisals, suite_offset_mapping):
+                result_i = self._compute_item(item, item_surprisals, item_offset_mapping, condition_order)
+                suite_result.prediction_results.append(result_i["prediction_results"])
+                suite_result.region_totals.append(result_i["region_totals"])
+
+            results[suite] = suite_result
 
         return results
 
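The heart of the refactor is the suite bookkeeping above: every flattened sentence and every item records its originating suite, so after one batched surprisal pass over all sentences, np.where recovers each suite's rows and a reshape restores the intuitive items × conditions axes. A self-contained sketch of that regrouping with toy shapes (names and numbers here are illustrative, not taken from the metric):

    import numpy as np
    import torch

    # Toy flattened batch: suite "A" has 2 items, suite "B" has 1 item,
    # each item contributing 2 conditions, flattened item-major.
    all_sentence_suites = np.array(["A", "A", "A", "A", "B", "B"])
    n_conditions = 2
    surprisals = torch.arange(6 * 3, dtype=torch.float).reshape(6, 3)  # (n_sentences, n_timesteps)

    for suite in ("A", "B"):
        idxs = np.where(all_sentence_suites == suite)[0]   # rows belonging to this suite
        suite_surprisals = surprisals[idxs]                # flat: (n_items * n_conditions, T)
        n_items = len(idxs) // n_conditions
        # Same reshape as in the diff: n_items * n_conditions * T.
        suite_surprisals = suite_surprisals.reshape(n_items, n_conditions, -1)
        print(suite, tuple(suite_surprisals.shape))        # A (2, 2, 3), then B (1, 2, 3)

This relies on the flattening loop emitting each suite's sentences in item-major, condition-minor order, which the fixed per-suite condition_orders mapping guarantees.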
test/test_syntaxgym.py CHANGED
@@ -513,4 +513,21 @@ def test_gpt_subordination_region_totals(syntaxgym_metric):
                                      for region_totals_i in GPT2_SUBORDINATION_SRC_REFERENCE])
     pprint(sorted(zip(keys, np.abs(result_ndarray - reference_ndarray)),
                   key=lambda x: -x[1]))
-    np.testing.assert_allclose(result_ndarray, reference_ndarray, atol=1e-3)
+    np.testing.assert_allclose(result_ndarray, reference_ndarray, atol=1e-3)
+
+
+def test_evaluation_all_vs_single(syntaxgym_metric):
+    """
+    Check that a suite's performance is the same when evaluated in the composite
+    benchmark vs. evaluated independently.
+    """
+
+    suite_name = "number_prep"
+    full_dataset = datasets.load_dataset("cpllab/syntaxgym")
+    sub_dataset = datasets.load_dataset("cpllab/syntaxgym", suite_name)
+    model_id = "hf-internal-testing/tiny-xlm-roberta"
+
+    full_result = syntaxgym_metric.compute(dataset=full_dataset["test"], model_id=model_id)
+    sub_result = syntaxgym_metric.compute(dataset=sub_dataset["test"], model_id=model_id)
+
+    assert full_result[suite_name].prediction_results == sub_result[suite_name].prediction_results
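The new test indexes the result by suite name and compares the prediction_results attribute, matching the SyntaxGymMetricSuiteResult(suite, [], []) constructor calls in the diff above. The container's actual definition lives elsewhere in syntaxgym.py and may differ; a plausible shape, for orientation only:

    from dataclasses import dataclass
    from typing import Any, Dict, List

    @dataclass
    class SyntaxGymMetricSuiteResult:
        """Hypothetical sketch: per-suite output, one list entry per item."""
        suite_name: str
        prediction_results: List[Any]
        region_totals: List[Dict[Any, float]]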