Jon Gauthier committed
Commit 8cca3d0 · 1 Parent(s): 8fe0b5d
refactor metric to support evaluating `all-2020` split
- syntaxgym.py  +37 −19
- test/test_syntaxgym.py  +18 −1
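With this change, a single compute() call can score a dataset that mixes items from many suites (such as the composite `all-2020` split), returning one result object per suite rather than a single flat result. A minimal usage sketch, mirroring the new test at the bottom of this diff; the evaluate.load path is an assumption, since the tests obtain the metric through a fixture:

    import datasets
    import evaluate

    # Assumption: the metric Space is loadable by its repo id.
    syntaxgym_metric = evaluate.load("cpllab/syntaxgym")

    # The composite dataset interleaves items from every suite; each item
    # carries a "suite_name" field that the metric now groups by.
    full_dataset = datasets.load_dataset("cpllab/syntaxgym")
    result = syntaxgym_metric.compute(dataset=full_dataset["test"], model_id="gpt2")

    # `result` maps suite name -> per-suite result holding prediction_results
    # and region_totals, one entry per item.
    print(result["number_prep"].prediction_results[:1])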
syntaxgym.py CHANGED
@@ -187,14 +187,25 @@ class SyntaxGym(evaluate.EvaluationModule):
 
         tokenizer, tokenizer_kwargs = prepare_tokenizer(model, batch_size, add_start_token)
 
-        # Flatten sentences, enforcing that sentences are always ordered by the same condition
-        condition_order = dataset[0]["conditions"]["condition_name"]
+        # Flatten sentences, enforcing that sentences are always ordered by the same condition
+        # within-suite.
+        condition_orders = {}
+        for item in dataset:
+            condition_orders[item["suite_name"]] = item["conditions"]["condition_name"]
+        # Flattened batch of sentences
         all_sentences = []
+        # Mapping from sentence back to originating suite
+        all_sentence_suites = []
+        # Mapping from item back to originating suite
+        all_item_suites = []
         for item in dataset:
-            for condition_name in condition_order:
+            for condition_name in condition_orders[item["suite_name"]]:
                 # Get idx of condition for this item.
                 condition_idx = item["conditions"]["condition_name"].index(condition_name)
+
                 all_sentences.append(item["conditions"]["content"][condition_idx])
+                all_sentence_suites.append(item["suite_name"])
+                all_item_suites.append(item["suite_name"])
 
         # Tokenize sentences and split into batches.
         all_tokenized_sentences = tokenizer(all_sentences, return_tensors="pt",
@@ -205,7 +216,7 @@ class SyntaxGym(evaluate.EvaluationModule):
         # Compute surprisal per-batch and combine into a single surprisal tensor.
         n_sentences, n_timesteps = all_tokenized_sentences["input_ids"].shape
         surprisals = torch.zeros(n_sentences, n_timesteps - 1).float().to(device)
-        for i, batch in enumerate(datasets.logging.tqdm(tokenized_batches)):
+        for i, batch in enumerate(datasets.logging.tqdm(tokenized_batches, desc="Computing surprisals", unit="batch")):
             batch = batch.to(device)
             with torch.no_grad():
                 # logits are B * T * V
@@ -219,22 +230,29 @@ class SyntaxGym(evaluate.EvaluationModule):
 
             surprisals[i * batch_size : (i + 1) * batch_size] = b_surprisals_gt
 
-        # Reshape to intuitive axes n_items * n_conditions * ...
-        surprisals = surprisals.reshape((len(dataset), len(condition_order), -1))
-        offset_mapping = all_tokenized_sentences["offset_mapping"] \
-            .reshape((len(dataset), len(condition_order), -1, 2))
-
-        # Now evaluate per-item.
+        # Aggregate results within-suite
         results = {}
-
-
-
-
-
-
-
-
-
+        all_sentence_suites = np.array(all_sentence_suites)
+        all_item_suites = np.array(all_item_suites)
+        for suite, condition_order in datasets.logging.tqdm(condition_orders.items(), unit="suite"):
+            suite_sentence_idxs = np.where(all_sentence_suites == suite)[0]
+            suite_item_idxs = np.where(all_item_suites == suite)[0]
+            suite_surprisals = surprisals[suite_sentence_idxs]
+
+            # Reshape to intuitive axes n_items * n_conditions * ...
+            suite_surprisals = suite_surprisals.reshape((len(suite_item_idxs), len(condition_order), -1))
+            suite_offset_mapping = all_tokenized_sentences["offset_mapping"][suite_sentence_idxs] \
+                .reshape((len(suite_item_idxs), len(condition_order), -1, 2))
+
+            # Evaluate per-item
+            suite_result = SyntaxGymMetricSuiteResult(suite, [], [])
+            suite_items = datasets.logging.tqdm([dataset[idx] for idx in suite_item_idxs], unit="item")
+            for item, item_surprisals, item_offset_mapping in zip(suite_items, suite_surprisals, suite_offset_mapping):
+                result_i = self._compute_item(item, item_surprisals, item_offset_mapping, condition_order)
+                suite_result.prediction_results.append(result_i["prediction_results"])
+                suite_result.region_totals.append(result_i["region_totals"])
+
+            results[suite] = suite_result
 
         return results
 
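The heart of the refactor is the suite bookkeeping above: every flattened sentence and every item records its originating suite, so after one batched surprisal pass over all sentences, np.where recovers each suite's rows and a reshape restores the intuitive items × conditions axes. A self-contained sketch of that regrouping with toy shapes (names and numbers here are illustrative, not taken from the metric):

    import numpy as np
    import torch

    # Toy flattened batch: suite "A" has 2 items, suite "B" has 1 item,
    # each item contributing 2 conditions, flattened item-major.
    all_sentence_suites = np.array(["A", "A", "A", "A", "B", "B"])
    n_conditions = 2
    surprisals = torch.arange(6 * 3, dtype=torch.float).reshape(6, 3)  # (n_sentences, n_timesteps)

    for suite in ("A", "B"):
        idxs = np.where(all_sentence_suites == suite)[0]   # rows belonging to this suite
        suite_surprisals = surprisals[idxs]                # flat: (n_items * n_conditions, T)
        n_items = len(idxs) // n_conditions
        # Same reshape as in the diff: n_items * n_conditions * T.
        suite_surprisals = suite_surprisals.reshape(n_items, n_conditions, -1)
        print(suite, tuple(suite_surprisals.shape))        # A (2, 2, 3), then B (1, 2, 3)

This relies on the flattening loop emitting each suite's sentences in item-major, condition-minor order, which the fixed per-suite condition_orders mapping guarantees.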
test/test_syntaxgym.py CHANGED
@@ -513,4 +513,21 @@ def test_gpt_subordination_region_totals(syntaxgym_metric):
                                      for region_totals_i in GPT2_SUBORDINATION_SRC_REFERENCE])
     pprint(sorted(zip(keys, np.abs(result_ndarray - reference_ndarray)),
                   key=lambda x: -x[1]))
-    np.testing.assert_allclose(result_ndarray, reference_ndarray, atol=1e-3)
+    np.testing.assert_allclose(result_ndarray, reference_ndarray, atol=1e-3)
+
+
+def test_evaluation_all_vs_single(syntaxgym_metric):
+    """
+    Check that a suite's performance is the same when evaluated in the composite
+    benchmark vs. evaluated independently.
+    """
+
+    suite_name = "number_prep"
+    full_dataset = datasets.load_dataset("cpllab/syntaxgym")
+    sub_dataset = datasets.load_dataset("cpllab/syntaxgym", suite_name)
+    model_id = "hf-internal-testing/tiny-xlm-roberta"
+
+    full_result = syntaxgym_metric.compute(dataset=full_dataset["test"], model_id=model_id)
+    sub_result = syntaxgym_metric.compute(dataset=sub_dataset["test"], model_id=model_id)
+
+    assert full_result[suite_name].prediction_results == sub_result[suite_name].prediction_results
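The new test indexes the result by suite name and compares the prediction_results attribute, matching the SyntaxGymMetricSuiteResult(suite, [], []) constructor calls in the diff above. The container's actual definition lives elsewhere in syntaxgym.py and may differ; a plausible shape, for orientation only:

    from dataclasses import dataclass
    from typing import Any, Dict, List

    @dataclass
    class SyntaxGymMetricSuiteResult:
        """Hypothetical sketch: per-suite output, one list entry per item."""
        suite_name: str
        prediction_results: List[Any]
        region_totals: List[Dict[Any, float]]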