Spaces:

cpllab
/

syntaxgym

Sleeping

App Files Files Community

jgauthier committed on Jun 9, 2022

Commit

bcb8ccf

1 Parent(s): 9a11e1b

move metric and tests from dataset repo

Browse files

Files changed (4) hide show

.gitignore +1 -0
prediction.py +235 -0
syntaxgym.py +225 -52
test.py +516 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

prediction.py ADDED Viewed

	@@ -0,0 +1,235 @@

+from typing import Union, Optional as TOptional, List as TList
+from pyparsing import *
+import numpy as np
+# Aggregation functions, keyed by the metric names that ``Prediction`` accepts.
+METRICS = {
+    'sum': sum,
+    'mean': np.mean,
+    'median': np.median,
+    'range': np.ptp,
+    'max': max,
+    'min': min
+}
+# Enable parser packrat (caching)
+ParserElement.enablePackrat()
+# Relative and absolute tolerance thresholds for surprisal equality
+# (passed to ``np.isclose`` by the ``=`` comparator).
+EQUALITY_RTOL = 1e-5
+EQUALITY_ATOL = 1e-3
+#######
+# Define a grammar for prediction formulae.
+# Parentheses are matched but suppressed from the parse results.
+lpar = Suppress("(")
+rpar = Suppress(")")
+# References a surprisal region, written ``(N;%condition%)`` where ``N`` is a
+# region number or ``*``. Only the number/wildcard and the condition name
+# survive as tokens; the delimiters are suppressed.
+region = lpar + (Word(nums) | "*") + Suppress(";%") + Word(alphanums + "_-") + Suppress("%") + rpar
+# Numeric literal, wrapped in ``LiteralFloat`` once the atom is defined.
+literal_float = pyparsing_common.number
+class Region(object):
+    """
+    Formula atom referencing the surprisal of a region within a condition.
+    Calling it with a ``{(condition_name, region_number): value}`` dict
+    returns that region's value, or the sum over the whole condition when
+    the region number is the wildcard ``*``.
+    """
+    def __init__(self, tokens):
+        # tokens: [region number (string) or "*", condition name]
+        self.region_number, self.condition_name = tokens[0], tokens[1]
+    def __str__(self):
+        return f"({self.region_number};%{self.condition_name}%)"
+    def __repr__(self):
+        return f"Region({self.condition_name},{self.region_number})"
+    def __call__(self, surprisal_dict):
+        if self.region_number != "*":
+            return surprisal_dict[self.condition_name, int(self.region_number)]
+        # Wildcard: accumulate every region belonging to this condition.
+        total = 0
+        for (condition, _region), value in surprisal_dict.items():
+            if condition == self.condition_name:
+                total += value
+        return total
+class LiteralFloat(object):
+    """Formula atom holding a constant numeric value."""
+    def __init__(self, tokens):
+        literal = tokens[0]
+        self.value = float(literal)
+    def __str__(self):
+        return f"{self.value:f}"
+    def __repr__(self):
+        return f"LiteralFloat({self.value:f})"
+    def __call__(self, surprisal_dict):
+        # A constant evaluates to itself regardless of the surprisal table.
+        return self.value
+class BinaryOp(object):
+    """
+    Base class for formula nodes joining two operands with an operator.
+    Subclasses list the symbols they accept in ``operators`` and implement
+    ``_evaluate``.
+    """
+    # Accepted operator symbols; ``None`` disables validation. The explicit
+    # default keeps the ``is not None`` check in ``__init__`` from raising
+    # AttributeError on classes that do not override it.
+    operators: TOptional[TList[str]] = None
+    def __init__(self, tokens):
+        """
+        Args:
+            tokens: Parse results whose first element is the grouped triple
+                ``[left_operand, operator, right_operand]``.
+        Raises:
+            ValueError: If the operator is not listed in ``operators``.
+        """
+        self.operator = tokens[0][1]
+        if self.operators is not None and self.operator not in self.operators:
+            raise ValueError("Invalid %s operator %s" % (self.__class__.__name__,
+                                                            self.operator))
+        self.operands = [tokens[0][0], tokens[0][2]]
+    def __str__(self):
+        return "(%s %s %s)" % (self.operands[0], self.operator, self.operands[1])
+    def __repr__(self):
+        return "%s(%s)(%s)" % (self.__class__.__name__, self.operator, ",".join(map(repr, self.operands)))
+    def __call__(self, surprisal_dict):
+        # Evaluate both operands first, then let the subclass combine them.
+        op_vals = [op(surprisal_dict) for op in self.operands]
+        return self._evaluate(op_vals, surprisal_dict)
+    def _evaluate(self, evaluated_operands, surprisal_dict):
+        """Combine the two evaluated operands; implemented by subclasses."""
+        raise NotImplementedError()
+class BoolOp(BinaryOp):
+    """Boolean connective node: conjunction ``&`` or disjunction ``|``."""
+    operators = ["&", "|"]
+    def _evaluate(self, op_vals, surprisal_dict):
+        left, right = op_vals[0], op_vals[1]
+        if self.operator == "&":
+            return left and right
+        if self.operator == "|":
+            return left or right
+class FloatOp(BinaryOp):
+    """Arithmetic node: subtraction ``-`` or addition ``+`` of two values."""
+    operators = ["-", "+"]
+    def _evaluate(self, op_vals, surprisal_dict):
+        left, right = op_vals[0], op_vals[1]
+        if self.operator == "-":
+            return left - right
+        if self.operator == "+":
+            return left + right
+class ComparatorOp(BinaryOp):
+    """Comparison node: ``<``, ``>``, or tolerance-based equality ``=``."""
+    operators = ["<", ">", "="]
+    def _evaluate(self, op_vals, surprisal_dict):
+        left, right = op_vals[0], op_vals[1]
+        if self.operator == "=":
+            # Equality is approximate, using the module-level tolerances.
+            return np.isclose(left, right,
+                              rtol=EQUALITY_RTOL, atol=EQUALITY_ATOL)
+        if self.operator == "<":
+            return left < right
+        if self.operator == ">":
+            return left > right
+def Chain(op_cls, left_assoc=True):
+    """
+    Build a parse action that folds a flat ``a op b op c ...`` token group
+    into nested ``op_cls`` nodes. Only left-associative folding is
+    implemented; the returned action raises NotImplementedError otherwise.
+    """
+    def chainer(tokens):
+        """
+        Create a binary tree of BinaryOps from the given repeated application
+        of the op.
+        """
+        flat = tokens[0]
+        operators = flat[1::2]
+        args = flat[0::2]
+        if not left_assoc:
+            raise NotImplementedError
+        # Fold from the left: ((a op b) op c) ...
+        tree = args[0]
+        for position, right in enumerate(args[1:]):
+            tree = op_cls([[tree, operators[position], right]])
+        return tree
+    return chainer
+# An atom is a region reference or a numeric literal; the parse actions wrap
+# the matched tokens in the corresponding node class.
+atom = region.setParseAction(Region) | literal_float.setParseAction(LiteralFloat)
+# Full formula grammar. Operator levels are listed from tightest-binding to
+# loosest-binding (pyparsing ``infixNotation`` convention): arithmetic, then
+# comparison, then boolean connectives.
+prediction_expr = infixNotation(
+    atom,
+    [
+        (oneOf("- +"), 2, opAssoc.LEFT, Chain(FloatOp)),
+        # NOTE(review): comparators are not wrapped in ``Chain``, and
+        # ``BinaryOp.__init__`` reads only the first three tokens of a group,
+        # so a chained comparison such as ``a < b < c`` presumably drops the
+        # trailing part -- confirm whether that is intended.
+        (oneOf("< > ="), 2, opAssoc.LEFT, ComparatorOp),
+        (oneOf("& |"), 2, opAssoc.LEFT, Chain(BoolOp)),
+    ],
+    lpar=lpar, rpar=rpar
+)
+class Prediction(object):
+    """
+    Predictions state expected relations between language model surprisal
+    measures in different regions and conditions of a test suite. For more
+    information, see :ref:`architecture`.
+    """
+    def __init__(self, idx: int, formula: Union[str, BinaryOp], metric: str):
+        """
+        Args:
+            idx: A unique prediction ID. This is only relevant for
+                serialization.
+            formula: A string representation of the prediction formula, or an
+                already parsed formula. For more information, see
+                :ref:`architecture`.
+            metric: Metric for aggregating surprisals within regions.
+        Raises:
+            ValueError: If ``formula`` is a string that fails to parse, or if
+                ``metric`` is not one of the keys of ``METRICS``.
+        """
+        if isinstance(formula, str):
+            try:
+                formula = prediction_expr.parseString(formula, parseAll=True)[0]
+            except ParseException as e:
+                raise ValueError("Invalid formula expression %r" % (formula,)) from e
+        self.idx = idx
+        self.formula = formula
+        if metric not in METRICS:
+            raise ValueError("Unknown metric %s. Supported metrics: %s" %
+                             (metric, " ".join(METRICS)))
+        self.metric = metric
+    def __call__(self, item):
+        """
+        Evaluate the prediction on the given item dict representation. For more
+        information on item representations, see :ref:`suite_json`.
+        """
+        # Prepare relevant surprisal dict
+        surps = {(c["condition_name"], r["region_number"]): r["metric_value"][self.metric]
+                 for c in item["conditions"]
+                 for r in c["regions"]}
+        return self.formula(surps)
+    @classmethod
+    def from_dict(cls, pred_dict, idx: int, metric: str):
+        """
+        Parse from a prediction dictionary representation (see
+        :ref:`suite_json`).
+        Raises:
+            ValueError: If ``pred_dict["type"]`` is not ``"formula"``.
+        """
+        if pred_dict["type"] != "formula":
+            raise ValueError("Unknown prediction type %s" % (pred_dict["type"],))
+        return cls(formula=pred_dict["formula"], idx=idx, metric=metric)
+    @property
+    def referenced_regions(self):
+        """
+        Get a set of the regions referenced by this formula.
+        Each item is a tuple of the form ``(condition_name, region_number)``.
+        A wildcard reference is reported with ``"*"`` as its region number.
+        """
+        def traverse(x, acc):
+            if isinstance(x, BinaryOp):
+                for val in x.operands:
+                    traverse(val, acc)
+            elif isinstance(x, Region):
+                # ``int("*")`` would raise ValueError, so wildcard region
+                # references keep the literal "*" instead of an integer.
+                number = x.region_number
+                acc.add((x.condition_name,
+                         number if number == "*" else int(number)))
+            return acc
+        return traverse(self.formula, set())
+    def as_dict(self):
+        """
+        Serialize as a prediction dictionary representation (see
+        :ref:`suite_json`).
+        """
+        return dict(type="formula", formula=str(self.formula))
+    def __str__(self):
+        return "Prediction(%s)" % (self.formula,)
+    __repr__ = __str__
+    def __hash__(self):
+        # The formula node classes define no __hash__/__eq__, so hashing the
+        # node itself is identity-based. Hash the canonical string form so
+        # that separately parsed copies of one formula agree.
+        return hash(str(self.formula))
+    def __eq__(self, other):
+        # Compare the canonical string forms directly rather than their
+        # hashes, which can collide.
+        return (isinstance(other, Prediction)
+                and str(self.formula) == str(other.formula))

syntaxgym.py CHANGED Viewed

@@ -13,83 +13,256 @@
 # limitations under the License.
 """TODO: Add a description here."""
-import evaluate
 import datasets
-# TODO: Add BibTeX citation
 _CITATION = """\
-@InProceedings{huggingface:module,
-title = {A great new module},
-authors={huggingface, Inc.},
-year={2020}
 }
 """
 # TODO: Add description of the module here
-_DESCRIPTION = """\
-This new module is designed to solve this great ML task and is crafted with a lot of care.
 """
 # TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
-Calculates how good are predictions given some references, using certain scores
 Args:
-    predictions: list of predictions to score. Each predictions
-        should be a string with tokens separated by spaces.
-    references: list of reference for each prediction. Each
-        reference should be a string with tokens separated by spaces.
 Returns:
-    accuracy: description of the first score,
-    another_score: description of the second score,
 Examples:
-    Examples should be written in doctest format, and should illustrate how
-    to use the function.
-    >>> my_new_module = evaluate.load("my_new_module")
-    >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
-    >>> print(results)
-    {'accuracy': 1.0}
 """
-# TODO: Define external resources urls if needed
-BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
-class {{ cookiecutter.module_class_name }}(evaluate.EvaluationModule):
-    """TODO: Short description of my evaluation module."""
     def _info(self):
-        # TODO: Specifies the evaluate.EvaluationModuleInfo object
         return evaluate.EvaluationModuleInfo(
-            # This is the description that will appear on the modules page.
-            module_type="{{ cookiecutter.module_type }}",
-            description=_DESCRIPTION,
             citation=_CITATION,
-            inputs_description=_KWARGS_DESCRIPTION,
-            # This defines the format of each prediction and reference
-            features=datasets.Features({
-                'predictions': datasets.Value('int64'),
-                'references': datasets.Value('int64'),
-            }),
-            # Homepage of the module for documentation
-            homepage="http://module.homepage",
-            # Additional links to the codebase or references
-            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-            reference_urls=["http://path.to.reference.url/new_module"]
         )
-    def _download_and_prepare(self, dl_manager):
-        """Optional: download external resources useful to compute the scores"""
-        # TODO: Download external resources if needed
-        pass
-    def _compute(self, predictions, references):
-        """Returns the scores"""
-        # TODO: Compute the different scores of the module
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
-        return {
-            "accuracy": accuracy,
-        }

 # limitations under the License.
 """TODO: Add a description here."""
+from collections import defaultdict
+from typing import List, Dict, Tuple
+from typing_extensions import TypedDict
 import datasets
+import evaluate
+import numpy as np
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from .prediction import Prediction
 _CITATION = """\
+@inproceedings{Hu:et-al:2020,
+  author = {Hu, Jennifer and Gauthier, Jon and Qian, Peng and Wilcox, Ethan and Levy, Roger},
+  title = {A systematic assessment of syntactic generalization in neural language models},
+  booktitle = {Proceedings of the Association of Computational Linguistics},
+  year = {2020}
 }
 """
 # TODO: Add description of the module here
+_DESCRIPTION = """
 """
 # TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
+Runs SyntaxGym evaluations on the given model and test suite.
 Args:
+    suite (Dataset): SyntaxGym test suite loaded as a Dataset.
+    model_id (str): model used for calculating surprisals
+            NOTE: The SyntaxGym evaluations are only well-defined for causal language models.
+                    This includes models such as gpt2, causal variations of bert,
+                    causal versions of t5, and more (the full list can be found
+                    in the AutoModelForCausalLM documentation here:
+                    https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForCausalLM )
 Returns:
+    prediction_results: A list of prediction results per item. A list of lists,
+            one per item, containing the boolean prediction result for each
+            prediction in the test suite,
+    region_totals: A list of total surprisals for each region (nested within
+            condition and item). A list of dictionaries (one per item), each
+            mapping tuples (condition_name, region_number) to a float
+            total surprisal value (i.e. negative log-2 probability).
 Examples:
+    TODO
+    >>> my_new_module = evaluate.load("cpllab/syntaxgym")
+    >>> ...
 """
+# Feature schema of a single condition of a test-suite item.
+# NOTE: the code in this file reads conditions and regions column-wise
+# (e.g. ``item["conditions"]["content"]`` and ``regions["content"]``), i.e. as
+# dicts of lists rather than lists of dicts.
+SUITE_DATASET_CONDITION_SPEC = {
+    "condition_name": datasets.Value("string"),
+    "content": datasets.Value("string"),
+    "regions": datasets.Sequence({
+        "region_number": datasets.Value("int32"),
+        "content": datasets.Value("string")
+    })
+}
+# Feature schema of a whole test-suite item: its conditions plus the
+# prediction formulae (as strings) to evaluate on it.
+SUITE_DATASET_SPEC = {
+    "item_number": datasets.Value("int32"),
+    "conditions": datasets.Sequence(SUITE_DATASET_CONDITION_SPEC),
+    "predictions": datasets.Sequence(datasets.Value("string")),
+}
+class SyntaxGymMetricResult(TypedDict):
+    """Return type of ``SyntaxGym._compute``; both lists hold one entry per item."""
+    # For each item, the outcome of each of its prediction formulae.
+    prediction_results: List[List[bool]]
+    # For each item, a mapping from (condition_name, region_number) to the
+    # total surprisal of that region.
+    region_totals: List[Dict[Tuple[str, int], float]]
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class SyntaxGym(evaluate.EvaluationModule):
+    """
+    Defines SyntaxGym evaluation logic for causal language models.
+    """
     def _info(self):
+        """Describe the module and its expected ``suite`` input features."""
+        features = datasets.Features({
+            "suite": SUITE_DATASET_SPEC
+        })
         return evaluate.EvaluationModuleInfo(
+            module_type="metric",
+            description="TODO",
             citation=_CITATION,
+            inputs_description="TODO",
+            features=features,
+            homepage="https://syntaxgym.org",
+            codebase_urls=["https://github.com/cpllab/syntaxgym-core"],
         )
+    def _compute(self, suite, model_id, device=None) -> SyntaxGymMetricResult:
+        """
+        Evaluate every item of ``suite`` with the causal language model
+        ``model_id``.
+        Args:
+            suite: Iterable of test-suite items (see ``SUITE_DATASET_SPEC``).
+            model_id: Identifier passed to ``from_pretrained`` for both the
+                model and its tokenizer.
+            device: One of ``"gpu"``, ``"cuda"`` or ``"cpu"``; ``"gpu"`` is an
+                alias for ``"cuda"``. When omitted, CUDA is used if available.
+        Returns:
+            Dict with ``prediction_results`` and ``region_totals``, each a
+            list with one entry per item.
+        Raises:
+            ValueError: If ``device`` is not one of the supported names.
+        """
+        if device is None:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+        elif device not in ("gpu", "cpu", "cuda"):
+            # Raise explicitly: an ``assert`` is stripped under ``python -O``.
+            raise ValueError(
+                "Unsupported device %r; expected one of: gpu, cpu, cuda"
+                % (device,))
+        elif device == "gpu":
+            device = "cuda"
+        model = AutoModelForCausalLM.from_pretrained(model_id)
+        model = model.to(device)
+        model.eval()
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        # TODO copy from perplexity metric
+        # Padding is requested when tokenizing, so a pad token must be set.
+        tokenizer.pad_token = tokenizer.eos_token
+        results = {"prediction_results": [], "region_totals": []}
+        # TODO batch all items together
+        for item in datasets.logging.tqdm(suite):
+            result_single = self._compute_single(item, tokenizer, model, device)
+            for k in ["prediction_results", "region_totals"]:
+                results[k].append(result_single[k])
+        return results
+    def _compute_single(self, item, tokenizer, model, device):
+        """
+        Compute region-level surprisal totals and prediction outcomes for a
+        single test-suite item.
+        Args:
+            item: Suite item; ``item["conditions"]`` is read column-wise
+                (``content``, ``condition_name``, ``regions``) and
+                ``item["predictions"]`` holds the formula strings.
+            tokenizer: Tokenizer called on the batch of condition sentences.
+            model: Causal language model whose output exposes ``.logits``.
+            device: Device the tokenized batch is moved to.
+        Returns:
+            Dict with ``prediction_results`` (one value per prediction
+            formula) and ``region_totals`` (mapping
+            ``(condition_name, region_number)`` to total surprisal in bits).
+        """
+        tokenized = tokenizer(item["conditions"]["content"],
+                              padding=True,
+                              return_tensors="pt",
+                              return_offsets_mapping=True).to(device)
+        # input_ids: B * T
+        input_ids = tokenized["input_ids"]
+        assert input_ids.ndim == 2
+        # Compute sentence level surprisals.
+        with torch.no_grad():
+            # Pre-softmax predictive distribution B * T * V
+            logits = model(input_ids).logits
+            # Natural-log probabilities converted to base 2 (bits).
+            surprisals = -logits.log_softmax(dim=2) / np.log(2)
+        # surprisals: B * T * V
+        assert surprisals.ndim == 3
+        # Get surprisals of expected words: the distribution at position t
+        # scores the token at position t + 1.
+        surps_shifted = surprisals[:, :-1, :]
+        expected_ids = input_ids[:, 1:]
+        # reindexed surprisals: B * (T - 1)
+        surprisals = torch.gather(surps_shifted, 2, expected_ids.unsqueeze(2)) \
+            .squeeze(2)
+        #### aggregate
+        condition_names = item["conditions"]["condition_name"]
+        region_totals = {condition_name: defaultdict(float)
+                         for condition_name in condition_names}
+        region2tokens = self.compute_region_token_mapping(
+            item, input_ids, tokenized["offset_mapping"])
+        # Row i of the batch corresponds to the i-th condition.
+        for i, i_cond in enumerate(condition_names):
+            for region_number, region_tokens in region2tokens[i_cond].items():
+                for token in region_tokens:
+                    if token == 0:
+                        # surprisal not defined. pass.
+                        continue
+                    # Column t - 1 holds the surprisal of token t. Token
+                    # positions run up to T - 1 == surprisals.shape[1], so
+                    # the index is always in range.
+                    region_totals[i_cond][region_number] += surprisals[i, token - 1]
+        region_totals = {(condition_name, region_number): float(total)
+                         for condition_name, totals in region_totals.items()
+                         for region_number, total in totals.items()}
+        results = {
+            "prediction_results": [
+                Prediction(i, formula, "sum").formula(region_totals)
+                for i, formula in enumerate(item["predictions"])
+            ],
+            "region_totals": region_totals
+        }
+        return results
+    def get_region_edges(self, item, condition_idx):
+        """
+        Return the character index of the left edge of every region of the
+        condition at ``condition_idx``, in region order.
+        """
+        # NB this is coupled with `condition_to_string` logic of course
+        regions = item["conditions"]["regions"][condition_idx]
+        edges = []
+        cursor = 0
+        for position, text in enumerate(regions["content"]):
+            edges.append(cursor)
+            # A joining space precedes every non-blank region except the
+            # first one and those starting with a comma.
+            needs_space = (position != 0
+                           and text.strip() != ""
+                           and not text.startswith(","))
+            cursor += len(text) + (1 if needs_space else 0)
+        return edges
+    def compute_region_token_mapping(self, item, input_ids: torch.LongTensor,
+                                     offset_mapping: List[Tuple[int, int]]
+                                     ) -> Dict[str, Dict[int, List[int]]]:
+        """
+        Assign each token position of each condition to a region.
+        Returns a dict mapping condition name to a dict from 1-based region
+        number to the list of token positions (indices along T) whose start
+        offset falls inside that region.
+        """
+        # input_ids: B * T
+        # offset_mapping: B * T * 2
+        # NOTE(review): the annotation says List[Tuple[int, int]], but the
+        # shape comment above suggests a batched tensor -- confirm.
+        # assumes batch is sorted according to item's condition_name order
+        condition_names = item["conditions"]["condition_name"]
+        region2tokens = {cond: defaultdict(list) for cond in condition_names}
+        # Sentinel right edge for the last region, so it absorbs all remaining tokens.
+        max_long = torch.iinfo(torch.int64).max
+        input_ids = input_ids.detach()
+        for i_cond, (i_tokens, i_offsets) in enumerate(zip(input_ids, offset_mapping)):
+            region_edges = self.get_region_edges(item, i_cond)
+            # Two cursors advance monotonically: one over tokens, one over regions.
+            t_cursor, r_cursor = 0, 0
+            while t_cursor < i_tokens.shape[0]:
+                # token = i_tokens[t_cursor]
+                token_char_start, token_char_end = i_offsets[t_cursor]
+                if token_char_start == token_char_end == 0:
+                    # This is a padding token. Skip.
+                    # TODO what about BOS/EOS? some models incorporate them
+                    t_cursor += 1
+                    continue
+                # NOTE(review): region_start is never used below.
+                region_start = region_edges[r_cursor]
+                region_end = region_edges[r_cursor + 1] \
+                    if r_cursor + 1 < len(region_edges) else max_long
+                # NB region boundaries are left edges, hence the >= here.
+                if token_char_start >= region_end:
+                    # Token starts past this region: move to the next region
+                    # and re-examine the same token.
+                    r_cursor += 1
+                    continue
+                # Region numbers are 1-based, hence r_cursor + 1.
+                region2tokens[condition_names[i_cond]][r_cursor + 1].append(t_cursor)
+                t_cursor += 1
+        return region2tokens

test.py ADDED Viewed

	@@ -0,0 +1,516 @@

+from typing import List
+import datasets
+import evaluate
+import numpy as np
+import pytest
+@pytest.fixture(scope="session")
+def syntaxgym_dataset():
+    """Load the ``subordination_src-src`` SyntaxGym suite once per test session."""
+    suite = datasets.load_dataset("syntaxgym", "subordination_src-src")
+    return suite
+@pytest.fixture(scope="session")
+def syntaxgym_metric():
+    """Load the metric module from the local ``syntaxgym.py`` once per test session."""
+    metric = evaluate.load("./syntaxgym.py")
+    return metric
+@pytest.fixture(scope="session")
+def model_ref():
+    """Name of the model the tests are run against."""
+    # Alternative kept for reference: "hf-internal-testing/tiny-random-gpt_neo"
+    model_name = "gpt2"
+    return model_name
+# Reference region surprisals computed with syntaxgym-core.
+# See notebook in https://colab.research.google.com/drive/1qziyPcu65jffizSPi-ZGHKR0x7BaHFMS#scrollTo=RgtnScy6LLKi .
+GPT2_SUBORDINATION_SRC_REFERENCE = \
+[{('no-sub_matrix', 1): 13.151199615123803,
+  ('no-sub_matrix', 2): 38.503222716703526,
+  ('no-sub_matrix', 3): 27.623861034812286,
+  ('no-sub_matrix', 4): 48.831672846038224,
+  ('no-sub_matrix', 5): 38.08533699286694,
+  ('no-sub_no-matrix', 1): 13.151199615123803,
+  ('no-sub_no-matrix', 2): 38.503222716703526,
+  ('no-sub_no-matrix', 3): 27.623861034812286,
+  ('no-sub_no-matrix', 4): 48.831687980511504,
+  ('no-sub_no-matrix', 5): 1.8096143510772873,
+  ('sub_matrix', 1): 14.905592916748805,
+  ('sub_matrix', 2): 39.06304309956175,
+  ('sub_matrix', 3): 26.862648365854433,
+  ('sub_matrix', 4): 50.56554401687938,
+  ('sub_matrix', 5): 26.532245572980194,
+  ('sub_no-matrix', 1): 14.905592916748805,
+  ('sub_no-matrix', 2): 39.06304309956175,
+  ('sub_no-matrix', 3): 26.862648365854433,
+  ('sub_no-matrix', 4): 50.56553438585093,
+  ('sub_no-matrix', 5): 7.470089829866611},
+ {('no-sub_matrix', 1): 10.116093820255577,
+  ('no-sub_matrix', 2): 20.96513246705127,
+  ('no-sub_matrix', 3): 20.02959138986416,
+  ('no-sub_matrix', 4): 23.779661397107446,
+  ('no-sub_matrix', 5): 33.2560281692696,
+  ('no-sub_no-matrix', 1): 10.116093820255577,
+  ('no-sub_no-matrix', 2): 20.96513246705127,
+  ('no-sub_no-matrix', 3): 20.02959138986416,
+  ('no-sub_no-matrix', 4): 23.779661397107446,
+  ('no-sub_no-matrix', 5): 1.9449125865631063,
+  ('sub_matrix', 1): 13.545157521732826,
+  ('sub_matrix', 2): 24.96048395897244,
+  ('sub_matrix', 3): 18.609464944317324,
+  ('sub_matrix', 4): 23.057566440062317,
+  ('sub_matrix', 5): 26.424454285669032,
+  ('sub_no-matrix', 1): 13.545157521732826,
+  ('sub_no-matrix', 2): 24.96048395897244,
+  ('sub_no-matrix', 3): 18.609464944317324,
+  ('sub_no-matrix', 4): 23.057566440062317,
+  ('sub_no-matrix', 5): 2.807467838359704},
+ {('no-sub_matrix', 1): 11.992867568477442,
+  ('no-sub_matrix', 2): 45.813114232935774,
+  ('no-sub_matrix', 3): 24.57554828372551,
+  ('no-sub_matrix', 4): 45.334025774062916,
+  ('no-sub_matrix', 5): 26.208189541862073,
+  ('no-sub_no-matrix', 1): 11.992867568477442,
+  ('no-sub_no-matrix', 2): 45.813114232935774,
+  ('no-sub_no-matrix', 3): 24.57554828372551,
+  ('no-sub_no-matrix', 4): 45.33402766587207,
+  ('no-sub_no-matrix', 5): 1.8284485151385752,
+  ('sub_matrix', 1): 14.219887768799735,
+  ('sub_matrix', 2): 46.25055434117979,
+  ('sub_matrix', 3): 23.054221678472672,
+  ('sub_matrix', 4): 47.08503858470256,
+  ('sub_matrix', 5): 22.154772321452022,
+  ('sub_no-matrix', 1): 14.219887768799735,
+  ('sub_no-matrix', 2): 46.25055434117979,
+  ('sub_no-matrix', 3): 23.054221678472672,
+  ('sub_no-matrix', 4): 47.08503858470256,
+  ('sub_no-matrix', 5): 3.0655133594366757},
+ {('no-sub_matrix', 1): 10.55002943802296,
+  ('no-sub_matrix', 2): 52.419810137608856,
+  ('no-sub_matrix', 3): 23.30710475332303,
+  ('no-sub_matrix', 4): 37.957905964008944,
+  ('no-sub_matrix', 5): 29.259648135104936,
+  ('no-sub_no-matrix', 1): 10.55002943802296,
+  ('no-sub_no-matrix', 2): 52.419810137608856,
+  ('no-sub_no-matrix', 3): 23.30710475332303,
+  ('no-sub_no-matrix', 4): 37.957905964008944,
+  ('no-sub_no-matrix', 5): 1.9632913405649093,
+  ('sub_matrix', 1): 15.289384584900025,
+  ('sub_matrix', 2): 53.93652737134243,
+  ('sub_matrix', 3): 19.43915835312633,
+  ('sub_matrix', 4): 36.459591551099386,
+  ('sub_matrix', 5): 22.185742699245417,
+  ('sub_no-matrix', 1): 15.289384584900025,
+  ('sub_no-matrix', 2): 53.93652737134243,
+  ('sub_no-matrix', 3): 19.43915835312633,
+  ('sub_no-matrix', 4): 36.4595598203003,
+  ('sub_no-matrix', 5): 5.707732355645454},
+ {('no-sub_matrix', 1): 23.543723213902986,
+  ('no-sub_matrix', 2): 31.967972102825854,
+  ('no-sub_matrix', 3): 29.159572978411727,
+  ('no-sub_matrix', 4): 36.61365345925747,
+  ('no-sub_matrix', 5): 44.576591305970545,
+  ('no-sub_no-matrix', 1): 23.543723213902986,
+  ('no-sub_no-matrix', 2): 31.967972102825854,
+  ('no-sub_no-matrix', 3): 29.159572978411727,
+  ('no-sub_no-matrix', 4): 36.61365345925747,
+  ('no-sub_no-matrix', 5): 3.2813457388593714,
+  ('sub_matrix', 1): 27.118410129310597,
+  ('sub_matrix', 2): 33.909617362987866,
+  ('sub_matrix', 3): 28.791166362258743,
+  ('sub_matrix', 4): 37.24960609010374,
+  ('sub_matrix', 5): 31.660933798006262,
+  ('sub_no-matrix', 1): 27.118410129310597,
+  ('sub_no-matrix', 2): 33.909617362987866,
+  ('sub_no-matrix', 3): 28.791166362258743,
+  ('sub_no-matrix', 4): 37.24960609010374,
+  ('sub_no-matrix', 5): 7.3613541428239015},
+ {('no-sub_matrix', 1): 14.22171869610082,
+  ('no-sub_matrix', 2): 30.270423022911977,
+  ('no-sub_matrix', 3): 25.973276891204705,
+  ('no-sub_matrix', 4): 28.43856735947716,
+  ('no-sub_matrix', 5): 57.39887418731055,
+  ('no-sub_no-matrix', 1): 14.22171869610082,
+  ('no-sub_no-matrix', 2): 30.270423022911977,
+  ('no-sub_no-matrix', 3): 25.973276891204705,
+  ('no-sub_no-matrix', 4): 28.43856735947716,
+  ('no-sub_no-matrix', 5): 1.7127059109344136,
+  ('sub_matrix', 1): 16.39289784951447,
+  ('sub_matrix', 2): 31.5671111565765,
+  ('sub_matrix', 3): 24.54307828171008,
+  ('sub_matrix', 4): 29.249645624130757,
+  ('sub_matrix', 5): 53.59155769093577,
+  ('sub_no-matrix', 1): 16.39289784951447,
+  ('sub_no-matrix', 2): 31.5671111565765,
+  ('sub_no-matrix', 3): 24.54307828171008,
+  ('sub_no-matrix', 4): 29.249645624130757,
+  ('sub_no-matrix', 5): 7.225276653947023},
+ {('no-sub_matrix', 1): 13.729688714733188,
+  ('no-sub_matrix', 2): 36.018118127225165,
+  ('no-sub_matrix', 3): 28.232055923783275,
+  ('no-sub_matrix', 4): 44.44634394296659,
+  ('no-sub_matrix', 5): 38.277975147059344,
+  ('no-sub_no-matrix', 1): 13.729688714733188,
+  ('no-sub_no-matrix', 2): 36.018118127225165,
+  ('no-sub_no-matrix', 3): 28.232055923783275,
+  ('no-sub_no-matrix', 4): 44.44634394296659,
+  ('no-sub_no-matrix', 5): 3.0318996942908414,
+  ('sub_matrix', 1): 16.93528744674245,
+  ('sub_matrix', 2): 36.545024814326574,
+  ('sub_matrix', 3): 26.279603445823692,
+  ('sub_matrix', 4): 46.501226364074995,
+  ('sub_matrix', 5): 32.155418057793035,
+  ('sub_no-matrix', 1): 16.93528744674245,
+  ('sub_no-matrix', 2): 36.545024814326574,
+  ('sub_no-matrix', 3): 26.279603445823692,
+  ('sub_no-matrix', 4): 46.501226364074995,
+  ('sub_no-matrix', 5): 4.4581122618864155},
+ {('no-sub_matrix', 1): 15.598113737151568,
+  ('no-sub_matrix', 2): 56.12543415244172,
+  ('no-sub_matrix', 3): 29.755667770007285,
+  ('no-sub_matrix', 4): 51.689282097269995,
+  ('no-sub_matrix', 5): 45.575230324010775,
+  ('no-sub_no-matrix', 1): 15.598113737151568,
+  ('no-sub_no-matrix', 2): 56.12543415244172,
+  ('no-sub_no-matrix', 3): 29.755667770007285,
+  ('no-sub_no-matrix', 4): 51.68928424705313,
+  ('no-sub_no-matrix', 5): 1.235207173694806,
+  ('sub_matrix', 1): 18.909088991066888,
+  ('sub_matrix', 2): 57.753410746636746,
+  ('sub_matrix', 3): 28.677667873674363,
+  ('sub_matrix', 4): 51.99410775929489,
+  ('sub_matrix', 5): 35.754144966112236,
+  ('sub_no-matrix', 1): 18.909088991066888,
+  ('sub_no-matrix', 2): 57.753410746636746,
+  ('sub_no-matrix', 3): 28.677667873674363,
+  ('sub_no-matrix', 4): 51.9941480032352,
+  ('sub_no-matrix', 5): 5.033266273930268},
+ {('no-sub_matrix', 1): 14.859413855165633,
+  ('no-sub_matrix', 2): 34.54519231993284,
+  ('no-sub_matrix', 3): 24.26528519671309,
+  ('no-sub_matrix', 4): 35.42343514121054,
+  ('no-sub_matrix', 5): 55.85308623165151,
+  ('no-sub_no-matrix', 1): 14.859413855165633,
+  ('no-sub_no-matrix', 2): 34.54519231993284,
+  ('no-sub_no-matrix', 3): 24.26528519671309,
+  ('no-sub_no-matrix', 4): 35.42343514121054,
+  ('no-sub_no-matrix', 5): 2.3309861205259734,
+  ('sub_matrix', 1): 17.053809634549854,
+  ('sub_matrix', 2): 33.66637542056656,
+  ('sub_matrix', 3): 23.26181234829638,
+  ('sub_matrix', 4): 35.61438567264568,
+  ('sub_matrix', 5): 48.48551986050014,
+  ('sub_no-matrix', 1): 17.053809634549854,
+  ('sub_no-matrix', 2): 33.66637542056656,
+  ('sub_no-matrix', 3): 23.26181234829638,
+  ('sub_no-matrix', 4): 35.61438704850689,
+  ('sub_no-matrix', 5): 2.969309360231736},
+ {('no-sub_matrix', 1): 13.708973748402064,
+  ('no-sub_matrix', 2): 31.147590264691182,
+  ('no-sub_matrix', 3): 30.495597241955565,
+  ('no-sub_matrix', 4): 34.65164493728535,
+  ('no-sub_matrix', 5): 35.87510990950117,
+  ('no-sub_no-matrix', 1): 13.708973748402064,
+  ('no-sub_no-matrix', 2): 31.147590264691182,
+  ('no-sub_no-matrix', 3): 30.495597241955565,
+  ('no-sub_no-matrix', 4): 34.65164493728535,
+  ('no-sub_no-matrix', 5): 3.232032121481573,
+  ('sub_matrix', 1): 17.681722076468287,
+  ('sub_matrix', 2): 33.77225997922327,
+  ('sub_matrix', 3): 29.435808932487806,
+  ('sub_matrix', 4): 34.354368969668016,
+  ('sub_matrix', 5): 20.802733205442486,
+  ('sub_no-matrix', 1): 17.681722076468287,
+  ('sub_no-matrix', 2): 33.77225997922327,
+  ('sub_no-matrix', 3): 29.435808932487806,
+  ('sub_no-matrix', 4): 34.354368969668016,
+  ('sub_no-matrix', 5): 3.7902066303710424},
+ {('no-sub_matrix', 1): 15.72185319065555,
+  ('no-sub_matrix', 2): 45.25539814380218,
+  ('no-sub_matrix', 3): 24.94273362957689,
+  ('no-sub_matrix', 4): 40.81704901026569,
+  ('no-sub_matrix', 5): 42.898794519499596,
+  ('no-sub_no-matrix', 1): 15.72185319065555,
+  ('no-sub_no-matrix', 2): 45.25539814380218,
+  ('no-sub_no-matrix', 3): 24.94273362957689,
+  ('no-sub_no-matrix', 4): 40.81704901026569,
+  ('no-sub_no-matrix', 5): 2.6826901255924644,
+  ('sub_matrix', 1): 17.565795106862403,
+  ('sub_matrix', 2): 46.9371803702329,
+  ('sub_matrix', 3): 23.887805807796486,
+  ('sub_matrix', 4): 39.058599411828766,
+  ('sub_matrix', 5): 32.234453544910295,
+  ('sub_no-matrix', 1): 17.565795106862403,
+  ('sub_no-matrix', 2): 46.9371803702329,
+  ('sub_no-matrix', 3): 23.887805807796486,
+  ('sub_no-matrix', 4): 39.058599411828766,
+  ('sub_no-matrix', 5): 4.214674259243127},
+ {('no-sub_matrix', 1): 13.910878628792588,
+  ('no-sub_matrix', 2): 33.45626834359109,
+  ('no-sub_matrix', 3): 16.127584513594687,
+  ('no-sub_matrix', 4): 32.59623120264939,
+  ('no-sub_matrix', 5): 29.87568851789407,
+  ('no-sub_no-matrix', 1): 13.910878628792588,
+  ('no-sub_no-matrix', 2): 33.45626834359109,
+  ('no-sub_no-matrix', 3): 16.127584513594687,
+  ('no-sub_no-matrix', 4): 32.59623120264939,
+  ('no-sub_no-matrix', 5): 2.3891779982892625,
+  ('sub_matrix', 1): 17.18981661053988,
+  ('sub_matrix', 2): 36.38883326650068,
+  ('sub_matrix', 3): 13.081088737716442,
+  ('sub_matrix', 4): 33.419732612590224,
+  ('sub_matrix', 5): 22.665485632721676,
+  ('sub_no-matrix', 1): 17.18981661053988,
+  ('sub_no-matrix', 2): 36.38883326650068,
+  ('sub_no-matrix', 3): 13.081088737716442,
+  ('sub_no-matrix', 4): 33.419732612590224,
+  ('sub_no-matrix', 5): 6.155199912348024},
+ {('no-sub_matrix', 1): 18.196771699177763,
+  ('no-sub_matrix', 2): 35.624058750852136,
+  ('no-sub_matrix', 3): 23.746554392851053,
+  ('no-sub_matrix', 4): 29.44669921790574,
+  ('no-sub_matrix', 5): 39.72412918901379,
+  ('no-sub_no-matrix', 1): 18.196771699177763,
+  ('no-sub_no-matrix', 2): 35.624058750852136,
+  ('no-sub_no-matrix', 3): 23.746554392851053,
+  ('no-sub_no-matrix', 4): 29.44669921790574,
+  ('no-sub_no-matrix', 5): 2.870123353843486,
+  ('sub_matrix', 1): 20.38619930823735,
+  ('sub_matrix', 2): 36.29781144853154,
+  ('sub_matrix', 3): 22.13637404741934,
+  ('sub_matrix', 4): 29.68729899086184,
+  ('sub_matrix', 5): 36.993790238103884,
+  ('sub_no-matrix', 1): 20.38619930823735,
+  ('sub_no-matrix', 2): 36.29781144853154,
+  ('sub_no-matrix', 3): 22.13637404741934,
+  ('sub_no-matrix', 4): 29.68729899086184,
+  ('sub_no-matrix', 5): 7.650303570399713},
+ {('no-sub_matrix', 1): 11.992867568477442,
+  ('no-sub_matrix', 2): 26.44083030170154,
+  ('no-sub_matrix', 3): 27.574921221726136,
+  ('no-sub_matrix', 4): 28.94213565689118,
+  ('no-sub_matrix', 5): 46.973469397495556,
+  ('no-sub_no-matrix', 1): 11.992867568477442,
+  ('no-sub_no-matrix', 2): 26.44083030170154,
+  ('no-sub_no-matrix', 3): 27.574921221726136,
+  ('no-sub_no-matrix', 4): 28.94213565689118,
+  ('no-sub_no-matrix', 5): 3.354326576753004,
+  ('sub_matrix', 1): 14.434047100994839,
+  ('sub_matrix', 2): 26.76571524620116,
+  ('sub_matrix', 3): 25.83488399989926,
+  ('sub_matrix', 4): 30.263621195061678,
+  ('sub_matrix', 5): 36.822532494114455,
+  ('sub_no-matrix', 1): 14.434047100994839,
+  ('sub_no-matrix', 2): 26.76571524620116,
+  ('sub_no-matrix', 3): 25.83488399989926,
+  ('sub_no-matrix', 4): 30.263621195061678,
+  ('sub_no-matrix', 5): 6.748976893757906},
+ {('no-sub_matrix', 1): 16.27614914680276,
+  ('no-sub_matrix', 2): 41.35282905624703,
+  ('no-sub_matrix', 3): 25.173115913245226,
+  ('no-sub_matrix', 4): 52.876981987369014,
+  ('no-sub_matrix', 5): 49.49767321075167,
+  ('no-sub_no-matrix', 1): 16.27614914680276,
+  ('no-sub_no-matrix', 2): 41.35282905624703,
+  ('no-sub_no-matrix', 3): 25.173115913245226,
+  ('no-sub_no-matrix', 4): 52.876981987369014,
+  ('no-sub_no-matrix', 5): 1.5962803636236758,
+  ('sub_matrix', 1): 18.735912436641787,
+  ('sub_matrix', 2): 43.36213985849511,
+  ('sub_matrix', 3): 24.582800598631913,
+  ('sub_matrix', 4): 53.1616607417586,
+  ('sub_matrix', 5): 41.2664433745972,
+  ('sub_no-matrix', 1): 18.735912436641787,
+  ('sub_no-matrix', 2): 43.36213985849511,
+  ('sub_no-matrix', 3): 24.582800598631913,
+  ('sub_no-matrix', 4): 53.16165799003619,
+  ('sub_no-matrix', 5): 6.4917878462822305},
+ {('no-sub_matrix', 1): 14.036280122634507,
+  ('no-sub_matrix', 2): 53.72802368862095,
+  ('no-sub_matrix', 3): 18.940766131564004,
+  ('no-sub_matrix', 4): 40.74964840745327,
+  ('no-sub_matrix', 5): 39.57008490907742,
+  ('no-sub_no-matrix', 1): 14.036280122634507,
+  ('no-sub_no-matrix', 2): 53.72802368862095,
+  ('no-sub_no-matrix', 3): 18.940766131564004,
+  ('no-sub_no-matrix', 4): 40.74964840745327,
+  ('no-sub_no-matrix', 5): 2.1275557540222967,
+  ('sub_matrix', 1): 19.641722357026286,
+  ('sub_matrix', 2): 52.709120728751486,
+  ('sub_matrix', 3): 17.976257844509426,
+  ('sub_matrix', 4): 42.51851542500959,
+  ('sub_matrix', 5): 28.25018664655579,
+  ('sub_no-matrix', 1): 19.641722357026286,
+  ('sub_no-matrix', 2): 52.709120728751486,
+  ('sub_no-matrix', 3): 17.976257844509426,
+  ('sub_no-matrix', 4): 42.51851267328718,
+  ('sub_no-matrix', 5): 5.409622788119386},
+ {('no-sub_matrix', 1): 16.961927903326398,
+  ('no-sub_matrix', 2): 38.5455951142925,
+  ('no-sub_matrix', 3): 25.122316709729276,
+  ('no-sub_matrix', 4): 35.90131439006518,
+  ('no-sub_matrix', 5): 41.65886977570029,
+  ('no-sub_no-matrix', 1): 16.961927903326398,
+  ('no-sub_no-matrix', 2): 38.5455951142925,
+  ('no-sub_no-matrix', 3): 25.122316709729276,
+  ('no-sub_no-matrix', 4): 35.90131439006518,
+  ('no-sub_no-matrix', 5): 3.2679255886472447,
+  ('sub_matrix', 1): 20.247934372024154,
+  ('sub_matrix', 2): 40.408716019775625,
+  ('sub_matrix', 3): 23.782735071043668,
+  ('sub_matrix', 4): 37.00513584758997,
+  ('sub_matrix', 5): 29.22700479607527,
+  ('sub_no-matrix', 1): 20.247934372024154,
+  ('sub_no-matrix', 2): 40.408716019775625,
+  ('sub_no-matrix', 3): 23.782735071043668,
+  ('sub_no-matrix', 4): 37.00513584758997,
+  ('sub_no-matrix', 5): 4.780011845541033},
+ {('no-sub_matrix', 1): 12.109815771064152,
+  ('no-sub_matrix', 2): 38.32406752938649,
+  ('no-sub_matrix', 3): 25.987801084044044,
+  ('no-sub_matrix', 4): 40.40950903177875,
+  ('no-sub_matrix', 5): 52.86522525335603,
+  ('no-sub_no-matrix', 1): 12.109815771064152,
+  ('no-sub_no-matrix', 2): 38.32406752938649,
+  ('no-sub_no-matrix', 3): 25.987801084044044,
+  ('no-sub_no-matrix', 4): 40.40950903177875,
+  ('no-sub_no-matrix', 5): 3.61917194787979,
+  ('sub_matrix', 1): 15.130341564722832,
+  ('sub_matrix', 2): 37.89719334728088,
+  ('sub_matrix', 3): 24.65681032273433,
+  ('sub_matrix', 4): 40.731610867030774,
+  ('sub_matrix', 5): 37.566910985257906,
+  ('sub_no-matrix', 1): 15.130341564722832,
+  ('sub_no-matrix', 2): 37.89719334728088,
+  ('sub_no-matrix', 3): 24.65681032273433,
+  ('sub_no-matrix', 4): 40.731610867030774,
+  ('sub_no-matrix', 5): 9.39736249989602},
+ {('no-sub_matrix', 1): 16.25058564557851,
+  ('no-sub_matrix', 2): 37.20405682898803,
+  ('no-sub_matrix', 3): 30.5107090995129,
+  ('no-sub_matrix', 4): 44.537084655292894,
+  ('no-sub_matrix', 5): 46.50046620075818,
+  ('no-sub_no-matrix', 1): 16.25058564557851,
+  ('no-sub_no-matrix', 2): 37.20405682898803,
+  ('no-sub_no-matrix', 3): 30.5107090995129,
+  ('no-sub_no-matrix', 4): 44.537084655292894,
+  ('no-sub_no-matrix', 5): 1.8752506698658238,
+  ('sub_matrix', 1): 18.440281483079957,
+  ('sub_matrix', 2): 38.54769605435544,
+  ('sub_matrix', 3): 30.510800250317864,
+  ('sub_matrix', 4): 44.99740645329493,
+  ('sub_matrix', 5): 39.55738177603457,
+  ('sub_no-matrix', 1): 18.440281483079957,
+  ('sub_no-matrix', 2): 38.54769605435544,
+  ('sub_no-matrix', 3): 30.510800250317864,
+  ('sub_no-matrix', 4): 44.99740645329493,
+  ('sub_no-matrix', 5): 2.6233048602148386},
+ {('no-sub_matrix', 1): 16.324447378609865,
+  ('no-sub_matrix', 2): 30.87308462806543,
+  ('no-sub_matrix', 3): 22.765564836381643,
+  ('no-sub_matrix', 4): 38.337445027901204,
+  ('no-sub_matrix', 5): 40.98815076599078,
+  ('no-sub_no-matrix', 1): 16.324447378609865,
+  ('no-sub_no-matrix', 2): 30.87308462806543,
+  ('no-sub_no-matrix', 3): 22.765564836381643,
+  ('no-sub_no-matrix', 4): 38.337445027901204,
+  ('no-sub_no-matrix', 5): 1.4796406979126138,
+  ('sub_matrix', 1): 17.9623592385626,
+  ('sub_matrix', 2): 32.36568198294609,
+  ('sub_matrix', 3): 22.438215466486483,
+  ('sub_matrix', 4): 40.900713840387546,
+  ('sub_matrix', 5): 33.396627340011634,
+  ('sub_no-matrix', 1): 17.9623592385626,
+  ('sub_no-matrix', 2): 32.36568198294609,
+  ('sub_no-matrix', 3): 22.438215466486483,
+  ('sub_no-matrix', 4): 40.900713840387546,
+  ('sub_no-matrix', 5): 6.609518913895668},
+ {('no-sub_matrix', 1): 14.033258731424148,
+  ('no-sub_matrix', 2): 28.37206528002418,
+  ('no-sub_matrix', 3): 27.043658386061033,
+  ('no-sub_matrix', 4): 36.167049513436204,
+  ('no-sub_matrix', 5): 52.280797076864395,
+  ('no-sub_no-matrix', 1): 14.033258731424148,
+  ('no-sub_no-matrix', 2): 28.37206528002418,
+  ('no-sub_no-matrix', 3): 27.043658386061033,
+  ('no-sub_no-matrix', 4): 36.167049513436204,
+  ('no-sub_no-matrix', 5): 1.9358795417918389,
+  ('sub_matrix', 1): 16.606623097498794,
+  ('sub_matrix', 2): 29.98729916366884,
+  ('sub_matrix', 3): 24.737985875967603,
+  ('sub_matrix', 4): 34.93154214402433,
+  ('sub_matrix', 5): 42.35241303296243,
+  ('sub_no-matrix', 1): 16.606623097498794,
+  ('sub_no-matrix', 2): 29.98729916366884,
+  ('sub_no-matrix', 3): 24.737985875967603,
+  ('sub_no-matrix', 4): 34.931551775052775,
+  ('sub_no-matrix', 5): 7.151971456773863},
+ {('no-sub_matrix', 1): 10.482293039084738,
+  ('no-sub_matrix', 2): 52.67861788579445,
+  ('no-sub_matrix', 3): 21.665543335527666,
+  ('no-sub_matrix', 4): 23.53727708917033,
+  ('no-sub_matrix', 5): 32.2645584918966,
+  ('no-sub_no-matrix', 1): 10.482293039084738,
+  ('no-sub_no-matrix', 2): 52.67861788579445,
+  ('no-sub_no-matrix', 3): 21.665543335527666,
+  ('no-sub_no-matrix', 4): 23.53727708917033,
+  ('no-sub_no-matrix', 5): 2.5207572809328243,
+  ('sub_matrix', 1): 11.523882918360123,
+  ('sub_matrix', 2): 57.336257883871156,
+  ('sub_matrix', 3): 21.647716645835132,
+  ('sub_matrix', 4): 23.491483569694733,
+  ('sub_matrix', 5): 24.264706351480406,
+  ('sub_no-matrix', 1): 11.523882918360123,
+  ('sub_no-matrix', 2): 57.336257883871156,
+  ('sub_no-matrix', 3): 21.647716645835132,
+  ('sub_no-matrix', 4): 23.491462243846026,
+  ('sub_no-matrix', 5): 9.714244661694366},
+ {('no-sub_matrix', 1): 11.992867568477442,
+  ('no-sub_matrix', 2): 28.861638231250264,
+  ('no-sub_matrix', 3): 24.222607873884137,
+  ('no-sub_matrix', 4): 41.28280460012173,
+  ('no-sub_matrix', 5): 56.6084264455065,
+  ('no-sub_no-matrix', 1): 11.992867568477442,
+  ('no-sub_no-matrix', 2): 28.861638231250264,
+  ('no-sub_no-matrix', 3): 24.222607873884137,
+  ('no-sub_no-matrix', 4): 41.28280460012173,
+  ('no-sub_no-matrix', 5): 2.4980576348107437,
+  ('sub_matrix', 1): 14.531057698832324,
+  ('sub_matrix', 2): 31.280393934821902,
+  ('sub_matrix', 3): 20.756528260470358,
+  ('sub_matrix', 4): 42.15937712589425,
+  ('sub_matrix', 5): 52.45767194621365,
+  ('sub_no-matrix', 1): 14.531057698832324,
+  ('sub_no-matrix', 2): 31.280393934821902,
+  ('sub_no-matrix', 3): 20.756528260470358,
+  ('sub_no-matrix', 4): 42.15937712589425,
+  ('sub_no-matrix', 5): 4.819862633503057}]
+def test_gpt_subordination_region_totals():
+    """
+    Check region-level surprisals against the original syntaxgym-core
+    implementation, using the same underlying `gpt2` model.
+    """
+    reference = ...  # TODO
+    # TODO work out references
+    dataset = datasets.load_dataset("cpllab/syntaxgym", "subordination_src-src")
+    metric = evaluate.load("./syntaxgym.py")
+    result = metric.compute(suite=dataset["test"], model_id="gpt2")
+    from pprint import pprint
+    pprint(result["region_totals"][0])
+    pprint(GPT2_SUBORDINATION_SRC_REFERENCE[0])
+    keys = result["region_totals"][0].keys()
+    assert set(keys) == set(GPT2_SUBORDINATION_SRC_REFERENCE[0].keys())
+    result_ndarray = np.concatenate([np.array([region_totals[key] for key in keys])
+                                     for region_totals in result["region_totals"]])
+    reference_ndarray = np.concatenate([np.array([region_totals[key] for key in keys])
+                                        for region_totals in GPT2_SUBORDINATION_SRC_REFERENCE])
+    pprint(sorted(zip(keys, np.abs(result_ndarray - reference_ndarray)),
+                  key=lambda x: -x[1]))
+    np.testing.assert_allclose(result_ndarray, reference_ndarray, atol=1e-3)