jgauthier committed
Commit e00b8f2 · 1 Parent(s): c70b8db

interface change: use namedtuple for metric result so that we can add an `accuracy` property and have dot-access

Files changed (3):
  1. README.md (+3 -3)
  2. syntaxgym.py (+8 -5)
  3. test.py (+1 -1)
README.md CHANGED

@@ -35,7 +35,7 @@ result = metric.compute(dataset=dataset["test"], model_id="gpt2")
 
 # Compute suite accuracy. Mean success over items, where "success" is the conjunction
 # of all boolean prediction results.
-suite_accuracy = np.array(result["subordination_src-src"]["prediction_results"]).all(axis=1).mean(axis=0)
+suite_accuracy = result["subordination_src-src"].accuracy
 ```
 
 ### Run the entire SyntaxGym dataset
@@ -53,7 +53,7 @@ result = metric.compute(dataset=dataset["test"], model_id="gpt2")
 
 # Compute suite accuracy. Mean success over items, where "success" is the conjunction
 # of all boolean prediction results.
-suite_accuracies = {suite_name: np.array(suite_results["prediction_results"]).all(axis=1).mean(axis=0)
+suite_accuracies = {suite_name: suite_results.accuracy
                     for suite_name, suite_results in result.items()}
 overall_accuracy = np.mean(list(suite_accuracies.values()))
 ```
@@ -105,7 +105,7 @@ overall_accuracy = np.mean(list(suite_accuracies.values()))
 
 ### Output Values
 
-The metric returns a dict of dicts, mapping test suite names to test suite performance. Each inner dict has two entries:
+The metric returns a dict of `SyntaxGymMetricSuiteResult` tuples, mapping test suite names to test suite performance. Each result exposes two fields:
 
 - **prediction_results** (`List[List[bool]]`): For each item in the test suite, a list of booleans indicating whether each corresponding prediction came out `True`. Typically these are combined to yield an accuracy score (see example usage above).
 - **region_totals** (`List[Dict[Tuple[str, int], float]]`): For each item, a mapping from individual region (keys `(<condition_name>, <region_number>)`) to the float-valued total surprisal for tokens in this region. This is useful for visualization, or if you'd like to use the aggregate surprisal data for other tasks (e.g. reading time prediction or neural activity prediction).
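To make the documented output values concrete, here is a short consumption sketch based on the README example above. The dataset path and `model_id="gpt2"` come from the diff; the `evaluate.load("cpllab/syntaxgym")` call is an assumption mirroring the dataset path (the README's actual load line is not visible in this diff), and running this downloads GPT-2.

```python
import datasets
import evaluate
import numpy as np

# Dataset path follows test.py below; the metric load path is an assumption.
suite_name = "subordination_src-src"
dataset = datasets.load_dataset("cpllab/syntaxgym", suite_name)
metric = evaluate.load("cpllab/syntaxgym")

result = metric.compute(dataset=dataset["test"], model_id="gpt2")
suite = result[suite_name]

# The new `accuracy` property is exactly the manual reduction from the old README.
manual_accuracy = np.array(suite.prediction_results).all(axis=1).mean(axis=0)
assert np.isclose(suite.accuracy, manual_accuracy)

# region_totals: per-item surprisal totals keyed by (condition_name, region_number).
for (condition, region), total_surprisal in sorted(suite.region_totals[0].items()):
    print(f"{condition:>20s}  region {region}: {total_surprisal:.2f}")
```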
syntaxgym.py CHANGED

@@ -14,8 +14,7 @@
 """TODO: Add a description here."""
 
 from collections import defaultdict
-from typing import List, Dict, Tuple
-from typing_extensions import TypedDict
+from typing import List, Dict, Tuple, NamedTuple
 
 import datasets
 import evaluate
@@ -85,7 +84,7 @@ SUITE_DATASET_SPEC = {
 }
 
 
-class SyntaxGymMetricSuiteResult(TypedDict):
+class SyntaxGymMetricSuiteResult(NamedTuple):
     """
     Evaluation results for a single suite.
     """
@@ -93,6 +92,10 @@ class SyntaxGymMetricSuiteResult(TypedDict):
     prediction_results: List[List[bool]]
     region_totals: List[Dict[Tuple[str, int], float]]
 
+    @property
+    def accuracy(self) -> float:
+        return np.array(self.prediction_results).all(axis=1).mean(axis=0)
+
 
 SyntaxGymMetricResult = Dict[str, SyntaxGymMetricSuiteResult]
 
@@ -180,9 +183,9 @@ class SyntaxGym(evaluate.EvaluationModule):
 
             suite_name = item["suite_name"]
             if suite_name not in results:
-                results[suite_name] = {k: [] for k in result_keys}
+                results[suite_name] = SyntaxGymMetricSuiteResult(suite_name, [], [])
             for k in result_keys:
-                results[suite_name][k].append(result_single[k])
+                getattr(results[suite_name], k).append(result_single[k])
 
         return results
 
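As a sanity check on the new interface, here is a self-contained sketch that copies the NamedTuple from the diff above onto toy data. The `suite_name` field is an assumption: it is implied by the three-argument call `SyntaxGymMetricSuiteResult(suite_name, [], [])` but is not among the lines changed in this commit.

```python
from typing import Dict, List, NamedTuple, Tuple

import numpy as np


class SyntaxGymMetricSuiteResult(NamedTuple):
    """Standalone copy of the result type from the diff above, for illustration only."""
    # NOTE: the `suite_name` field is assumed; it is implied by the constructor
    # call in the diff but not shown among the changed lines.
    suite_name: str
    prediction_results: List[List[bool]]
    region_totals: List[Dict[Tuple[str, int], float]]

    @property
    def accuracy(self) -> float:
        # An item counts as a success only if *all* of its predictions hold.
        return np.array(self.prediction_results).all(axis=1).mean(axis=0)


# Toy data: three items, two predictions each; only the first item fully succeeds.
toy = SyntaxGymMetricSuiteResult(
    suite_name="subordination_src-src",
    prediction_results=[[True, True], [True, False], [False, False]],
    region_totals=[{}, {}, {}],
)

print(toy.accuracy)               # 0.333... (1 of 3 items succeeds)
print(toy.prediction_results[0])  # dot-access replaces the old dict lookup
```

Because the NamedTuple fields hold mutable lists, the accumulation loop in the metric can keep appending per-item results via `getattr` without rebuilding the tuple, which is what the last hunk above relies on.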
test.py CHANGED

@@ -499,7 +499,7 @@ def test_gpt_subordination_region_totals(syntaxgym_metric):
     dataset = datasets.load_dataset("cpllab/syntaxgym", suite_name)
     result = syntaxgym_metric.compute(dataset=dataset["test"], model_id="gpt2")
 
-    region_totals = result[suite_name]["region_totals"]
+    region_totals = result[suite_name].region_totals
     from pprint import pprint
     pprint(region_totals[0])
     pprint(GPT2_SUBORDINATION_SRC_REFERENCE[0])
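The visible test lines only pretty-print the first item's region totals alongside the stored reference. As a purely hypothetical illustration (not part of the actual test) of how such values could be compared given the documented `region_totals` structure:

```python
import pytest


def assert_region_totals_close(region_totals, reference, tol=1e-3):
    """Hypothetical helper: compare one item's region surprisal totals to a reference dict."""
    # Keys are (condition_name, region_number) tuples, per the README above.
    assert set(region_totals.keys()) == set(reference.keys())
    for key, expected in reference.items():
        assert region_totals[key] == pytest.approx(expected, abs=tol)
```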