interface change: use namedtuple for metric result so that we can add an `accuracy` property and have dot-access
Files changed:
- README.md (+3, -3)
- syntaxgym.py (+8, -5)
- test.py (+1, -1)
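The interface change is easiest to see in isolation. Below is a minimal, illustrative sketch of the result type introduced by this commit: it mirrors the `SyntaxGymMetricSuiteResult` class in the syntaxgym.py diff further down, while the demo values (condition names, surprisals, booleans) are made up.

```python
from typing import Dict, List, NamedTuple, Tuple

import numpy as np


class SyntaxGymMetricSuiteResult(NamedTuple):
    """Evaluation results for a single test suite."""
    prediction_results: List[List[bool]]
    region_totals: List[Dict[Tuple[str, int], float]]

    @property
    def accuracy(self) -> float:
        # Per-item success is the conjunction of that item's prediction results;
        # suite accuracy is the mean success over items.
        return np.array(self.prediction_results).all(axis=1).mean(axis=0)


# Dot access, which the previous dict-based (TypedDict) interface did not offer:
demo = SyntaxGymMetricSuiteResult(
    prediction_results=[[True, True], [True, False]],
    region_totals=[{("cond_a", 1): 12.3}, {("cond_a", 1): 11.8}],
)
print(demo.accuracy)  # 0.5 -- only the first item passes all of its predictions
```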
README.md CHANGED

@@ -35,7 +35,7 @@ result = metric.compute(dataset=dataset["test"], model_id="gpt2")
 
 # Compute suite accuracy. Mean success over items, where "success" is the conjunction
 # of all boolean prediction results.
-suite_accuracy =
+suite_accuracy = result["subordination_src-src"].accuracy
 ```
 
 ### Run the entire SyntaxGym dataset
@@ -53,7 +53,7 @@ result = metric.compute(dataset=dataset["test"], model_id="gpt2")
 
 # Compute suite accuracy. Mean success over items, where "success" is the conjunction
 # of all boolean prediction results.
-suite_accuracies = {suite_name:
+suite_accuracies = {suite_name: suite_results.accuracy
                     for suite_name, suite_results in result.items()}
 overall_accuracy = np.mean(list(suite_accuracies.values()))
 ```
@@ -105,7 +105,7 @@ overall_accuracy = np.mean(list(suite_accuracies.values()))
 
 ### Output Values
 
-The metric returns a dict of
+The metric returns a dict of `SyntaxGymMetricSuiteResult` tuples, mapping test suite names to test suite performance. Each result tuple has two fields:
 
 - **prediction_results** (`List[List[bool]]`): For each item in the test suite, a list of booleans indicating whether each corresponding prediction came out `True`. Typically these are combined to yield an accuracy score (see example usage above).
 - **region_totals** (`List[Dict[Tuple[str, int], float]]`): For each item, a mapping from individual region (keys `(<condition_name>, <region_number>)`) to the float-valued total surprisal for tokens in this region. This is useful for visualization, or if you'd like to use the aggregate surprisal data for other tasks (e.g. reading time prediction or neural activity prediction).
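For completeness, here is a hedged end-to-end sketch of the updated README usage. The `evaluate.load` path is assumed from the repo name, and the suite name follows the README example; running it requires the `evaluate` and `datasets` packages plus the GPT-2 model dependencies.

```python
import datasets
import evaluate

# Assumption: the metric is loadable from the Hub under the Space name.
metric = evaluate.load("cpllab/syntaxgym")

dataset = datasets.load_dataset("cpllab/syntaxgym", "subordination_src-src")
result = metric.compute(dataset=dataset["test"], model_id="gpt2")

# Dot access on the per-suite result, as in the updated README:
suite_result = result["subordination_src-src"]
print(suite_result.accuracy)

# Region totals for the first item: (condition_name, region_number) -> total surprisal.
print(suite_result.region_totals[0])
```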
syntaxgym.py CHANGED

@@ -14,8 +14,7 @@
 """TODO: Add a description here."""
 
 from collections import defaultdict
-from typing import List, Dict, Tuple
-from typing_extensions import TypedDict
+from typing import List, Dict, Tuple, NamedTuple
 
 import datasets
 import evaluate
@@ -85,7 +84,7 @@ SUITE_DATASET_SPEC = {
 }
 
 
-class SyntaxGymMetricSuiteResult(TypedDict):
+class SyntaxGymMetricSuiteResult(NamedTuple):
     """
     Evaluation results for a single suite.
     """
@@ -93,6 +92,10 @@ class SyntaxGymMetricSuiteResult(TypedDict):
     prediction_results: List[List[bool]]
     region_totals: List[Dict[Tuple[str, int], float]]
 
+    @property
+    def accuracy(self) -> float:
+        return np.array(self.prediction_results).all(axis=1).mean(axis=0)
+
 
 SyntaxGymMetricResult = Dict[str, SyntaxGymMetricSuiteResult]
 
@@ -180,9 +183,9 @@ class SyntaxGym(evaluate.EvaluationModule):
 
             suite_name = item["suite_name"]
             if suite_name not in results:
-                results[suite_name] =
+                results[suite_name] = SyntaxGymMetricSuiteResult(suite_name, [], [])
             for k in result_keys:
-                results[suite_name]
+                getattr(results[suite_name], k).append(result_single[k])
 
         return results
 
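One design note on the accumulation loop in the last hunk: a NamedTuple is immutable, but its list-valued fields can still be grown in place, which is what the `getattr(...).append(...)` pattern relies on. A minimal sketch with hypothetical names (`SuiteResult`, `result_keys`, and `result_single` stand in for the real objects used in the metric's compute loop):

```python
from typing import Dict, List, NamedTuple, Tuple


class SuiteResult(NamedTuple):
    prediction_results: List[List[bool]]
    region_totals: List[Dict[Tuple[str, int], float]]


results: Dict[str, SuiteResult] = {}
result_keys = ("prediction_results", "region_totals")
result_single = {
    "prediction_results": [True, False],
    "region_totals": {("cond_a", 1): 10.2},
}

suite_name = "demo_suite"
if suite_name not in results:
    results[suite_name] = SuiteResult([], [])
for k in result_keys:
    # The tuple itself cannot be reassigned field-by-field,
    # but its list fields can be appended to in place.
    getattr(results[suite_name], k).append(result_single[k])

print(results[suite_name].prediction_results)  # [[True, False]]
print(results[suite_name].region_totals)       # [{('cond_a', 1): 10.2}]
```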
test.py CHANGED

@@ -499,7 +499,7 @@ def test_gpt_subordination_region_totals(syntaxgym_metric):
     dataset = datasets.load_dataset("cpllab/syntaxgym", suite_name)
     result = syntaxgym_metric.compute(dataset=dataset["test"], model_id="gpt2")
 
-    region_totals = result[suite_name]
+    region_totals = result[suite_name].region_totals
     from pprint import pprint
     pprint(region_totals[0])
     pprint(GPT2_SUBORDINATION_SRC_REFERENCE[0])