interface change: use namedtuple for metric result so that we can add an `accuracy` property and have dot-access
Files changed:
- README.md (+3, -3)
- syntaxgym.py (+8, -5)
- test.py (+1, -1)
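The interface change is easiest to see in isolation. Below is a minimal, illustrative sketch of the result type introduced by this commit: it mirrors the `SyntaxGymMetricSuiteResult` class in the syntaxgym.py diff further down, while the demo values (condition names, surprisals, booleans) are made up.

```python
from typing import Dict, List, NamedTuple, Tuple

import numpy as np


class SyntaxGymMetricSuiteResult(NamedTuple):
    """Evaluation results for a single test suite."""
    prediction_results: List[List[bool]]
    region_totals: List[Dict[Tuple[str, int], float]]

    @property
    def accuracy(self) -> float:
        # Per-item success is the conjunction of that item's prediction results;
        # suite accuracy is the mean success over items.
        return np.array(self.prediction_results).all(axis=1).mean(axis=0)


# Dot access, which the previous dict-based (TypedDict) interface did not offer:
demo = SyntaxGymMetricSuiteResult(
    prediction_results=[[True, True], [True, False]],
    region_totals=[{("cond_a", 1): 12.3}, {("cond_a", 1): 11.8}],
)
print(demo.accuracy)  # 0.5 -- only the first item passes all of its predictions
```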
README.md CHANGED

@@ -35,7 +35,7 @@ result = metric.compute(dataset=dataset["test"], model_id="gpt2")
 
 # Compute suite accuracy. Mean success over items, where "success" is the conjunction
 # of all boolean prediction results.
-suite_accuracy =
+suite_accuracy = result["subordination_src-src"].accuracy
 ```
 
 ### Run the entire SyntaxGym dataset
@@ -53,7 +53,7 @@ result = metric.compute(dataset=dataset["test"], model_id="gpt2")
 
 # Compute suite accuracy. Mean success over items, where "success" is the conjunction
 # of all boolean prediction results.
-suite_accuracies = {suite_name:
+suite_accuracies = {suite_name: suite_results.accuracy
                     for suite_name, suite_results in result.items()}
 overall_accuracy = np.mean(list(suite_accuracies.values()))
 ```
@@ -105,7 +105,7 @@ overall_accuracy = np.mean(list(suite_accuracies.values()))
 
 ### Output Values
 
-The metric returns a dict of
+The metric returns a dict of `SyntaxGymMetricSuiteResult` tuples, mapping test suite names to test suite performance. Each result tuple has two fields:
 
 - **prediction_results** (`List[List[bool]]`): For each item in the test suite, a list of booleans indicating whether each corresponding prediction came out `True`. Typically these are combined to yield an accuracy score (see example usage above).
 - **region_totals** (`List[Dict[Tuple[str, int], float]]`): For each item, a mapping from individual region (keys `(<condition_name>, <region_number>)`) to the float-valued total surprisal for tokens in this region. This is useful for visualization, or if you'd like to use the aggregate surprisal data for other tasks (e.g. reading time prediction or neural activity prediction).
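For completeness, here is a hedged end-to-end sketch of the updated README usage. The `evaluate.load` path is assumed from the repo name, and the suite name follows the README example; running it requires the `evaluate` and `datasets` packages plus the GPT-2 model dependencies.

```python
import datasets
import evaluate

# Assumption: the metric is loadable from the Hub under the Space name.
metric = evaluate.load("cpllab/syntaxgym")

dataset = datasets.load_dataset("cpllab/syntaxgym", "subordination_src-src")
result = metric.compute(dataset=dataset["test"], model_id="gpt2")

# Dot access on the per-suite result, as in the updated README:
suite_result = result["subordination_src-src"]
print(suite_result.accuracy)

# Region totals for the first item: (condition_name, region_number) -> total surprisal.
print(suite_result.region_totals[0])
```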
syntaxgym.py CHANGED

@@ -14,8 +14,7 @@
 """TODO: Add a description here."""
 
 from collections import defaultdict
-from typing import List, Dict, Tuple
-from typing_extensions import TypedDict
+from typing import List, Dict, Tuple, NamedTuple
 
 import datasets
 import evaluate
@@ -85,7 +84,7 @@ SUITE_DATASET_SPEC = {
 }
 
 
-class SyntaxGymMetricSuiteResult(TypedDict):
+class SyntaxGymMetricSuiteResult(NamedTuple):
     """
     Evaluation results for a single suite.
     """
@@ -93,6 +92,10 @@ class SyntaxGymMetricSuiteResult(TypedDict):
     prediction_results: List[List[bool]]
     region_totals: List[Dict[Tuple[str, int], float]]
 
+    @property
+    def accuracy(self) -> float:
+        return np.array(self.prediction_results).all(axis=1).mean(axis=0)
+
 
 SyntaxGymMetricResult = Dict[str, SyntaxGymMetricSuiteResult]
 
@@ -180,9 +183,9 @@ class SyntaxGym(evaluate.EvaluationModule):
 
             suite_name = item["suite_name"]
             if suite_name not in results:
-                results[suite_name] =
+                results[suite_name] = SyntaxGymMetricSuiteResult(suite_name, [], [])
             for k in result_keys:
-                results[suite_name]
+                getattr(results[suite_name], k).append(result_single[k])
 
         return results
 
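One design note on the accumulation loop in the last hunk: a NamedTuple is immutable, but its list-valued fields can still be grown in place, which is what the `getattr(...).append(...)` pattern relies on. A minimal sketch with hypothetical names (`SuiteResult`, `result_keys`, and `result_single` stand in for the real objects used in the metric's compute loop):

```python
from typing import Dict, List, NamedTuple, Tuple


class SuiteResult(NamedTuple):
    prediction_results: List[List[bool]]
    region_totals: List[Dict[Tuple[str, int], float]]


results: Dict[str, SuiteResult] = {}
result_keys = ("prediction_results", "region_totals")
result_single = {
    "prediction_results": [True, False],
    "region_totals": {("cond_a", 1): 10.2},
}

suite_name = "demo_suite"
if suite_name not in results:
    results[suite_name] = SuiteResult([], [])
for k in result_keys:
    # The tuple itself cannot be reassigned field-by-field,
    # but its list fields can be appended to in place.
    getattr(results[suite_name], k).append(result_single[k])

print(results[suite_name].prediction_results)  # [[True, False]]
print(results[suite_name].region_totals)       # [{('cond_a', 1): 10.2}]
```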
test.py CHANGED

@@ -499,7 +499,7 @@ def test_gpt_subordination_region_totals(syntaxgym_metric):
     dataset = datasets.load_dataset("cpllab/syntaxgym", suite_name)
     result = syntaxgym_metric.compute(dataset=dataset["test"], model_id="gpt2")
 
-    region_totals = result[suite_name]
+    region_totals = result[suite_name].region_totals
     from pprint import pprint
     pprint(region_totals[0])
     pprint(GPT2_SUBORDINATION_SRC_REFERENCE[0])