# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TODO: Add a description here.""" | |
from collections import defaultdict
from typing import List, Dict, Tuple

from typing_extensions import TypedDict

import datasets
import evaluate
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from .prediction import Prediction
_CITATION = """\ | |
@inproceedings{Hu:et-al:2020, | |
author = {Hu, Jennifer and Gauthier, Jon and Qian, Peng and Wilcox, Ethan and Levy, Roger}, | |
title = {A systematic assessment of syntactic generalization in neural language models}, | |
booktitle = {Proceedings of the Association of Computational Linguistics}, | |
year = {2020} | |
} | |
""" | |

_DESCRIPTION = """\
SyntaxGym evaluates the syntactic generalization of causal language models.
Given a test suite of minimally different sentence conditions segmented into
regions, it computes each region's total surprisal under the model and checks
the suite's predictions (comparisons over region surprisals) against the
model's behavior.
"""

_KWARGS_DESCRIPTION = """
Runs SyntaxGym evaluations on the given model and test suite.

Args:
    suite (Dataset): SyntaxGym test suite loaded as a Dataset.
    model_id (str): Model used for calculating surprisals.
        NOTE: The SyntaxGym evaluations are only well-defined for causal
        language models. This includes models such as gpt2, causal variations
        of bert, causal versions of t5, and more (the full list can be found
        in the AutoModelForCausalLM documentation here:
        https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForCausalLM ).

Returns:
    prediction_results: A list of lists, one per item, giving the boolean
        result of each prediction in the test suite for that item.
    region_totals: A list of dictionaries, one per item, each mapping a tuple
        (condition_name, region_number) to that region's total surprisal
        (a float negative log-2 probability).

Examples:
    The dataset and suite names below are illustrative:

    >>> suite = datasets.load_dataset("cpllab/syntaxgym", "subordination_src-src")["test"]
    >>> syntaxgym = evaluate.load("cpllab/syntaxgym")
    >>> result = syntaxgym.compute(suite=suite, model_id="gpt2")
"""
SUITE_DATASET_CONDITION_SPEC = {
    "condition_name": datasets.Value("string"),
    "content": datasets.Value("string"),
    "regions": datasets.Sequence({
        "region_number": datasets.Value("int32"),
        "content": datasets.Value("string")
    })
}

SUITE_DATASET_SPEC = {
    "item_number": datasets.Value("int32"),
    "conditions": datasets.Sequence(SUITE_DATASET_CONDITION_SPEC),
    "predictions": datasets.Sequence(datasets.Value("string")),
}


class SyntaxGymMetricResult(TypedDict):
    prediction_results: List[List[bool]]
    region_totals: List[Dict[Tuple[str, int], float]]


class SyntaxGym(evaluate.EvaluationModule):
    """
    Defines SyntaxGym evaluation logic for causal language models.
    """

    def _info(self):
        features = datasets.Features({
            "suite": SUITE_DATASET_SPEC
        })
        return evaluate.EvaluationModuleInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=features,
            homepage="https://syntaxgym.org",
            codebase_urls=["https://github.com/cpllab/syntaxgym-core"],
        )

    def _compute(self, suite, model_id, device=None) -> SyntaxGymMetricResult:
        if device is not None:
            assert device in ["gpu", "cpu", "cuda"]
            if device == "gpu":
                device = "cuda"
        else:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        model = AutoModelForCausalLM.from_pretrained(model_id)
        model = model.to(device)
        model.eval()

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Models without a pad token (e.g. GPT-2) reuse the EOS token so that
        # the conditions of an item can be padded into a single batch.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        results = {"prediction_results": [], "region_totals": []}
        # TODO batch all items together
        for item in datasets.logging.tqdm(suite):
            result_single = self._compute_single(item, tokenizer, model, device)
            for k in ["prediction_results", "region_totals"]:
                results[k].append(result_single[k])
        return results

    def _compute_single(self, item, tokenizer, model, device):
        # One sentence per condition, tokenized as a single padded batch.
        tokenized = tokenizer(item["conditions"]["content"],
                              padding=True,
                              return_tensors="pt",
                              return_offsets_mapping=True).to(device)
        # input_ids: B * T
        input_ids = tokenized["input_ids"]
        assert input_ids.ndim == 2

        # Compute sentence-level surprisals.
        with torch.no_grad():
            # Pre-softmax predictive distribution: B * T * V
            logits = model(input_ids).logits
            surprisals = -logits.log_softmax(dim=2) / np.log(2)
        # surprisals: B * T * V
        assert surprisals.ndim == 3

        # Get surprisals of expected words: the distribution at position t
        # scores the token observed at position t + 1.
        surps_shifted = surprisals[:, :-1, :]
        expected_ids = input_ids[:, 1:]
        # Reindexed surprisals: B * (T - 1)
        surprisals = torch.gather(surps_shifted, 2, expected_ids.unsqueeze(2)) \
            .squeeze(2)
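
        # Alignment sketch (illustrative shapes only): with B = 2 conditions
        # and T = 4 tokens over a vocabulary of V = 5,
        #
        #   surps_shifted:              2 * 3 * 5
        #   expected_ids.unsqueeze(2):  2 * 3 * 1
        #   gather(..., dim=2):         2 * 3 * 1  -> squeeze(2) -> 2 * 3
        #
        # so surprisals[:, t] now holds the surprisal of input_ids[:, t + 1];
        # the token at position `token` is scored at surprisals[:, token - 1],
        # and position 0 (no left context) has no defined surprisal.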

        # Aggregate total surprisal per region, per condition.
        condition_names = item["conditions"]["condition_name"]
        region_totals = {condition_name: defaultdict(float)
                         for condition_name in condition_names}
        region2tokens = self.compute_region_token_mapping(
            item, input_ids, tokenized["offset_mapping"])

        for i, i_cond in enumerate(condition_names):
            for region_number, region_tokens in region2tokens[i_cond].items():
                for token in region_tokens:
                    if token == 0:
                        # Surprisal is not defined for the first token. Skip.
                        continue
                    elif token <= surprisals.shape[1]:
                        region_totals[i_cond][region_number] += surprisals[i, token - 1]
                    else:
                        raise ValueError(
                            "Token position %d exceeds surprisal length %d"
                            % (token, surprisals.shape[1]))

        region_totals = {(condition_name, region_number): float(total)
                         for condition_name, totals in region_totals.items()
                         for region_number, total in totals.items()}

        results = {
            "prediction_results": [
                Prediction(i, formula, "sum").formula(region_totals)
                for i, formula in enumerate(item["predictions"])
            ],
            "region_totals": region_totals
        }
        return results

    def get_region_edges(self, item, condition_idx):
        """
        Get left edge of each region as a character index.
        """
        # NB this is coupled with `condition_to_string` logic of course
        regions = item["conditions"]["regions"][condition_idx]

        idx = 0
        ret = []
        for r_idx, region_content in enumerate(regions["content"]):
            ret.append(idx)

            region_size = len(region_content)
            if region_content.strip() != "" and r_idx != 0 and not region_content.startswith(","):
                # Add joining space
                region_size += 1

            idx += region_size
        return ret
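
    # Worked example for get_region_edges (illustrative values): regions
    # ["The", "dog", "barked."] join to the condition string "The dog barked.",
    # giving left edges [0, 3, 7]. Each non-initial region absorbs its
    # preceding joining space, matching tokenizers (e.g. GPT-2's) whose
    # offset mappings include a token's leading space.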

    def compute_region_token_mapping(self, item, input_ids: torch.LongTensor,
                                     offset_mapping: List[Tuple[int, int]]
                                     ) -> Dict[str, Dict[int, List[int]]]:
        # input_ids: B * T
        # offset_mapping: B * T * 2
        # Assumes the batch is sorted according to the item's condition_name order.
        condition_names = item["conditions"]["condition_name"]
        region2tokens = {cond: defaultdict(list) for cond in condition_names}

        max_long = torch.iinfo(torch.int64).max

        input_ids = input_ids.detach()
        for i_cond, (i_tokens, i_offsets) in enumerate(zip(input_ids, offset_mapping)):
            region_edges = self.get_region_edges(item, i_cond)

            t_cursor, r_cursor = 0, 0
            while t_cursor < i_tokens.shape[0]:
                token_char_start, token_char_end = i_offsets[t_cursor]

                if token_char_start == token_char_end == 0:
                    # This is a padding token. Skip.
                    # TODO what about BOS/EOS? some models incorporate them
                    t_cursor += 1
                    continue

                region_end = region_edges[r_cursor + 1] \
                    if r_cursor + 1 < len(region_edges) else max_long

                # NB region boundaries are left edges, hence the >= here.
                if token_char_start >= region_end:
                    r_cursor += 1
                    continue

                region2tokens[condition_names[i_cond]][r_cursor + 1].append(t_cursor)
                t_cursor += 1

        return region2tokens
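

if __name__ == "__main__":
    # Minimal usage sketch. Assumes Hub access; the dataset name
    # "cpllab/syntaxgym" and the suite config "subordination_src-src" are
    # illustrative of the expected suite format, not guaranteed identifiers.
    suite = datasets.load_dataset("cpllab/syntaxgym", "subordination_src-src")["test"]
    syntaxgym = evaluate.load("cpllab/syntaxgym")
    result = syntaxgym.compute(suite=suite, model_id="gpt2")
    print(result["prediction_results"][0])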