# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TODO: Add a description here.""" | |
from collections import defaultdict
import logging
from typing import List, Dict, Tuple, NamedTuple

import datasets
import evaluate
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, \
    PreTrainedTokenizer, PreTrainedTokenizerFast, \
    GPT2TokenizerFast

from .prediction import Prediction

L = logging.getLogger(__name__)

_CITATION = """\ | |
@inproceedings{Hu:et-al:2020, | |
author = {Hu, Jennifer and Gauthier, Jon and Qian, Peng and Wilcox, Ethan and Levy, Roger}, | |
title = {A systematic assessment of syntactic generalization in neural language models}, | |
booktitle = {Proceedings of the Association of Computational Linguistics}, | |
year = {2020} | |
} | |
""" | |
_DESCRIPTION = """\
SyntaxGym is a framework for targeted syntactic evaluation of neural language
models. This metric computes region-level surprisals from a causal language
model for each item in a SyntaxGym test suite and checks the suite's
predictions against those surprisals.
"""
_KWARGS_DESCRIPTION = """
Runs SyntaxGym evaluations on the given model and test suite.

Args:
    dataset (Dataset): SyntaxGym test suite loaded as a Dataset.
    model_id (str): model used for calculating surprisals
        NOTE: The SyntaxGym evaluations are only well-defined for causal language models.
        This includes models such as gpt2, causal variations of bert,
        causal versions of t5, and more (the full list can be found
        in the AutoModelForCausalLM documentation here:
        https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForCausalLM )

Returns:
    prediction_results: A list of lists, one per item, containing the boolean
        result of each prediction in the test suite for that item.
    region_totals: A list of dictionaries, one per item, each mapping tuples
        (condition_name, region_number) to a float total surprisal value
        (i.e. negative log-2 probability) for that region.

Examples:
    >>> syntaxgym = evaluate.load("cpllab/syntaxgym")
    >>> results = syntaxgym.compute(dataset=suite, model_id="gpt2")  # `suite` is a loaded test suite Dataset
"""
SUITE_DATASET_CONDITION_SPEC = {
    "condition_name": datasets.Value("string"),
    "content": datasets.Value("string"),
    "regions": datasets.Sequence({
        "region_number": datasets.Value("int32"),
        "content": datasets.Value("string")
    })
}

SUITE_DATASET_SPEC = {
    "suite_name": datasets.Value("string"),
    "item_number": datasets.Value("int32"),
    "conditions": datasets.Sequence(SUITE_DATASET_CONDITION_SPEC),
    "predictions": datasets.Sequence(datasets.Value("string")),
}


class SyntaxGymMetricSuiteResult(NamedTuple):
    """
    Evaluation results for a single suite.
    """
    suite_name: str
    prediction_results: List[List[bool]]
    region_totals: List[Dict[Tuple[str, int], float]]

    def accuracy(self) -> float:
        return np.array(self.prediction_results).all(axis=1).mean(axis=0)


SyntaxGymMetricResult = Dict[str, SyntaxGymMetricSuiteResult]
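
# Illustrative sketch of how `accuracy` aggregates: an item counts as correct
# only if *all* of its predictions hold, and accuracy is the fraction of such
# items. For example (hypothetical values):
#
#   >>> r = SyntaxGymMetricSuiteResult("demo", [[True, True], [True, False]], [])
#   >>> r.accuracy()
#   0.5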


def prepare_tokenizer(model, batch_size, add_start_token=True) -> Tuple[PreTrainedTokenizer, Dict]:
    """
    Load and prepare a tokenizer for SyntaxGym evaluation.

    Returns:
        tokenizer: a fast tokenizer matching the given model
        tokenizer_kwargs: suggested kwargs for any tokenizer calls
    """
    tokenizer = AutoTokenizer.from_pretrained(model.name_or_path)

    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        # We need a fast tokenizer because these are the only tokenizers that support
        # return_offsets_mapping. Try to use GPT2 tokenizer -- this is sufficient for
        # OPT.
        L.warning(f"The model {model.name_or_path} does not have a fast tokenizer, "
                  f"which is required for this metric. Running with GPT2 tokenizer.")
        tokenizer = GPT2TokenizerFast.from_pretrained(model.name_or_path)

    # if batch_size > 1 (which generally leads to padding being required), and
    # if there is not an already assigned pad_token, assign an existing
    # special token to also be the padding token
    if tokenizer.pad_token is None and batch_size > 1:
        existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
        # check that the model already has at least one special token defined
        assert (
            len(existing_special_tokens) > 0
        ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
        # assign one of the special tokens to also be the pad token
        tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})

    if add_start_token:
        # leave room for <BOS> token to be added:
        assert (
            tokenizer.bos_token is not None
        ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
        max_tokenized_len = model.config.max_length - 1
    else:
        max_tokenized_len = model.config.max_length

    tokenizer_kwargs = {
        "add_special_tokens": False,
        "padding": True,
        "max_length": max_tokenized_len
    }
    return tokenizer, tokenizer_kwargs
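
# Usage sketch (assumes a GPT-2 checkpoint; any causal LM with a fast tokenizer
# behaves analogously):
#
#   model = AutoModelForCausalLM.from_pretrained("gpt2")
#   tokenizer, tokenizer_kwargs = prepare_tokenizer(model, batch_size=8)
#   encoded = tokenizer(["The keys to the cabinet are on the table."],
#                       return_tensors="pt", return_offsets_mapping=True,
#                       **tokenizer_kwargs)
#   # encoded["offset_mapping"] holds a (char_start, char_end) span per token,
#   # which compute_region_token_mapping uses to align tokens with regions.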


class SyntaxGym(evaluate.EvaluationModule):
    """
    Defines SyntaxGym evaluation logic for causal language models.
    """

    def _info(self):
        features = datasets.Features({
            "dataset": SUITE_DATASET_SPEC
        })
        return evaluate.EvaluationModuleInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=features,
            homepage="https://syntaxgym.org",
            codebase_urls=["https://github.com/cpllab/syntaxgym-core"],
        )

    def _compute(self, dataset, model_id, batch_size=8, add_start_token=False, device=None) -> SyntaxGymMetricResult:
        if device is not None:
            assert device in ["gpu", "cpu", "cuda"]
            if device == "gpu":
                device = "cuda"
        else:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        model = AutoModelForCausalLM.from_pretrained(model_id)
        model = model.to(device)
        model.eval()

        tokenizer, tokenizer_kwargs = prepare_tokenizer(model, batch_size, add_start_token)

        # Flatten sentences, enforcing that sentences are always ordered by the same condition.
        condition_order = dataset[0]["conditions"]["condition_name"]
        all_sentences = []
        for item in dataset:
            for condition_name in condition_order:
                # Get idx of condition for this item.
                condition_idx = item["conditions"]["condition_name"].index(condition_name)
                all_sentences.append(item["conditions"]["content"][condition_idx])

        # Tokenize sentences and split into batches.
        all_tokenized_sentences = tokenizer(all_sentences, return_tensors="pt",
                                            return_offsets_mapping=True,
                                            **tokenizer_kwargs).to(device)
        tokenized_batches = torch.split(all_tokenized_sentences["input_ids"], batch_size)
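        # NB: only input_ids are split into batches here; padded positions are
        # skipped later via their (0, 0) entries in offset_mapping.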

        # Compute surprisal per-batch and combine into a single surprisal tensor.
        n_sentences, n_timesteps = all_tokenized_sentences["input_ids"].shape
        surprisals = torch.zeros(n_sentences, n_timesteps - 1).float().to(device)
        for i, batch in enumerate(datasets.logging.tqdm(tokenized_batches)):
            batch = batch.to(device)
            with torch.no_grad():
                # logits are B * T * V
                b_logits = model(batch)["logits"]
                b_surprisals = -b_logits.log_softmax(dim=2) / np.log(2)
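                # b_surprisals[b, t, w] is the model's surprisal, in bits, of
                # seeing token w at position t + 1 given batch[b, : t + 1].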
                # Get surprisals of ground-truth words.
                gt_idxs = batch[:, 1:]
                # Reindexed surprisals: B * (T - 1)
                b_surprisals_gt = torch.gather(b_surprisals[:, :-1, :], 2, gt_idxs.unsqueeze(2)).squeeze(2)

            surprisals[i * batch_size : (i + 1) * batch_size] = b_surprisals_gt

        # Reshape to intuitive axes n_items * n_conditions * ...
        surprisals = surprisals.reshape((len(dataset), len(condition_order), -1))
        offset_mapping = all_tokenized_sentences["offset_mapping"] \
            .reshape((len(dataset), len(condition_order), -1, 2))

        # Now evaluate per-item.
        results = {}
        result_keys = ["prediction_results", "region_totals"]
        for item, item_surprisals, item_offset_mapping in zip(datasets.logging.tqdm(dataset), surprisals, offset_mapping):
            result_i = self._compute_item(item, item_surprisals, item_offset_mapping, condition_order)

            suite_name = item["suite_name"]
            if suite_name not in results:
                results[suite_name] = SyntaxGymMetricSuiteResult(suite_name, [], [])
            for k in result_keys:
                getattr(results[suite_name], k).append(result_i[k])

        return results

    def _compute_item(self, item, item_surprisals, offset_mapping, condition_order):
        """
        Aggregate token-level surprisals to region-level surprisals for the given item,
        and evaluate the item's predictions.
        """
        #### aggregate
        region_totals = {condition_name: defaultdict(float)
                         for condition_name in condition_order}
        region2tokens = self.compute_region_token_mapping(
            item, condition_order, offset_mapping)

        for i, (cond_i, surprisals_i) in enumerate(zip(condition_order, item_surprisals)):
            for region_number, region_tokens in region2tokens[cond_i].items():
                for token in region_tokens:
                    if token == 0:
                        # surprisal not defined. pass.
                        continue
                    elif token <= item_surprisals.shape[1]:
                        region_totals[cond_i][region_number] += surprisals_i[token - 1]
                    else:
                        # TODO don't think this is an issue, just should clean
                        # up the aggregation output
                        assert token == item_surprisals.shape[1], \
                            "%s %s" % (token, item_surprisals.shape[1])

        region_totals = {(condition_name, region_number): float(total)
                         for condition_name, totals in region_totals.items()
                         for region_number, total in totals.items()}

        results = {
            "prediction_results": [
                Prediction(i, formula, "sum").formula(region_totals)
                for i, formula in enumerate(item["predictions"])
            ],
            "region_totals": region_totals
        }
        return results
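
    # For reference: a prediction formula from the test suite compares these
    # region-level surprisal totals across conditions -- schematically,
    # something like "(2;%cond_a%) > (2;%cond_b%)" asserts that region 2 is
    # more surprising under cond_a than under cond_b. (Illustrative syntax
    # only; see prediction.py and the SyntaxGym documentation for the exact
    # grammar.)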

    def get_region_edges(self, item, condition_name):
        """
        Get left edge of each region as a character index.
        """
        # NB this is coupled with `condition_to_string` logic of course
        condition_idx = item["conditions"]["condition_name"].index(condition_name)
        regions = item["conditions"]["regions"][condition_idx]

        idx = 0
        ret = []
        for r_idx, region_content in enumerate(regions["content"]):
            ret.append(idx)

            region_size = len(region_content)
            # If this is not the first nonspace/nonpunct region, then it will
            # be preceded by a joining space.
            if region_content.strip() != "" and idx > 0 and not region_content.startswith(","):
                # Add joining space
                region_size += 1

            idx += region_size

        return ret
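
    # Worked example (illustrative, assuming regions are joined by single
    # spaces as the size computation above implies): for region contents
    # ["The keys", "to the cabinet", "are"], the condition string is
    # "The keys to the cabinet are" and the returned left edges are
    # [0, 8, 23] -- each non-initial region's edge lands on the joining
    # space that precedes it.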

    def compute_region_token_mapping(self, item, condition_order,
                                     offset_mapping: List[Tuple[int, int]]
                                     ) -> Dict[str, Dict[int, List[int]]]:
        # offset_mapping: B * T * 2
        region2tokens = {cond: defaultdict(list) for cond in condition_order}

        max_long = torch.iinfo(torch.int64).max

        for cond_name, i_offsets in zip(condition_order, offset_mapping):
            region_edges = self.get_region_edges(item, cond_name)

            t_cursor, r_cursor = 0, 0
            while t_cursor < i_offsets.shape[0]:
                token_char_start, token_char_end = i_offsets[t_cursor]

                if token_char_start == token_char_end == 0:
                    # This is a padding token. Skip.
                    # TODO what about BOS/EOS? some models incorporate them
                    t_cursor += 1
                    continue

                region_start = region_edges[r_cursor]
                region_end = region_edges[r_cursor + 1] \
                    if r_cursor + 1 < len(region_edges) else max_long

                # NB region boundaries are left edges, hence the >= here.
                if token_char_start >= region_end:
                    r_cursor += 1
                    continue

                region2tokens[cond_name][r_cursor + 1].append(t_cursor)
                t_cursor += 1

        return region2tokens