add tokenizer config from perplexity metric. truncation breaks tests
syntaxgym.py CHANGED (+49 -11)
@@ -21,7 +21,7 @@ import datasets
 import evaluate
 import numpy as np
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizer
 
 from .prediction import Prediction
 
@@ -89,6 +89,46 @@ class SyntaxGymMetricResult(TypedDict):
     region_totals: List[Dict[Tuple[str, int], float]]
 
 
+def prepare_tokenizer(model, batch_size, add_start_token=True) -> Tuple[PreTrainedTokenizer, Dict]:
+    """
+    Load and prepare a tokenizer for SyntaxGym evaluation.
+
+    Returns:
+        tokenizer:
+        tokenizer_kwargs: suggested kwargs for any tokenizer calls
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model.name_or_path)
+
+    # if batch_size > 1 (which generally leads to padding being required), and
+    # if there is not an already assigned pad_token, assign an existing
+    # special token to also be the padding token
+    if tokenizer.pad_token is None and batch_size > 1:
+        existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
+        # check that the model already has at least one special token defined
+        assert (
+            len(existing_special_tokens) > 0
+        ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
+        # assign one of the special tokens to also be the pad token
+        tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})
+
+    if add_start_token:
+        # leave room for <BOS> token to be added:
+        assert (
+            tokenizer.bos_token is not None
+        ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
+        max_tokenized_len = model.config.max_length - 1
+    else:
+        max_tokenized_len = model.config.max_length
+
+    tokenizer_kwargs = {
+        "add_special_tokens": False,
+        "padding": True,
+        "truncation": True,
+        "max_length": max_tokenized_len
+    }
+    return tokenizer, tokenizer_kwargs
+
+
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class SyntaxGym(evaluate.EvaluationModule):
     """
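For orientation, here is a minimal sketch of how the new prepare_tokenizer helper behaves, assuming the function from the hunk above is in scope. The model name "gpt2" and the argument values are illustrative choices, not anything this commit prescribes.

# Illustrative only: "gpt2" is an arbitrary small causal LM, not mandated by the diff.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer, tokenizer_kwargs = prepare_tokenizer(model, batch_size=2, add_start_token=False)

# GPT-2 ships without a pad token, so with batch_size > 1 one of its existing
# special tokens (<|endoftext|>) is reused for padding.
assert tokenizer.pad_token is not None

# The suggested kwargs turn on padding and truncation; max_length is taken
# from model.config.max_length (minus one when add_start_token=True).
print(tokenizer_kwargs)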
@@ -110,7 +150,7 @@ class SyntaxGym(evaluate.EvaluationModule):
             codebase_urls=["https://github.com/cpllab/syntaxgym-core"],
         )
 
-    def _compute(self, suite, model_id, device=None) -> SyntaxGymMetricResult:
+    def _compute(self, suite, model_id, batch_size=8, add_start_token=False, device=None) -> SyntaxGymMetricResult:
         if device is not None:
             assert device in ["gpu", "cpu", "cuda"]
             if device == "gpu":
@@ -122,31 +162,31 @@ class SyntaxGym(evaluate.EvaluationModule):
         model = model.to(device)
         model.eval()
 
-        tokenizer =
-        # TODO copy from perplexity metric
-        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer, tokenizer_kwargs = prepare_tokenizer(model, batch_size, add_start_token)
 
         results = {"prediction_results": [], "region_totals": []}
         # TODO batch all items together
         for item in datasets.logging.tqdm(suite):
-            result_single = self._compute_single(item, tokenizer, model, device)
+            result_single = self._compute_single(item, tokenizer, tokenizer_kwargs,
+                                                 model, device)
 
             for k in ["prediction_results", "region_totals"]:
                 results[k].append(result_single[k])
 
         return results
 
-    def _compute_single(self, item, tokenizer, model, device):
+    def _compute_single(self, item, tokenizer, tokenizer_kwargs, model, device):
         tokenized = tokenizer(item["conditions"]["content"],
-                              padding=True,
                               return_tensors="pt",
-                              return_offsets_mapping=True)
+                              return_offsets_mapping=True,
+                              **tokenizer_kwargs).to(device)
 
         # input_ids: B * T
         input_ids = tokenized["input_ids"]
         assert input_ids.ndim == 2
 
         # Compute sentence level surprisals.
+        # TODO support sentences which exceed truncation length
         with torch.no_grad():
             # Pre-softmax predictive distribution B * T * V
             logits = model(input_ids).logits
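Continuing the sketch above (same model, tokenizer, and tokenizer_kwargs), this is roughly the tokenization and forward pass that _compute_single now performs. The condition sentences are invented stand-ins for what a real suite item supplies via item["conditions"]["content"].

import torch

# Invented stand-ins for a suite item's condition sentences.
conditions = ["The keys to the cabinet are on the table.",
              "The keys to the cabinet is on the table."]

tokenized = tokenizer(conditions,
                      return_tensors="pt",
                      return_offsets_mapping=True,
                      **tokenizer_kwargs)

input_ids = tokenized["input_ids"]      # B * T, padded to the longest condition
offsets = tokenized["offset_mapping"]   # B * T * 2 character spans per token
assert input_ids.ndim == 2

with torch.no_grad():
    logits = model(input_ids).logits    # pre-softmax scores, B * T * V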
@@ -164,8 +204,6 @@ class SyntaxGym(evaluate.EvaluationModule):
         # reindexed surprisals: B * (T - 1)
         surprisals = torch.gather(surps_shifted, 2, expected_ids.unsqueeze(2)) \
             .squeeze(2)
-        # This is the original, which works but not with multiple axes in expected_ids
-        # surprisals = surps_shifted[range(surps_shifted.shape[0]), expected_ids]
 
         # surprisals is now B * (T - 1)
 
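The comment removed in this hunk noted that the original fancy-indexing version did not handle multiple axes in expected_ids. The toy example below (shapes and values invented, not taken from the diff) shows the gather-based lookup doing that per-position selection across a batch, alongside an equivalent advanced-indexing form.

import torch

B, Tm1, V = 2, 5, 11                        # batch size, T - 1 positions, vocab size
surps_shifted = torch.randn(B, Tm1, V)      # stand-in for shifted per-position scores
expected_ids = torch.randint(V, (B, Tm1))   # observed next-token ids, B * (T - 1)

# Picks surps_shifted[b, t, expected_ids[b, t]] for every (b, t) pair, which the
# single-axis indexing in the removed comment cannot express once expected_ids
# carries a batch dimension.
surprisals = torch.gather(surps_shifted, 2, expected_ids.unsqueeze(2)).squeeze(2)
assert surprisals.shape == (B, Tm1)

# Equivalent advanced-indexing formulation, for comparison.
b_idx = torch.arange(B).unsqueeze(1)        # B * 1, broadcasts over positions
t_idx = torch.arange(Tm1).unsqueeze(0)      # 1 * (T - 1)
assert torch.equal(surprisals, surps_shifted[b_idx, t_idx, expected_ids])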