Spaces:
Sleeping
Sleeping
Jon Gauthier
committed on
Commit
·
0d58633
1
Parent(s):
dadceff
fall back to GPT2TokenizerFast for models which don't have a fast tokenizer (like OPT)
Browse files — syntaxgym.py +14 -1
syntaxgym.py
CHANGED
@@ -14,16 +14,21 @@
|
|
14 |
"""TODO: Add a description here."""
|
15 |
|
16 |
from collections import defaultdict
|
|
|
17 |
from typing import List, Dict, Tuple, NamedTuple
|
18 |
|
19 |
import datasets
|
20 |
import evaluate
|
21 |
import numpy as np
|
22 |
import torch
|
23 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM,
|
|
|
|
|
24 |
|
25 |
from .prediction import Prediction
|
26 |
|
|
|
|
|
27 |
|
28 |
_CITATION = """\
|
29 |
@inproceedings{Hu:et-al:2020,
|
@@ -108,7 +113,15 @@ def prepare_tokenizer(model, batch_size, add_start_token=True) -> Tuple[PreTrain
|
|
108 |
tokenizer:
|
109 |
tokenizer_kwargs: suggested kwargs for any tokenizer calls
|
110 |
"""
|
|
|
111 |
tokenizer = AutoTokenizer.from_pretrained(model.name_or_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
|
113 |
# if batch_size > 1 (which generally leads to padding being required), and
|
114 |
# if there is not an already assigned pad_token, assign an existing
|
|
|
14 |
"""TODO: Add a description here."""
|
15 |
|
16 |
from collections import defaultdict
|
17 |
+
import logging
|
18 |
from typing import List, Dict, Tuple, NamedTuple
|
19 |
|
20 |
import datasets
|
21 |
import evaluate
|
22 |
import numpy as np
|
23 |
import torch
|
24 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, \
|
25 |
+
PreTrainedTokenizer, PreTrainedTokenizerFast, \
|
26 |
+
GPT2TokenizerFast
|
27 |
|
28 |
from .prediction import Prediction
|
29 |
|
30 |
+
L = logging.getLogger(__name__)
|
31 |
+
|
32 |
|
33 |
_CITATION = """\
|
34 |
@inproceedings{Hu:et-al:2020,
|
|
|
113 |
tokenizer:
|
114 |
tokenizer_kwargs: suggested kwargs for any tokenizer calls
|
115 |
"""
|
116 |
+
|
117 |
tokenizer = AutoTokenizer.from_pretrained(model.name_or_path)
|
118 |
+
if not isinstance(tokenizer, PreTrainedTokenizerFast):
|
119 |
+
# We need a fast tokenizer because these are the only tokenizers that support
|
120 |
+
# return_offsets_mapping. Try to use GPT2 tokenizer -- this is sufficient for
|
121 |
+
# OPT.
|
122 |
+
L.warning(f"The model {model.name_or_path} does not have a fast tokenizer, "
|
123 |
+
f"which is required for this metric. Running with GPT2 tokenizer.")
|
124 |
+
tokenizer = GPT2TokenizerFast.from_pretrained(model.name_or_path)
|
125 |
|
126 |
# if batch_size > 1 (which generally leads to padding being required), and
|
127 |
# if there is not an already assigned pad_token, assign an existing
|