yu-val-weiss
committed on
Commit
·
c26f589
1
Parent(s):
0f3b529
trust remote code
Browse files
README.md
CHANGED
@@ -47,7 +47,8 @@ results = blimp.compute(model_id='pico-lm/pico-decoder')
|
|
47 |
- **batch_size** (int): the batch size to run texts through the model. Defaults to 16.
|
48 |
- **predictions** (list[str]): names of metrics to run. pass empty list or `["*"]` to run all of them
|
49 |
- **device** (str): device to run on, defaults to `cuda` when available
|
50 |
-
- **samples_per_set** (int): the number of samples per metric
|
|
|
51 |
|
52 |
### Output Values
|
53 |
|
|
|
47 |
- **batch_size** (int): the batch size to run texts through the model. Defaults to 16.
|
48 |
- **predictions** (list[str]): names of metrics to run. pass empty list or `["*"]` to run all of them
|
49 |
- **device** (str): device to run on, defaults to `cuda` when available
|
50 |
+
- **samples_per_set** (Optional[int]): the number of samples per metric. Maximum 1_000 (enforced with a `min` call). If None, defaults to 1000.
|
51 |
+
- **trust_remote_code** (bool): whether to trust remote dataset code. Defaults to False.
|
52 |
|
53 |
### Output Values
|
54 |
|
blimp.py
CHANGED
@@ -125,6 +125,7 @@ Args:
|
|
125 |
batch_size (int): the batch size to run texts through the model. Defaults to 16.
|
126 |
device (str): device to run on, defaults to 'cuda' when available.
|
127 |
samples_per_set (Optional[int]): the number of samples per phenomenon. Max is 1,000 (higher values are clamped without error). If None, defaults to 1000.
|
|
|
128 |
|
129 |
Returns:
|
130 |
blimp: dictionary containing the blimp scores for each of the 67 sub-datasets, as well as the overall accuracy.
|
@@ -158,6 +159,7 @@ class Blimp(evaluate.Metric):
|
|
158 |
batch_size: int = 16,
|
159 |
device=None,
|
160 |
samples_per_set: Optional[int] = None,
|
|
|
161 |
):
|
162 |
if device is not None:
|
163 |
assert device in ["gpu", "cpu", "cuda", "mps"], (
|
@@ -175,11 +177,15 @@ class Blimp(evaluate.Metric):
|
|
175 |
if samples_per_set is None or samples_per_set <= 0:
|
176 |
samples_per_set = 1000
|
177 |
|
178 |
-
model = AutoModelForCausalLM.from_pretrained(
|
|
|
|
|
179 |
model = model.to(device)
|
180 |
model.eval()
|
181 |
|
182 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
|
|
|
|
183 |
|
184 |
# if batch_size > 1 (which generally leads to padding being required), and
|
185 |
# if there is not an already assigned pad_token, assign an existing
|
@@ -213,7 +219,9 @@ class Blimp(evaluate.Metric):
|
|
213 |
phenom_results = defaultdict(list)
|
214 |
|
215 |
for category in logging.tqdm(blimp_sets, desc="Evaluating phenomena..."):
|
216 |
-
dataset = datasets.load_dataset(
|
|
|
|
|
217 |
|
218 |
# Prepare batches of good and bad sentences
|
219 |
|
|
|
125 |
batch_size (int): the batch size to run texts through the model. Defaults to 16.
|
126 |
device (str): device to run on, defaults to 'cuda' when available.
|
127 |
samples_per_set (Optional[int]): the number of samples per phenomenon. Max is 1,000 (higher values are clamped without error). If None, defaults to 1000.
|
128 |
+
trust_remote_code (bool): whether to trust remote dataset code. Defaults to False.
|
129 |
|
130 |
Returns:
|
131 |
blimp: dictionary containing the blimp scores for each of the 67 sub-datasets, as well as the overall accuracy.
|
|
|
159 |
batch_size: int = 16,
|
160 |
device=None,
|
161 |
samples_per_set: Optional[int] = None,
|
162 |
+
trust_remote_code: bool = False,
|
163 |
):
|
164 |
if device is not None:
|
165 |
assert device in ["gpu", "cpu", "cuda", "mps"], (
|
|
|
177 |
if samples_per_set is None or samples_per_set <= 0:
|
178 |
samples_per_set = 1000
|
179 |
|
180 |
+
model = AutoModelForCausalLM.from_pretrained(
|
181 |
+
model_id, trust_remote_code=trust_remote_code
|
182 |
+
)
|
183 |
model = model.to(device)
|
184 |
model.eval()
|
185 |
|
186 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
187 |
+
model_id, trust_remote_code=trust_remote_code
|
188 |
+
)
|
189 |
|
190 |
# if batch_size > 1 (which generally leads to padding being required), and
|
191 |
# if there is not an already assigned pad_token, assign an existing
|
|
|
219 |
phenom_results = defaultdict(list)
|
220 |
|
221 |
for category in logging.tqdm(blimp_sets, desc="Evaluating phenomena..."):
|
222 |
+
dataset = datasets.load_dataset(
|
223 |
+
"nyu-mll/blimp", category, trust_remote_code=trust_remote_code
|
224 |
+
)["train"]
|
225 |
|
226 |
# Prepare batches of good and bad sentences
|
227 |
|