yu-val-weiss committed
Commit c26f589 · 1 Parent(s): 0f3b529

trust remote code

Files changed (2):
  1. README.md +2 -1
  2. blimp.py +11 -3
README.md CHANGED

```diff
@@ -47,7 +47,8 @@ results = blimp.compute(model_id='pico-lm/pico-decoder')
 - **batch_size** (int): the batch size to run texts through the model. Defaults to 16.
 - **predictions** (list[str]): names of metrics to run. pass empty list or `["*"]` to run all of them
 - **device** (str): device to run on, defaults to `cuda` when available
-- **samples_per_set** (int): the number of samples per metric, defaults to 1_000. Maximum 1_000 (enforced with a `min` call).
+- **samples_per_set** (Optional[int]): the number of samples per metric. Maximum 1_000 (enforced with a `min` call). If None, defaults to 1000.
+- **trust_remote_code** (bool): whether to trust datasets code, default False.
 
 ### Output Values
 
```
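For quick reference, a minimal usage sketch of the new flag. The `evaluate.load` path is illustrative and not confirmed by this diff (substitute this repo's actual id); `model_id` follows the README example above.

```python
import evaluate

# Load the BLiMP metric module (repo path assumed, not confirmed by this commit)
blimp = evaluate.load("pico-lm/blimp", module_type="metric")

# trust_remote_code=True lets the underlying model, tokenizer, and dataset
# loaders execute custom code shipped in the corresponding Hub repos
results = blimp.compute(
    model_id="pico-lm/pico-decoder",
    trust_remote_code=True,  # new in this commit; defaults to False
)
print(results)
```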
blimp.py CHANGED

```diff
@@ -125,6 +125,7 @@ Args:
     batch_size (int): the batch size to run texts through the model. Defaults to 16.
     device (str): device to run on, defaults to 'cuda' when available.
     samples_per_set (Optional[int]): the number of samples per phenomenon. Max is 1,000 (but will not error if higher value given.) If None, defaults to 1000.
+    trust_remote_code (bool): whether to trust datasets code, default False.
 
 Returns:
     blimp: dictionary containing the blimp scores for each of the 67 sub-datasets, as well as the overall accuracy.
@@ -158,6 +159,7 @@ class Blimp(evaluate.Metric):
         batch_size: int = 16,
         device=None,
         samples_per_set: Optional[int] = None,
+        trust_remote_code: bool = False,
     ):
         if device is not None:
             assert device in ["gpu", "cpu", "cuda", "mps"], (
@@ -175,11 +177,15 @@ class Blimp(evaluate.Metric):
         if samples_per_set is None or samples_per_set <= 0:
             samples_per_set = 1000
 
-        model = AutoModelForCausalLM.from_pretrained(model_id)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id, trust_remote_code=trust_remote_code
+        )
         model = model.to(device)
         model.eval()
 
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id, trust_remote_code=trust_remote_code
+        )
 
         # if batch_size > 1 (which generally leads to padding being required), and
         # if there is not an already assigned pad_token, assign an existing
@@ -213,7 +219,9 @@ class Blimp(evaluate.Metric):
         phenom_results = defaultdict(list)
 
         for category in logging.tqdm(blimp_sets, desc="Evaluating phenomena..."):
-            dataset = datasets.load_dataset("nyu-mll/blimp", category)["train"]
+            dataset = datasets.load_dataset(
+                "nyu-mll/blimp", category, trust_remote_code=trust_remote_code
+            )["train"]
 
             # Prepare batches of good and bad sentences
 
```
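As context for why the flag is threaded through three places: it is a pure pass-through to the Hugging Face loaders the metric calls. A standalone sketch of the equivalent calls, assuming the README's example model id; the BLiMP config name shown is just one of the 67 sub-datasets.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import datasets

# Required when a Hub repo ships custom modeling or dataset-loading code
trust_remote_code = True

# The same three call sites the metric now forwards the flag to:
model = AutoModelForCausalLM.from_pretrained(
    "pico-lm/pico-decoder", trust_remote_code=trust_remote_code
)
tokenizer = AutoTokenizer.from_pretrained(
    "pico-lm/pico-decoder", trust_remote_code=trust_remote_code
)
dataset = datasets.load_dataset(
    "nyu-mll/blimp", "anaphor_gender_agreement", trust_remote_code=trust_remote_code
)["train"]
```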
227