guydav committed on
Commit 4fcd593
1 Parent(s): fda1312

First pass at the restricted python code eval

Files changed (2)
  1. requirements.txt +2 -1
  2. restrictedpython_code_eval.py +366 -30
requirements.txt CHANGED
@@ -1 +1,2 @@
- git+https://github.com/huggingface/evaluate@main
+ git+https://github.com/huggingface/evaluate@main
+ RestrictedPython
restrictedpython_code_eval.py CHANGED
@@ -11,10 +11,29 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- """TODO: Add a description here."""
+ """This is an implementation of the `CodeEval` metric that uses `RestrictedPython`
+ to execute the untrusted code returned by the model.
+ Lightly adapted and mostly copied verbatim from the implementation in `evaluate`.
+ """
+
+ import contextlib
+ import faulthandler
+ import itertools
+ import io
+ import multiprocessing
+ import os
+ import platform
+ import signal
+ import tempfile
+
+ from collections import Counter, defaultdict
+ from concurrent.futures import ThreadPoolExecutor, as_completed
 
  import evaluate
+ # from evaluate.metrics import code_eval
  import datasets
+ import numpy as np
+ from RestrictedPython import compile_restricted, safe_builtins, limited_builtins, utility_builtins
 
 
  # TODO: Add BibTeX citation
@@ -28,7 +47,7 @@ year={2020}
 
  # TODO: Add description of the module here
  _DESCRIPTION = """\
- This new module is designed to solve this great ML task and is crafted with a lot of care.
+ This module extends the built-in `code_eval` module to use RestrictedPython.
  """
 
 
@@ -36,30 +55,77 @@ This new module is designed to solve this great ML task and is crafted with a lo
  _KWARGS_DESCRIPTION = """
  Calculates how good are predictions given some references, using certain scores
  Args:
-     predictions: list of predictions to score. Each predictions
-         should be a string with tokens separated by spaces.
-     references: list of reference for each prediction. Each
-         reference should be a string with tokens separated by spaces.
+     predictions: list of candidates to evaluate. Each candidate should be a list
+         of strings with several code candidates to solve the problem.
+     references: a list with a test for each prediction. Each test should evaluate the
+         correctness of a code candidate.
+     k: number of code candidates to consider in the evaluation (Default: [1, 10, 100])
+     num_workers: number of workers used to evaluate the candidate programs (Default: 4).
+     timeout: maximum time in seconds allowed for the execution of each candidate program (Default: 3.0).
+     use_safe_builtins: a bool indicating whether to use the `RestrictedPython.safe_builtins`
+     use_limited_builtins: a bool indicating whether to use the `RestrictedPython.limited_builtins`
+     use_utility_builtins: a bool indicating whether to use the `RestrictedPython.utility_builtins`
  Returns:
-     accuracy: description of the first score,
-     another_score: description of the second score,
+     pass_at_k: dict with pass rates for each k
+     results: dict with granular results of each unittest
  Examples:
-     Examples should be written in doctest format, and should illustrate how
-     to use the function.
-
-     >>> my_new_module = evaluate.load("my_new_module")
-     >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
-     >>> print(results)
-     {'accuracy': 1.0}
+     >>> code_eval = evaluate.load("restrictedpython_code_eval")
+     >>> test_cases = ["assert add(2,3)==5"]
+     >>> candidates = [["def add(a,b): return a*b", "def add(a, b): return a+b"]]
+     >>> pass_at_k, results = code_eval.compute(references=test_cases, predictions=candidates, k=[1, 2])
+     >>> print(pass_at_k)
+     {'pass@1': 0.5, 'pass@2': 1.0}
+ """
+
+ _WARNING = """
+ ################################################################################
+                                   !!!WARNING!!!
+ ################################################################################
+ The "code_eval" metric executes untrusted model-generated code in Python.
+ Although it is highly unlikely that model-generated code will do something
+ overtly malicious in response to this test suite, model-generated code may act
+ destructively due to a lack of model capability or alignment.
+ Users are strongly encouraged to sandbox this evaluation suite so that it
+ does not perform destructive actions on their host or network. For more
+ information on how OpenAI sandboxes its code, see the paper "Evaluating Large
+ Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
+
+ Once you have read this disclaimer and taken appropriate precautions,
+ set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can do this
+ with:
+
+ >>> import os
+ >>> os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+
+ ################################################################################\
  """
 
- # TODO: Define external resources urls if needed
- BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
+ # TODO: who has the copyright?
+ _LICENSE = """The MIT License
+
+ Copyright (c) OpenAI (https://openai.com)
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE."""
 
 
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
  class RestrictedPythonCodeEval(evaluate.Metric):
-     """TODO: Short description of my evaluation module."""
+     """Exactly the same as the built-in `code_eval` module, but using RestrictedPython"""
 
      def _info(self):
          # TODO: Specifies the evaluate.EvaluationModuleInfo object
@@ -71,8 +137,8 @@ class RestrictedPythonCodeEval(evaluate.Metric):
              inputs_description=_KWARGS_DESCRIPTION,
              # This defines the format of each prediction and reference
              features=datasets.Features({
-                 'predictions': datasets.Value('int64'),
-                 'references': datasets.Value('int64'),
+                 'predictions': datasets.Sequence(datasets.Value("string")),
+                 'references': datasets.Value('string'),
              }),
              # Homepage of the module for documentation
              homepage="http://module.homepage",
@@ -81,15 +147,285 @@ class RestrictedPythonCodeEval(evaluate.Metric):
              reference_urls=["http://path.to.reference.url/new_module"]
          )
 
-     def _download_and_prepare(self, dl_manager):
-         """Optional: download external resources useful to compute the scores"""
-         # TODO: Download external resources if needed
-         pass
-
-     def _compute(self, predictions, references):
+     def _compute(self, predictions, references, k=[1, 10, 100], num_workers=4, timeout=3.0,
+                  use_safe_builtins: bool = True, use_limited_builtins: bool = True, use_utility_builtins: bool = True):
          """Returns the scores"""
-         # TODO: Compute the different scores of the module
-         accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
-         return {
-             "accuracy": accuracy,
-         }
+
+         if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
+             raise ValueError(_WARNING)
+
+         if os.name == "nt":
+             raise NotImplementedError("This metric is currently not supported on Windows.")
+
+         with ThreadPoolExecutor(max_workers=num_workers) as executor:
+             futures = []
+             completion_id = Counter()
+             n_samples = 0
+             results = defaultdict(list)
+
+             for task_id, (candidates, test_case) in enumerate(zip(predictions, references)):
+                 for candidate in candidates:
+                     test_program = candidate + "\n" + test_case
+                     args = (test_program, timeout, task_id, completion_id[task_id], use_safe_builtins, use_limited_builtins, use_utility_builtins)
+                     future = executor.submit(_check_correctness, *args)
+                     futures.append(future)
+                     completion_id[task_id] += 1
+                     n_samples += 1
+
+             for future in as_completed(futures):
+                 result = future.result()
+                 results[result["task_id"]].append((result["completion_id"], result))
+
+         total, correct = [], []
+         for result in results.values():
+             result.sort()
+             passed = [r[1]["passed"] for r in result]
+             total.append(len(passed))
+             correct.append(sum(passed))
+         total = np.array(total)
+         correct = np.array(correct)
+
+         ks = k
+         pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean() for k in ks if (total >= k).all()}
+
+         return pass_at_k, results
+
+
+ def estimate_pass_at_k(num_samples, num_correct, k):
+     """Estimates pass@k of each problem and returns them in an array."""
+
+     def estimator(n: int, c: int, k: int) -> float:
+         """Calculates 1 - comb(n - c, k) / comb(n, k)."""
+         if n - c < k:
+             return 1.0
+         return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))  # type: ignore
+
+     if isinstance(num_samples, int):
+         num_samples_it = itertools.repeat(num_samples, len(num_correct))
+     else:
+         assert len(num_samples) == len(num_correct)
+         num_samples_it = iter(num_samples)
+
+     return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
+
+
+ def _check_correctness(check_program, timeout, task_id, completion_id,
+                        use_safe_builtins: bool = True, use_limited_builtins: bool = True, use_utility_builtins: bool = True):
+     """
+     Evaluates the functional correctness of a completion by running the test
+     suite provided in the problem.
+
+     :param completion_id: an optional completion ID so we can match
+         the results later even if execution finishes asynchronously.
+     """
+     manager = multiprocessing.Manager()
+     result = manager.list()
+
+     p = multiprocessing.Process(target=_unsafe_execute, args=(check_program, result, timeout, use_safe_builtins, use_limited_builtins, use_utility_builtins))
+     p.start()
+     p.join(timeout=timeout + 1)
+     if p.is_alive():
+         p.kill()
+
+     if not result:
+         result.append("timed out")
+
+     return dict(
+         task_id=task_id,
+         passed=result[0] == "passed",
+         result=result[0],
+         completion_id=completion_id,
+     )
+
+
+ def _unsafe_execute(check_program, result, timeout,
+                     use_safe_builtins: bool = True, use_limited_builtins: bool = True, use_utility_builtins: bool = True):
+
+     with create_tempdir():
+
+         # These system calls are needed when cleaning up tempdir.
+         import os
+         import shutil
+
+         rmtree = shutil.rmtree
+         rmdir = os.rmdir
+         chdir = os.chdir
+
+         # Disable functionalities that can make destructive changes to the test.
+         reliability_guard()
+
+         # Run program.
+         try:
+             builtins = {}
+             if use_safe_builtins:
+                 builtins.update(safe_builtins)
+             if use_limited_builtins:
+                 builtins.update(limited_builtins)
+             if use_utility_builtins:
+                 builtins.update(utility_builtins)
+
+             exec_globals = {'__builtins__': builtins}
+             with swallow_io():
+                 with time_limit(timeout):
+                     byte_code = compile_restricted(check_program, filename="<model output>", mode="exec")
+                     exec(byte_code, exec_globals, None)
+             result.append("passed")
+         except TimeoutException:
+             result.append("timed out")
+         except BaseException as e:
+             result.append(f"failed: {e}")
+
+         # Needed for cleaning up.
+         shutil.rmtree = rmtree
+         os.rmdir = rmdir
+         os.chdir = chdir
+
+
+ @contextlib.contextmanager
+ def time_limit(seconds):
+     def signal_handler(signum, frame):
+         raise TimeoutException("Timed out!")
+
+     signal.setitimer(signal.ITIMER_REAL, seconds)
+     signal.signal(signal.SIGALRM, signal_handler)
+     try:
+         yield
+     finally:
+         signal.setitimer(signal.ITIMER_REAL, 0)
+
+
+ @contextlib.contextmanager
+ def swallow_io():
+     stream = WriteOnlyStringIO()
+     with contextlib.redirect_stdout(stream):
+         with contextlib.redirect_stderr(stream):
+             with redirect_stdin(stream):
+                 yield
+
+
+ @contextlib.contextmanager
+ def create_tempdir():
+     with tempfile.TemporaryDirectory() as dirname:
+         with chdir(dirname):
+             yield dirname
+
+
+ class TimeoutException(Exception):
+     pass
+
+
+ class WriteOnlyStringIO(io.StringIO):
+     """StringIO that throws an exception when it's read from"""
+
+     def read(self, *args, **kwargs):
+         raise OSError
+
+     def readline(self, *args, **kwargs):
+         raise OSError
+
+     def readlines(self, *args, **kwargs):
+         raise OSError
+
+     def readable(self, *args, **kwargs):
+         """Returns True if the IO object can be read."""
+         return False
+
+
+ class redirect_stdin(contextlib._RedirectStream):  # type: ignore
+     _stream = "stdin"
+
+
+ @contextlib.contextmanager
+ def chdir(root):
+     if root == ".":
+         yield
+         return
+     cwd = os.getcwd()
+     os.chdir(root)
+     try:
+         yield
+     except BaseException as exc:
+         raise exc
+     finally:
+         os.chdir(cwd)
+
+
+ def reliability_guard(maximum_memory_bytes=None):
+     """
+     This disables various destructive functions and prevents the generated code
+     from interfering with the test (e.g. fork bomb, killing other processes,
+     removing filesystem files, etc.)
+
+     WARNING
+     This function is NOT a security sandbox. Untrusted code, including model-
+     generated code, should not be blindly executed outside of one. See the
+     Codex paper for more information about OpenAI's code sandbox, and proceed
+     with caution.
+     """
+
+     if maximum_memory_bytes is not None:
+         import resource
+
+         resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
+         resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
+         if not platform.uname().system == "Darwin":
+             resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
+
+     faulthandler.disable()
+
+     import builtins
+
+     builtins.exit = None
+     builtins.quit = None
+
+     import os
+
+     os.environ["OMP_NUM_THREADS"] = "1"
+
+     os.kill = None
+     os.system = None
+     os.putenv = None
+     os.remove = None
+     os.removedirs = None
+     os.rmdir = None
+     os.fchdir = None
+     os.setuid = None
+     os.fork = None
+     os.forkpty = None
+     os.killpg = None
+     os.rename = None
+     os.renames = None
+     os.truncate = None
+     os.replace = None
+     os.unlink = None
+     os.fchmod = None
+     os.fchown = None
+     os.chmod = None
+     os.chown = None
+     os.chroot = None
+     os.lchflags = None
+     os.lchmod = None
+     os.lchown = None
+     os.getcwd = None
+     os.chdir = None
+
+     import shutil
+
+     shutil.rmtree = None
+     shutil.move = None
+     shutil.chown = None
+
+     import subprocess
+
+     subprocess.Popen = None  # type: ignore
+
+     __builtins__["help"] = None
+
+     import sys
+
+     sys.modules["ipdb"] = None  # type: ignore
+     sys.modules["joblib"] = None  # type: ignore
+     sys.modules["resource"] = None  # type: ignore
+     sys.modules["psutil"] = None  # type: ignore
+     sys.modules["tkinter"] = None  # type: ignore