Spaces:
Sleeping
Sleeping
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
"""This is an implementation of the `CodeEval` metric that uses `RestrictedPython`
to execute the untrusted code returned by the model.
Lightly adapted and mostly copied verbatim from the implementation in `evaluate`.
"""
import contextlib | |
import faulthandler | |
import itertools | |
import io | |
import multiprocessing | |
import os | |
import platform | |
import signal | |
import tempfile | |
from collections import Counter, defaultdict | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
import evaluate | |
# from evaluate.metrics import code_eval | |
import datasets | |
import numpy as np | |
from RestrictedPython import compile_restricted, safe_builtins, limited_builtins, utility_builtins | |
# TODO: Add BibTeX citation | |
_CITATION = """\ | |
@InProceedings{huggingface:module, | |
title = {A great new module}, | |
authors={huggingface, Inc.}, | |
year={2020} | |
} | |
""" | |
# TODO: Add description of the module here | |
_DESCRIPTION = """\ | |
This module tries to extend the built in `code_eval` module to use restricted python. | |
""" | |
# TODO: Add description of the arguments of the module here | |
_KWARGS_DESCRIPTION = """ | |
Calculates how good are predictions given some references, using certain scores | |
Args: | |
predictions: list of candidates to evaluate. Each candidates should be a list | |
of strings with several code candidates to solve the problem. | |
references: a list with a test for each prediction. Each test should evaluate the | |
correctness of a code candidate. | |
k: number of code candidates to consider in the evaluation (Default: [1, 10, 100]) | |
num_workers: number of workers used to evaluate the canidate programs (Default: 4). | |
timeout: | |
use_safe_builtins: a bool indicating whether to use the `RestrictedPython.safe_builtins` | |
use_limited_builtins: a bool indicating whether to use the `RestrictedPython.limited_builtins` | |
use_utility_builtins: a bool indicating whether to use the `RestrictedPython.utility_builtins` | |
Returns: | |
pass_at_k: dict with pass rates for each k | |
results: dict with granular results of each unittest | |
Examples: | |
>>> code_eval = evaluate.load("code_eval") | |
>>> test_cases = ["assert add(2,3)==5"] | |
>>> candidates = [["def add(a,b): return a*b", "def add(a, b): return a+b"]] | |
>>> pass_at_k, results = code_eval.compute(references=test_cases, predictions=candidates, k=[1, 2]) | |
>>> print(pass_at_k) | |
{'pass@1': 0.5, 'pass@2': 1.0} | |
""" | |
_WARNING = """ | |
################################################################################ | |
!!!WARNING!!! | |
################################################################################ | |
The "code_eval" metric executes untrusted model-generated code in Python. | |
Although it is highly unlikely that model-generated code will do something | |
overtly malicious in response to this test suite, model-generated code may act | |
destructively due to a lack of model capability or alignment. | |
Users are strongly encouraged to sandbox this evaluation suite so that it | |
does not perform destructive actions on their host or network. For more | |
information on how OpenAI sandboxes its code, see the paper "Evaluating Large | |
Language Models Trained on Code" (https://arxiv.org/abs/2107.03374). | |
Once you have read this disclaimer and taken appropriate precautions, | |
set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can to this | |
with: | |
import os | |
os.environ["HF_ALLOW_CODE_EVAL"] = "1" | |
################################################################################\ | |
""" | |
# TODO: who has the copyright? | |
_LICENSE = """The MIT License | |
Copyright (c) OpenAI (https://openai.com) | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in | |
all copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
THE SOFTWARE.""" | |
class RestrictedPythonCodeEval(evaluate.Metric):
    """Exactly the same as the built in `code_eval` module, but using restricted python"""

    def _info(self):
        """Return the `evaluate.MetricInfo` describing this metric's inputs and metadata."""
        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features({
                'predictions': datasets.Sequence(datasets.Value("string")),
                'references': datasets.Value('string'),
            }),
            # Homepage of the module for documentation
            homepage="http://module.homepage",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"]
        )

    def _compute(self, predictions, references, k=(1, 10, 100), num_workers=4, timeout=3.0,
                 use_safe_builtins: bool = True, use_limited_builtins: bool = True,
                 use_utility_builtins: bool = True):
        """Run every candidate against its test and return the scores.

        Args:
            predictions: one list of candidate program strings per task.
            references: one test program string per task; it is appended to
                each candidate (separated by a newline) before execution.
            k: iterable of ``k`` values for which pass@k is reported. A value
                is skipped unless every task has at least ``k`` candidates.
            num_workers: size of the thread pool that dispatches executions.
            timeout: time limit in seconds handed to each execution.
            use_safe_builtins: expose `RestrictedPython.safe_builtins`.
            use_limited_builtins: expose `RestrictedPython.limited_builtins`.
            use_utility_builtins: expose `RestrictedPython.utility_builtins`.

        Returns:
            A tuple ``(pass_at_k, results)``: ``pass_at_k`` maps ``"pass@<k>"``
            to the mean estimate over tasks, and ``results`` maps each task id
            to a list of ``(completion_id, result_dict)`` pairs sorted by
            completion id.

        Raises:
            ValueError: if the environment variable ``HF_ALLOW_CODE_EVAL`` is
                not set to ``"1"``.
            NotImplementedError: when running on Windows.
        """
        if os.getenv("HF_ALLOW_CODE_EVAL") != "1":
            raise ValueError(_WARNING)

        if os.name == "nt":
            raise NotImplementedError("This metric is currently not supported on Windows.")

        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            futures = []
            completion_id = Counter()
            results = defaultdict(list)

            for task_id, (candidates, test_case) in enumerate(zip(predictions, references)):
                for candidate in candidates:
                    test_program = candidate + "\n" + test_case
                    args = (
                        test_program,
                        timeout,
                        task_id,
                        completion_id[task_id],
                        use_safe_builtins,
                        use_limited_builtins,
                        use_utility_builtins,
                    )
                    futures.append(executor.submit(_check_correctness, *args))
                    completion_id[task_id] += 1

            for future in as_completed(futures):
                result = future.result()
                results[result["task_id"]].append((result["completion_id"], result))

        total, correct = [], []
        for task_results in results.values():
            # Futures complete in arbitrary order; restore submission order.
            task_results.sort()
            passed = [entry[1]["passed"] for entry in task_results]
            total.append(len(passed))
            correct.append(sum(passed))
        total = np.array(total)
        correct = np.array(correct)

        pass_at_k = {
            f"pass@{k_value}": estimate_pass_at_k(total, correct, k_value).mean()
            for k_value in k
            if (total >= k_value).all()
        }

        return pass_at_k, results
def estimate_pass_at_k(num_samples, num_correct, k):
    """Estimates pass@k of each problem and returns them in an array.

    Args:
        num_samples: either a single integer (Python or NumPy) applied to
            every problem, or a sequence with one sample count per problem.
        num_correct: sequence with the number of passing samples per problem.
        k: the ``k`` in pass@k.

    Returns:
        A NumPy array with one pass@k estimate per entry of ``num_correct``.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """Calculates 1 - comb(n - c, k) / comb(n, k)."""
        if n - c < k:
            # Fewer than k failing samples: any k-subset contains a pass.
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))  # type: ignore

    # NumPy integer scalars are not `int` instances but are valid scalar counts.
    if isinstance(num_samples, (int, np.integer)):
        num_samples_it = itertools.repeat(int(num_samples), len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
def _check_correctness(check_program, timeout, task_id, completion_id,
                       use_safe_builtins: bool = True, use_limited_builtins: bool = True,
                       use_utility_builtins: bool = True):
    """
    Evaluates the functional correctness of a completion by running the test
    suite provided in the problem.

    The program is run by `_unsafe_execute` in a separate process, which
    reports its outcome through a manager-backed list.

    :param check_program: source text of the candidate followed by its test.
    :param timeout: time limit in seconds passed to the child; the parent
        waits one extra second before killing the child.
    :param task_id: identifier of the task, echoed back in the result.
    :param completion_id: an optional completion ID so we can match
        the results later even if execution finishes asynchronously.
    :param use_safe_builtins: forwarded to `_unsafe_execute`.
    :param use_limited_builtins: forwarded to `_unsafe_execute`.
    :param use_utility_builtins: forwarded to `_unsafe_execute`.
    :return: dict with keys ``task_id``, ``passed``, ``result`` and
        ``completion_id``.
    """
    # The context manager shuts the manager's server process down on exit
    # instead of leaving one behind for every evaluated program.
    with multiprocessing.Manager() as manager:
        result = manager.list()

        p = multiprocessing.Process(
            target=_unsafe_execute,
            args=(check_program, result, timeout, use_safe_builtins, use_limited_builtins, use_utility_builtins),
        )
        p.start()
        p.join(timeout=timeout + 1)
        if p.is_alive():
            p.kill()
            # Reap the killed child so it does not linger as a zombie.
            p.join()

        # Read the outcome while the manager is still alive; a child that
        # reported nothing is treated as a timeout.
        outcome = result[0] if len(result) > 0 else "timed out"

    return dict(
        task_id=task_id,
        passed=outcome == "passed",
        result=outcome,
        completion_id=completion_id,
    )
def _unsafe_execute(check_program, result, timeout,
                    use_safe_builtins: bool = True, use_limited_builtins: bool = True,
                    use_utility_builtins: bool = True):
    """Compile `check_program` with RestrictedPython, run it, and record the outcome.

    Exactly one string is appended to `result`: ``"passed"``, ``"timed out"``
    or ``"failed: <exception message>"``.

    :param check_program: source text to compile and execute.
    :param result: list-like object receiving the outcome string.
    :param timeout: time limit in seconds enforced through `time_limit`.
    :param use_safe_builtins: add `RestrictedPython.safe_builtins` to the builtins.
    :param use_limited_builtins: add `RestrictedPython.limited_builtins` to the builtins.
    :param use_utility_builtins: add `RestrictedPython.utility_builtins` to the builtins.
    """
    with create_tempdir():
        # These system calls are needed when cleaning up tempdir.
        import os
        import shutil

        saved_rmtree = shutil.rmtree
        saved_rmdir = os.rmdir
        saved_chdir = os.chdir

        # Disable functionalities that can make destructive changes to the test.
        reliability_guard()

        # Run program.
        try:
            allowed_builtins = {}
            if use_safe_builtins:
                allowed_builtins.update(safe_builtins)
            if use_limited_builtins:
                allowed_builtins.update(limited_builtins)
            if use_utility_builtins:
                allowed_builtins.update(utility_builtins)

            # NOTE(review): only `__builtins__` is supplied; no RestrictedPython
            # guard hooks (e.g. `_getattr_`, `_getiter_`) are put in the globals.
            # Confirm this is intended for the programs being evaluated.
            exec_globals = {'__builtins__': allowed_builtins}
            with swallow_io():
                with time_limit(timeout):
                    byte_code = compile_restricted(check_program, filename="<model output>", mode="exec")
                    exec(byte_code, exec_globals, None)
            result.append("passed")
        except TimeoutException:
            result.append("timed out")
        except BaseException as e:
            result.append(f"failed: {e}")
        finally:
            # Needed for cleaning up: restore even if reporting the outcome fails.
            shutil.rmtree = saved_rmtree
            os.rmdir = saved_rmdir
            os.chdir = saved_chdir
@contextlib.contextmanager
def time_limit(seconds):
    """Context manager that raises `TimeoutException` if its body runs longer than `seconds`.

    Uses `signal.ITIMER_REAL` / `SIGALRM`. The timer is always cleared when
    the body exits.
    """

    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # Install the handler before arming the timer so an early alarm cannot
    # be delivered while the previous SIGALRM disposition is still in place.
    signal.signal(signal.SIGALRM, signal_handler)
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)
@contextlib.contextmanager
def swallow_io():
    """Context manager that redirects stdout, stderr and stdin to one `WriteOnlyStringIO`.

    Output written inside the body goes to that in-memory stream; reading
    from stdin raises, because the stream refuses reads.
    """
    stream = WriteOnlyStringIO()
    with contextlib.redirect_stdout(stream):
        with contextlib.redirect_stderr(stream):
            with redirect_stdin(stream):
                yield
@contextlib.contextmanager
def create_tempdir():
    """Context manager that creates a temporary directory, makes it the working directory, and yields its path."""
    with tempfile.TemporaryDirectory() as dirname:
        with chdir(dirname):
            yield dirname
class TimeoutException(Exception):
    """Raised by the `time_limit` signal handler when the time limit expires."""
class WriteOnlyStringIO(io.StringIO):
    """StringIO that throws an exception when it's read from"""

    def read(self, *args, **kwargs):
        """Always raise `OSError`; reading is not allowed."""
        raise OSError

    def readline(self, *args, **kwargs):
        """Always raise `OSError`; reading is not allowed."""
        raise OSError

    def readlines(self, *args, **kwargs):
        """Always raise `OSError`; reading is not allowed."""
        raise OSError

    def readable(self, *args, **kwargs):
        """Returns True if the IO object can be read."""
        return False
class redirect_stdin(contextlib._RedirectStream):  # type: ignore
    """Context manager that temporarily replaces `sys.stdin`, mirroring `contextlib.redirect_stdout`."""

    # Name of the `sys` attribute that `_RedirectStream` swaps.
    _stream = "stdin"
@contextlib.contextmanager
def chdir(root):
    """Context manager that switches the working directory to `root` and restores it on exit.

    When `root` is ``"."`` the working directory is left untouched.
    """
    if root == ".":
        yield
        return
    cwd = os.getcwd()
    os.chdir(root)
    try:
        yield
    finally:
        # Exceptions from the body propagate unchanged after the restore.
        os.chdir(cwd)
def reliability_guard(maximum_memory_bytes=None):
    """
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)

    The functions are disabled by overwriting module attributes with `None`
    for the whole current process, so this is meant to run in a throwaway
    child process.

    :param maximum_memory_bytes: when not `None`, applied as the address-space
        and data-segment limit, and as the stack limit except on Darwin.

    WARNING
    This function is NOT a security sandbox. Untrusted code, including, model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.
    """

    if maximum_memory_bytes is not None:
        import resource

        resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
        resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
        if not platform.uname().system == "Darwin":
            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))

    faulthandler.disable()

    import builtins

    builtins.exit = None
    builtins.quit = None
    # Set through the `builtins` module: the `__builtins__` global is a dict
    # in imported modules but a module object in `__main__`, so subscripting
    # it is not portable.
    builtins.help = None

    import os

    os.environ["OMP_NUM_THREADS"] = "1"

    os.kill = None
    os.system = None
    os.putenv = None
    os.remove = None
    os.removedirs = None
    os.rmdir = None
    os.fchdir = None
    os.setuid = None
    os.fork = None
    os.forkpty = None
    os.killpg = None
    os.rename = None
    os.renames = None
    os.truncate = None
    os.replace = None
    os.unlink = None
    os.fchmod = None
    os.fchown = None
    os.chmod = None
    os.chown = None
    os.chroot = None
    os.lchflags = None
    os.lchmod = None
    os.lchown = None
    os.getcwd = None
    os.chdir = None

    import shutil

    shutil.rmtree = None
    shutil.move = None
    shutil.chown = None

    import subprocess

    subprocess.Popen = None  # type: ignore

    import sys

    # A `None` entry in `sys.modules` makes a later `import` of that name fail.
    sys.modules["ipdb"] = None  # type: ignore
    sys.modules["joblib"] = None  # type: ignore
    sys.modules["resource"] = None  # type: ignore
    sys.modules["psutil"] = None  # type: ignore
    sys.modules["tkinter"] = None  # type: ignore