"""TODO: Add a description here.""" |
|
import collections |
|
import os |
|
|
|
from typing import Literal |
|
|
|
import concurrent.futures |
|
import datasets |
|
import evaluate |
|
import itertools |
|
import numpy as np |
|
from tqdm import tqdm |
|
|
|
from .execute import check_correctness |
|
|
|
|
|
_CITATION = """\ |
|
@InProceedings{huggingface:module, |
|
title = {A great new module}, |
|
authors={huggingface, Inc.}, |
|
year={2020} |
|
} |
|
""" |
|
|
|
|
|
_DESCRIPTION = """\ |
|
This new module is designed to solve this great ML task and is crafted with a lot of care. |
|
""" |
|
|
|
|
|
|
|


_KWARGS_DESCRIPTION = """
Estimates pass@k for model-generated C++ candidates by compiling and running them.
Args:
    predictions: list of candidate lists. Each inner list holds the generated
        C++ source strings for one task.
    references: one reference per task, a dict with the string fields "tests"
        and "invalids".
    cpp_type: which variant to evaluate, one of "base", "sfinae" or "concepts".
    k: list of k values at which pass@k is computed (default [1, 10, 100]).
Returns:
    a pair of (scores, detailed results). The scores map keys such as
    "base_run_passed@1" to the mean pass@k estimate over all tasks.
Examples (illustrative only; the load path is a placeholder):

    >>> metric = evaluate.load("path/to/this/module")
    >>> scores, details = metric.compute(
    ...     predictions=[["...candidate C++ source..."]],
    ...     references=[{"tests": "...", "invalids": "..."}],
    ...     cpp_type="base",
    ...     k=[1],
    ... )
"""
_WARNING = """ |
|
################################################################################ |
|
!!!WARNING!!! |
|
################################################################################ |
|
The "code_eval" metric executes untrusted model-generated code in Python. |
|
Although it is highly unlikely that model-generated code will do something |
|
overtly malicious in response to this test suite, model-generated code may act |
|
destructively due to a lack of model capability or alignment. |
|
Users are strongly encouraged to sandbox this evaluation suite so that it |
|
does not perform destructive actions on their host or network. For more |
|
information on how OpenAI sandboxes its code, see the paper "Evaluating Large |
|
Language Models Trained on Code" (https://arxiv.org/abs/2107.03374). |
|
Once you have read this disclaimer and taken appropriate precautions, |
|
set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can to this |
|
with: |
|
>>> import os |
|
>>> os.environ["HF_ALLOW_CODE_EVAL"] = "1" |
|
################################################################################\ |
|
""" |
|
|
|

_CLANG_WARNING = """
Please provide the environment variable 'GENERICIFY_CLANG' with the path of the
clang++ compiler. Version 15+ is required. Within Python you can do this with:
>>> import os
>>> os.environ["GENERICIFY_CLANG"] = "/path/to/clang++"
"""


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION
)
class EvaluateGenericifyCpp(evaluate.Metric):
    """Compiles and runs generated C++ candidates and reports pass@k scores."""

    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("string")),
                    "references": datasets.Features(
                        {
                            "tests": datasets.Value("string"),
                            "invalids": datasets.Value("string"),
                        }
                    ),
                }
            ),
            homepage="http://module.homepage",
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def _compute(
        self,
        *,
        predictions,
        references,
        cpp_type: Literal["base", "sfinae", "concepts"],
        k=[1, 10, 100],
    ):
        """Returns the pass@k scores and the per-task execution results."""
        # Fall back to 8 workers if os.cpu_count() cannot determine a count.
        num_workers = os.cpu_count() or 8

        if os.getenv("HF_ALLOW_CODE_EVAL", "0") != "1":
            raise ValueError(_WARNING)
if os.getenv("GENERICIFY_CLANG", default=0) == 0: |
|
raise ValueError(_CLANG_WARNING) |
|
|
|
if os.name == "nt": |
|
raise NotImplementedError( |
|
"This metric is currently not supported on Windows." |
|
) |
|
|
|
total_predictions = sum(map(len, predictions)) |
|
|
|
with concurrent.futures.ThreadPoolExecutor( |
|
max_workers=num_workers |
|
) as executor: |
|
futures = [] |
|
completion_id = collections.Counter() |
|
results = collections.defaultdict(list) |
|
|
|
            # Submit one correctness check per candidate; completion_id tracks
            # each candidate's index within its task.
            for task_id, (candidates, reference) in enumerate(
                zip(predictions, references)
            ):
                for candidate in candidates:
                    args = (
                        candidate,
                        reference,
                        cpp_type,
                        task_id,
                        completion_id[task_id],
                    )
                    future = executor.submit(check_correctness, *args)
                    futures.append(future)
                    completion_id[task_id] += 1

            # Collect results as the futures finish, keyed by task.
            for future in tqdm(
                concurrent.futures.as_completed(futures),
                desc="Evaluating",
                total=total_predictions,
            ):
                result = future.result()
                results[result["task_id"]].append(
                    (result["completion_id"], result)
                )

        totals = collections.defaultdict(list)
        corrects = collections.defaultdict(list)

        # The set of reported checks depends on the requested C++ variant.
        keys = {
            "base": [
                "base_run_passed",
                "base_run_compiled",
            ],
            "sfinae": [
                "sfinae_run_passed",
                "sfinae_run_compiled",
                "sfinae_constrain_passed",
            ],
            "concepts": [
                "concepts_run_passed",
                "concepts_run_compiled",
                "concepts_constrain_passed",
            ],
        }[cpp_type]
        for result in results.values():
            result.sort()
            for pt in keys:
                passed = [r[1][pt] for r in result]
                totals[pt].append(len(passed))
                corrects[pt].append(sum(passed))

        totals = {k: np.array(v) for k, v in totals.items()}
        corrects = {k: np.array(v) for k, v in corrects.items()}
        # pass@k is only reported for k values that do not exceed the number of
        # candidates available for every task.
        ks = k
        pass_at_k = {
            f"{key}@{k}": estimate_pass_at_k(
                totals[key],
                corrects[key],
                k,
            ).mean()
            for key in totals.keys()
            for k in ks
            if (totals[key] >= k).all()
        }

        return pass_at_k, results
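
# Illustrative shape of what _compute returns (values are made up): the first
# element maps "<check>@<k>" to the mean pass@k estimate, the second maps each
# task_id to its (completion_id, check_correctness result) pairs.
#
#     (
#         {"base_run_passed@1": 0.25, "base_run_compiled@1": 0.75},
#         {0: [(0, {"task_id": 0, "completion_id": 0, ...})]},
#     )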


def estimate_pass_at_k(num_samples, num_correct, k) -> np.ndarray:
    """Estimates pass@k of each problem and returns them in an array."""

    def estimator(n: int, c: int) -> float:
        """Calculates 1 - comb(n - c, k) / comb(n, k)."""
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array(
        [estimator(int(n), int(c)) for n, c in zip(num_samples_it, num_correct)]
    )
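

# A minimal sanity check for the estimator above (not part of the metric and
# guarded so nothing runs when `evaluate` imports this module): the closed form
# 1 - C(n - c, k) / C(n, k) computed directly should match the numerically
# stable product form used in estimate_pass_at_k.
if __name__ == "__main__":
    from math import comb

    n, c = 10, 3  # 10 candidates for one task, 3 of which passed
    for k_ in (1, 5, 10):
        closed_form = 1.0 - comb(n - c, k_) / comb(n, k_)
        estimated = float(estimate_pass_at_k([n], [c], k_)[0])
        assert abs(closed_form - estimated) < 1e-9
        print(f"pass@{k_} for n={n}, c={c}: {estimated:.4f}")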