Spaces:
Runtime error
Runtime error
import json | |
import multiprocessing | |
import os | |
import re | |
from collections import defaultdict | |
import torch | |
from accelerate import Accelerator | |
from accelerate.utils import set_seed | |
from arguments import HumanEvalArguments | |
from datasets import load_dataset, load_metric | |
from torch.utils.data import IterableDataset | |
from torch.utils.data.dataloader import DataLoader | |
from tqdm import tqdm | |
import transformers | |
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, StoppingCriteria, StoppingCriteriaList | |
# Strings marking the start of a new top-level construct. They are used as stop
# sequences during generation (EndOfFunctionCriteria) and to trim the trailing
# block from a decoded completion (remove_last_block).
EOF_STRINGS = ["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif"]
class TokenizedDataset(IterableDataset):
    """Tokenize the task prompts and yield each one ``n_copies`` times.

    Copies of the same prompt are emitted consecutively, i.e. in the order
    ``[p_0_0, ..., p_0_nc-1, p_1_0, ..., p_nt-1_nc-1]``.
    See ``complete_code`` for how the yielded items are consumed.

    Parameters
    ----------
    tokenizer:
        Callable tokenizer exposing ``eos_token``; it is called with the list of
        prompts, ``padding=True`` and ``return_tensors="pt"``.
    dataset:
        Indexable collection whose items have a ``"prompt"`` entry.
    n_tasks: int, optional
        Number of leading tasks to use. Defaults to ``len(dataset)``.
    n_copies: int
        How many times each tokenized prompt is yielded.
    """

    def __init__(self, tokenizer, dataset, n_tasks=None, n_copies=1):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.n_tasks = len(dataset) if n_tasks is None else n_tasks
        self.n_copies = n_copies

    def __iter__(self):
        """Yield dicts with the padded ``ids``, the ``task_id`` and the unpadded ``input_len``."""
        # Prompts are stripped on purpose: without the strip the model tends to
        # generate commented-out code.
        prompts = [
            self.tokenizer.eos_token + self.dataset[task]["prompt"].strip()
            for task in range(self.n_tasks)
        ]
        # All prompts are tokenized together so they share one padded width.
        outputs = self.tokenizer(prompts, padding=True, return_tensors="pt")
        for task in range(self.n_tasks):
            for _ in range(self.n_copies):
                yield {
                    "ids": outputs.input_ids[task],
                    "task_id": task,
                    # Number of non-padding tokens in this prompt.
                    "input_len": outputs.attention_mask[task].sum(),
                }
class EndOfFunctionCriteria(StoppingCriteria):
    """Custom `StoppingCriteria` which checks if all generated functions in the batch are completed.

    Parameters
    ----------
    start_length: int
        Index of the first token to inspect; tokens before it are ignored.
    eof_strings: list of str
        Strings whose presence in the decoded text marks a sequence as finished.
    tokenizer:
        Tokenizer used to decode the token ids back to text.
    """

    def __init__(self, start_length, eof_strings, tokenizer):
        self.start_length = start_length
        self.eof_strings = eof_strings
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs):
        """Return True only if every sequence contains at least one end-of-function string."""
        # Only the part after `start_length` is decoded and searched.
        decoded_generations = self.tokenizer.batch_decode(input_ids[:, self.start_length :])
        return all(
            any(stop_string in decoded_generation for stop_string in self.eof_strings)
            for decoded_generation in decoded_generations
        )
def remove_last_block(string, eof_strings=None):
    """Remove the last block of code, i.e. everything from the last stop string onwards.

    Parameters
    ----------
    string: str
        Decoded generation to trim.
    eof_strings: list of str, optional
        Stop strings that start a new block. Defaults to the module-level ``EOF_STRINGS``.

    Returns
    -------
    str
        ``string`` without its last stop string and the text following it. If no
        stop string occurs, ``string`` is returned unchanged rather than being
        discarded entirely.
    """
    if eof_strings is None:
        eof_strings = EOF_STRINGS
    if not eof_strings:
        # An empty alternation would match the empty string everywhere.
        return string
    # Escape the stop strings so they are matched literally; the capturing group
    # makes re.split keep the separators in the result.
    pattern = "(%s)" % "|".join(re.escape(stop) for stop in eof_strings)
    string_list = re.split(pattern, string)
    if len(string_list) < 3:
        # No stop string found: there is no trailing block to remove.
        return string
    # Drop the last separator and the (usually empty) text after it.
    return "".join(string_list[:-2])
def complete_code(accelerator, model, tokenizer, dataloader, n_tasks, batch_size=20, **gen_kwargs):
    """Generate multiple code completions for each task in the dataset.

    This function leverages ``accelerator`` to distribute the processing to multiple GPUs.
    ``dataloader``, a wrapper around a ``TokenizedDataset`` object, is supposed to send all
    the prompts from the evaluation dataset to the model in the following order:
        [p_0_0, p_0_1, ..., p_0_nc-1, p_1_0, ..., p_nt-1_nc-1]
    where nc is the number of copies of the prompt, and nt is the number of tasks.
    nc is such that num_samples = nc * batch_size.

    Each batch is expected to hold a single prompt (the caller in this file builds the
    dataloader with ``batch_size=1``): the prompt is cut to its unpadded length before
    generation, and the stop check of the first stopping criterion starts at that same
    length, i.e. exactly where the generated tokens begin.

    Parameters
    ----------
    accelerator: Accelerator
    model: transformers.PreTrainedModel
        Code generation model, e.g. AutoModelForCausalLM.from_pretrained(model_ckpt)
        with model_ckpt = "lvwerra/codeparrot"
    tokenizer: transformers.AutoTokenizer
        The tokenizer used to train model
    dataloader: DataLoader
        The dataloader is a wrapper around a TokenizedDataset object. It is designed to be used with multiple GPUs.
    n_tasks: int
        The number of tasks in the dataset. It is used to determine the length of the output.
        Should be aligned with the number of tasks in the TokenizedDataset.
    batch_size: int
        num_return_sequences per copy of the prompt such that num_samples = batch_size * n_copies
    gen_kwargs: dict
        Keyword arguments for the generation function of the model. If it contains a non-empty
        ``stopping_criteria`` list, the ``start_length`` of its first element is updated per batch.

    Returns
    -------
    code_gens: list of list of str, of length n_tasks
        List of generated codes for each task.
        Each element is a list of generated codes for each task, with length num_samples
    """
    # Optional: generation also works when the caller passes no stopping criteria.
    stopping_criteria = gen_kwargs.get("stopping_criteria")

    gen_token_dict = defaultdict(list)  # task id -> list of generated token sequences
    for _, batch in tqdm(enumerate(dataloader)):
        with torch.no_grad():
            # Unpadded prompt length; also the index of the first generated token.
            prompt_len = int(batch["input_len"])
            if stopping_criteria:
                stopping_criteria[0].start_length = prompt_len
            generated_tokens = accelerator.unwrap_model(model).generate(
                input_ids=batch["ids"][:, :prompt_len], num_return_sequences=batch_size, **gen_kwargs
            )
            # each task is generated batch_size times
            generated_tasks = batch["task_id"].repeat(batch_size)
            # Sequences may differ in length across processes; pad before gathering.
            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            generated_tokens, generated_tasks = accelerator.gather((generated_tokens, generated_tasks))
            generated_tokens = generated_tokens.cpu().numpy()
            generated_tasks = generated_tasks.cpu().numpy()

            for task, task_tokens in zip(generated_tasks, generated_tokens):
                gen_token_dict[task].append(task_tokens)

    code_gens = [[] for _ in range(n_tasks)]
    for task, token_sequences in gen_token_dict.items():
        for sequence in token_sequences:
            gen_code = tokenizer.decode(sequence, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            code_gens[task].append(remove_last_block(gen_code))
    return code_gens
def main():
    """Run the HumanEval evaluation end to end.

    Parses ``HumanEvalArguments``, loads the tokenizer and model from ``args.model_ckpt``,
    generates completions for the ``openai_humaneval`` test split, scores them with the
    ``code_eval`` metric on the main process, prints the result and writes it as JSON to
    ``args.output_file``.

    Raises
    ------
    ValueError
        If ``args.n_samples`` is smaller than ``args.batch_size`` (no completion would be
        generated), or if code evaluation is not enabled for the ``code_eval`` metric.
    """
    # Setup configuration
    parser = HfArgumentParser(HumanEvalArguments)
    args = parser.parse_args()

    # Each copy of a prompt produces args.batch_size samples. Fail before loading the
    # model if that leaves zero copies, since nothing would be generated or scored.
    n_copies = args.n_samples // args.batch_size
    if n_copies < 1:
        raise ValueError(
            f"n_samples ({args.n_samples}) must be at least batch_size ({args.batch_size})"
            " so that every task is generated at least once."
        )

    transformers.logging.set_verbosity_error()
    # enables code execution in code_eval metric
    os.environ["HF_ALLOW_CODE_EVAL"] = args.HF_ALLOW_CODE_EVAL
    # make sure tokenizer plays nice with multiprocessing
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    if args.num_workers is None:
        args.num_workers = multiprocessing.cpu_count()

    # Use dataset load to feed to accelerate
    accelerator = Accelerator()
    set_seed(args.seed, device_specific=True)

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
    # The end-of-sequence token doubles as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(args.model_ckpt)

    # Generation settings
    gen_kwargs = {
        "do_sample": args.do_sample,
        "temperature": args.temperature,
        "max_new_tokens": args.max_new_tokens,
        "top_p": args.top_p,
        "top_k": args.top_k,
        # start_length is a placeholder here; complete_code sets it for every batch.
        "stopping_criteria": StoppingCriteriaList([EndOfFunctionCriteria(0, EOF_STRINGS, tokenizer)]),
    }

    # Load evaluation dataset and metric
    human_eval = load_dataset("openai_humaneval")
    code_eval_metric = load_metric("code_eval")

    n_tasks = args.num_tasks if args.num_tasks is not None else len(human_eval["test"])

    human_eval_tokenized = TokenizedDataset(tokenizer, human_eval["test"], n_copies=n_copies, n_tasks=n_tasks)
    # do not confuse args.batch_size, which is actually the num_return_sequences
    human_eval_loader = DataLoader(human_eval_tokenized, batch_size=1)

    # Run a quick test to see if code evaluation is enabled
    try:
        _ = code_eval_metric.compute(references=[""], predictions=[[""]])
    except ValueError:
        print(
            'Code evaluation not enabled. Read the warning below carefully and then use `--HF_ALLOW_CODE_EVAL="1"`'
            " flag to enable code evaluation."
        )
        raise

    model, human_eval_loader = accelerator.prepare(model, human_eval_loader)

    generations = complete_code(
        accelerator,
        model,
        tokenizer,
        human_eval_loader,
        n_tasks=n_tasks,
        batch_size=args.batch_size,
        **gen_kwargs,
    )

    if accelerator.is_main_process:
        references = []
        test_split = human_eval["test"]

        for task in tqdm(range(n_tasks)):
            sample = test_split[task]
            test_func = sample["test"]
            entry_point = f"check({sample['entry_point']})"
            references.append("\n" + test_func + "\n" + entry_point)

        # Evaluate completions with "code_eval" metric
        pass_at_k, _ = code_eval_metric.compute(
            references=references, predictions=generations, num_workers=args.num_workers
        )
        print(f"Results: {pass_at_k}")

        # Save results to json file
        with open(args.output_file, "w") as fp:
            json.dump(pass_at_k, fp)
# For some reason the following seems to be necessary sometimes for code_eval to work nicely with multiprocessing
# https://stackoverflow.com/questions/60804599/python-multiprocessing-keeps-spawning-the-whole-script
if __name__ == "__main__":
    main()