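"""MBPP instruction-following evaluation.

Builds 3-shot prompts from data/mbpp.jsonl, generates completions with a
causal LM (greedy decoding), extracts the fenced ```python``` block from each
completion, and scores functional correctness with the human_eval harness.

Example invocation (script name is illustrative):
    python eval_mbpp.py --model <model_name_or_path> \
        --output_path <generations.jsonl> --temp_dir tmp
"""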
import argparse
import json
import os
import re
from pathlib import Path

import torch
from tqdm import tqdm
from human_eval.evaluation import evaluate_functional_correctness
from transformers import AutoModelForCausalLM, AutoTokenizer

data_abs_dir = Path(__file__).parent / "data"
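# Few-shot prompt construction: three worked examples (dataset indices 1-3)
# are prepended to each of the 500 evaluation problems (indices 10-509).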
def read_test_examples(data_path: str):
    def format_test_example(q, tests, code: str = None):
        prompt = ">>> Problem:\n{}\n>>> Test Cases:\n{}\n".format(
            q.strip(), "\n".join(tests)
        )
        if code:
            code = code.replace("\r", "").replace("\t", " ")
            prompt += "\n>>> Code:\n```python\n{}\n```".format(code)
        return prompt

    examples = [json.loads(x) for x in open(data_path)]
    print("Read all {} examples from {} over!".format(len(examples), data_path))

    # test_cases
    examples_str = []
    for i in range(1, 4):
        ex = examples[i]
        q, test, code = ex["text"], ex["test_list"], ex["code"]
        ex_prompt = format_test_example(q, test, code)
        example_prompt = "- Example {}:\n{}".format(i, ex_prompt)
        examples_str += [example_prompt]

    for i in range(10, 510):
        ex = examples[i]
        q, test, code = ex["text"], ex["test_list"], ex["code"]
        prompt = format_test_example(q, test, code=None)
        prompt_with_shots = """
Please refer the given examples and generate a python function for my problem.
Examples are listed as follows:
{}
Here is my problem:
{}
""".strip().format(
            "\n\n".join(examples_str), prompt
        )
        yield {"task_id": ex["task_id"], "prompt": prompt_with_shots}
def convert_for_evaluation(example):
    gpt_completion = example["gpt_completion"]
    generation = gpt_completion
    try:
        code_block: str = re.findall(
            r"```python\n(.*?)```", gpt_completion, re.DOTALL | re.IGNORECASE
        )[0]
        generation = code_block
    except Exception:
        print("Failed to extract codeblock:\n{}".format(gpt_completion))
    example["generation"] = generation
    return example
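# Greedy, single-sample generation through the tokenizer's chat template;
# the EOS token is reused as the pad token since generation is unbatched.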
def generate_one(example, tokenizer, model):
    prompt = example["prompt"]
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        return_tensors="pt",
        add_generation_prompt=True,
    ).to(model.device)

    # stop_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
    # assert isinstance(stop_id, int), "Invalid tokenizer, EOT id not found"
    stop_id = tokenizer.eos_token_id
    outputs = model.generate(
        inputs,
        max_new_tokens=512,
        do_sample=False,
        # top_p=0.95,
        # temperature=temperature,
        pad_token_id=stop_id,
        eos_token_id=stop_id,
    )
    output = tokenizer.decode(outputs[0][len(inputs[0]) :], skip_special_tokens=True)
    # print(output)
    example["gpt_completion"] = output
    return convert_for_evaluation(example)
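# End-to-end driver: load the model and tokenizer, generate a completion for
# every problem, dump the results as JSONL, then score functional correctness
# with the human_eval harness.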
def generate_main(args):
    model_name_or_path = args.model
    saved_path = args.output_path
    temp_dir = args.temp_dir
    os.makedirs(temp_dir, exist_ok=True)
    problem_file = os.path.join(data_abs_dir, "mbpp.jsonl")

    print("model", model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    print(
        "load tokenizer {} from {} over.".format(
            tokenizer.__class__, model_name_or_path
        )
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    model.eval()

    examples = list(read_test_examples(problem_file))
    print("Read {} examples for evaluation over.".format(len(examples)))

    generated_examples = []
    for ex in tqdm(examples, desc="Generating"):
        gen_example = generate_one(ex, tokenizer, model)
        generated_examples.append(gen_example)
        print("Generate {}/{} over...".format(len(generated_examples), len(examples)))
    print("Generate all over!!!")

    with open(saved_path, "w", encoding="utf-8") as fw:
        for ex in generated_examples:
            fw.write(json.dumps(ex) + "\n")
        print(
            "Save {} processed examples into {} over!".format(
                len(generated_examples), saved_path
            )
        )

    result = evaluate_functional_correctness(
        input_file=saved_path,
        tmp_dir=temp_dir,
        problem_file=os.path.join(data_abs_dir, "mbpp_test.jsonl"),
        language="python",
        is_mbpp=True,
    )
    print(result, model_name_or_path)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        help="model name or path",
        default="/data0/pretrained-models/Qwen2-7B-Instruct",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        help="output path of your generation",
        default="/home/qyhuang/DeepSeek-Coder/outputs/qwen2-mbpp.json",
    )
    parser.add_argument(
        "--temp_dir", type=str, help="temp dir for evaluation", default="tmp"
    )
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    generate_main(args)