"""Generate MBPP completions with a Hugging Face causal LM and score them
with the human_eval functional-correctness evaluator."""

import argparse
import json
import os
import re
from pathlib import Path

import torch
from tqdm import tqdm
from human_eval.evaluation import evaluate_functional_correctness
from transformers import AutoModelForCausalLM, AutoTokenizer

# Directory holding the MBPP problem files, relative to this script.
data_abs_dir = Path(__file__).parent / "data"


def read_test_examples(data_path: str):
    """Yield MBPP prompts, each prefixed with three few-shot worked examples."""

    def format_test_example(q, tests, code: str = None):
        prompt = ">>> Problem:\n{}\n>>> Test Cases:\n{}\n".format(
            q.strip(), "\n".join(tests)
        )
        if code:
            code = code.replace("\r", "").replace("\t", "    ")
            prompt += "\n>>> Code:\n```python\n{}\n```".format(code)
        return prompt

    with open(data_path) as f:
        examples = [json.loads(x) for x in f]
    print("Read {} examples from {}.".format(len(examples), data_path))

    # Examples 1-3 are used as few-shot demonstrations and include reference code.
    examples_str = []
    for i in range(1, 4):
        ex = examples[i]
        q, test, code = ex["text"], ex["test_list"], ex["code"]
        ex_prompt = format_test_example(q, test, code)
        example_prompt = "- Example {}:\n{}".format(i, ex_prompt)
        examples_str += [example_prompt]

    # Examples 10-509 form the evaluation set; their prompts omit the reference code.
    for i in range(10, 510):
        ex = examples[i]
        q, test, code = ex["text"], ex["test_list"], ex["code"]

        prompt = format_test_example(q, test, code=None)

        prompt_with_shots = """
Please refer the given examples and generate a python function for my problem.
Examples are listed as follows:
{}

Here is my problem:
{}
""".strip().format(
            "\n\n".join(examples_str), prompt
        )
        yield {"task_id": ex["task_id"], "prompt": prompt_with_shots}


def convert_for_evaluation(example):
    """Extract the generated Python code block from the raw model completion."""
    gpt_completion = example["gpt_completion"]
    generation = gpt_completion
    try:
        code_block: str = re.findall(
            r"```python\n(.*?)```", gpt_completion, re.DOTALL | re.IGNORECASE
        )[0]
        generation = code_block
    except Exception:
        print("Failed to extract code block:\n{}".format(gpt_completion))

    example["generation"] = generation
    return example


def generate_one(example, tokenizer, model):
    """Greedily generate a completion for a single prompt and post-process it."""
    prompt = example["prompt"]
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        return_tensors="pt",
        add_generation_prompt=True,
    ).to(model.device)

    stop_id = tokenizer.eos_token_id
    outputs = model.generate(
        inputs,
        max_new_tokens=512,
        do_sample=False,
        pad_token_id=stop_id,
        eos_token_id=stop_id,
    )

    # Decode only the newly generated tokens (everything after the prompt).
    output = tokenizer.decode(outputs[0][len(inputs[0]) :], skip_special_tokens=True)

    example["gpt_completion"] = output
    return convert_for_evaluation(example)


def generate_main(args):
    model_name_or_path = args.model
    saved_path = args.output_path
    temp_dir = args.temp_dir
    os.makedirs(temp_dir, exist_ok=True)
    problem_file = os.path.join(data_abs_dir, "mbpp.jsonl")

    print("model", model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    print(
        "Loaded tokenizer {} from {}.".format(
            tokenizer.__class__, model_name_or_path
        )
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    model.eval()

    examples = list(read_test_examples(problem_file))
    print("Read {} examples for evaluation.".format(len(examples)))

    generated_examples = []
    for ex in tqdm(examples, desc="Generating"):
        gen_example = generate_one(ex, tokenizer, model)
        generated_examples.append(gen_example)
        print("Generated {}/{}...".format(len(generated_examples), len(examples)))

    print("Generation finished.")
    with open(saved_path, "w", encoding="utf-8") as fw:
        for ex in generated_examples:
            fw.write(json.dumps(ex) + "\n")
        print(
            "Saved {} generated examples to {}.".format(
                len(generated_examples), saved_path
            )
        )

    # Score the generations for functional correctness against the MBPP test cases.
    result = evaluate_functional_correctness(
        input_file=saved_path,
        tmp_dir=temp_dir,
        problem_file=os.path.join(data_abs_dir, "mbpp_test.jsonl"),
        language="python",
        is_mbpp=True,
    )
    print(result, model_name_or_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        help="model name or path",
        default="/data0/pretrained-models/Qwen2-7B-Instruct",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        help="output path of your generation",
        default="/home/qyhuang/DeepSeek-Coder/outputs/qwen2-mbpp.json",
    )
    parser.add_argument(
        "--temp_dir", type=str, help="temp dir for evaluation", default="tmp"
    )
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    generate_main(args)
|
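
# Example invocation (the script name and paths below are illustrative, not taken
# from this repository; the flags match the argparse definitions above):
#   python mbpp_eval.py \
#       --model Qwen/Qwen2-7B-Instruct \
#       --output_path outputs/qwen2-mbpp.jsonl \
#       --temp_dir tmp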