import fnmatch
import traceback
from dataclasses import dataclass, replace
from typing import Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from bigcode_eval.evaluator import Evaluator
from bigcode_eval.tasks import ALL_TASKS
from dmx.compressor import config_rules
from dmx.compressor.modeling import DmxModel


@dataclass
class BigcodeEvalArguments:
    prefix: str = ""
    do_sample: bool = True
    temperature: float = 0.8
    top_k: int = 0
    top_p: float = 0.95
    n_samples: int = 10
    eos: str = "<|endoftext|>"
    seed: int = 0
    modeltype: str = "causal"
    instruction_tokens: Optional[str] = None
    batch_size: int = 2
    max_length_generation: int = 1024
    limit: Optional[int] = None
    limit_start: int = 0
    metric_output_path: str = "evaluation_results.json"
    save_every_k_tasks: int = -1
    postprocess: bool = True
    allow_code_execution: bool = True
    generation_only: bool = False
    load_generations_path: Optional[str] = None
    load_data_path: Optional[str] = None
    save_generations: bool = False
    load_generations_intermediate_paths: Optional[str] = None
    save_generations_path: str = "generations.json"
    save_references: bool = False
    save_references_path: str = "references.json"
    prompt: str = "prompt"
    max_memory_per_gpu: Optional[str] = None
    check_references: bool = False
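
# Note: code_eval applies the optional `args` dict to these defaults via
# dataclasses.replace, so callers can override individual fields without
# restating the rest, e.g. (illustrative only):
#   greedy_args = replace(BigcodeEvalArguments(), do_sample=False, n_samples=1)
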
def code_eval(model, tokenizer, task, dmx_config, args=None, accelerator=None):
    """
    Run code evaluation on the provided task using the specified model and tokenizer.

    Args:
        model: The model to use for evaluation.
        tokenizer: The tokenizer to use for evaluation.
        task: The task to evaluate.
        dmx_config: Name of a rule set in dmx.compressor.config_rules (e.g. "BASELINE"),
            or None to evaluate the unmodified model.
        args: Optional dictionary of arguments to override defaults in BigcodeEvalArguments.
        accelerator: Optional Accelerator instance; one is created if not provided.

    Returns:
        result: A dictionary containing the metric and result for the task.
    """
    if accelerator is None:
        from accelerate import Accelerator

        accelerator = Accelerator()

    # Initialize evaluation arguments, overriding defaults with any user-supplied values
    eval_args = BigcodeEvalArguments()
    if args is not None:
        eval_args = replace(eval_args, **args)

    # Validate task
    if not fnmatch.filter(ALL_TASKS, task):
        raise ValueError(f"Invalid task: {task}")

    # Set up model; a dummy forward pass applies the DmxModel transformation
    # (or simply warms up the unmodified model) before generation
    tensor = torch.randint(1, 100, (1, eval_args.max_length_generation)).to("cuda")
    if dmx_config is not None:
        model = DmxModel.from_torch(model).to("cuda")
        model.transform(model.dmx_config, *getattr(config_rules, dmx_config))
    else:
        model = model.to("cuda")
    setup = model(tensor)

    # Set up tokenizer
    if not tokenizer.eos_token:
        if tokenizer.bos_token:
            tokenizer.eos_token = tokenizer.bos_token
            print("bos_token used as eos_token")
        else:
            raise ValueError("No eos_token or bos_token found")
    try:
        tokenizer.pad_token = tokenizer.eos_token
    except AttributeError:
        print("Not setting pad_token to eos_token")

    evaluator = Evaluator(accelerator, model, tokenizer, eval_args)
    try:
        unparsed_result = evaluator.evaluate(task)
    except Exception as e:
        print(f"Error evaluating task {task}: {e}")
        raise

    # Keep only the metric matching the sampling configuration
    if eval_args.n_samples == 1:
        result = {task: {"pass@1": unparsed_result["pass@1"]}}
    elif eval_args.n_samples == 10:
        result = {task: {"pass@10": unparsed_result["pass@10"]}}
    else:
        result = {task: unparsed_result}
    return result

def evaluate_model(model_repo_name, revision_name="main", dmx_config="BASELINE", task_name="humaneval", pass_k=1):
    model_kwargs = {
        "revision": revision_name,
        "trust_remote_code": True,
    }
    if pass_k == 10:
        # pass@10: sample 10 completions per problem with temperature/top-p sampling
        eval_args = {
            "max_length_generation": 1024,
            "batch_size": 2,
            "n_samples": 10,
            "temperature": 0.8,
            "top_p": 0.95,
        }
    else:
        # pass@1: a single greedy completion per problem
        eval_args = {
            "max_length_generation": 1024,
            "batch_size": 1,
            "n_samples": 1,
            "do_sample": False,
            "temperature": None,
            "top_p": None,
            "top_k": None,
        }
    model = AutoModelForCausalLM.from_pretrained(model_repo_name, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(
        model_repo_name,
        **model_kwargs,
        padding_side="right",
    )
    try:
        result = code_eval(model, tokenizer, task_name, dmx_config, args=eval_args)
        return result, None
    except Exception as e:
        error_message = f"Error during evaluation: {str(e)}\n\n{traceback.format_exc()}"
        print(error_message)
        return None, error_message
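
# Minimal usage sketch (not prescribed by this module): the checkpoint name below is
# only an example, and a CUDA device plus the bigcode_eval task data are assumed.
if __name__ == "__main__":
    result, error = evaluate_model(
        "bigcode/santacoder",  # example/placeholder checkpoint; substitute your own repo
        revision_name="main",
        dmx_config="BASELINE",
        task_name="humaneval",
        pass_k=1,
    )
    if error is None:
        print(result)
    # on failure, `error` already holds the formatted traceback printed by evaluate_model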