import fnmatch
import traceback
from dataclasses import dataclass, replace
from typing import Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from bigcode_eval.evaluator import Evaluator
from bigcode_eval.tasks import ALL_TASKS
from dmx.compressor import config_rules
from dmx.compressor.modeling import DmxModel


@dataclass
class BigcodeEvalArguments:
    prefix: str = ""
    do_sample: bool = True
    temperature: float = 0.8
    top_k: int = 0
    top_p: float = 0.95
    n_samples: int = 10
    eos: str = "<|endoftext|>"
    seed: int = 0
    modeltype: str = "causal"
    instruction_tokens: Optional[str] = None
    batch_size: int = 2
    max_length_generation: int = 1024
    limit: Optional[int] = None
    limit_start: int = 0
    metric_output_path: str = "evaluation_results.json"
    save_every_k_tasks: int = -1
    postprocess: bool = True
    allow_code_execution: bool = True
    generation_only: bool = False
    load_generations_path: Optional[str] = None
    load_data_path: Optional[str] = None
    save_generations: bool = False
    load_generations_intermediate_paths: Optional[str] = None
    save_generations_path: str = "generations.json"
    save_references: bool = False
    save_references_path: str = "references.json"
    prompt: str = "prompt"
    max_memory_per_gpu: Optional[str] = None
    check_references: bool = False


def code_eval(model, tokenizer, task, dmx_config, args=None, accelerator=None):
    """
    Run code evaluation on the provided task using the specified model and tokenizer.

    Args:
        model: The model to evaluate.
        tokenizer: The tokenizer paired with the model.
        task: Name of the task to evaluate; must match an entry in ALL_TASKS.
        dmx_config: Name of a rule set in dmx.compressor.config_rules to apply,
            or None to evaluate the model unmodified.
        args: Optional dictionary of arguments overriding defaults in BigcodeEvalArguments.
        accelerator: Optional Accelerator instance; one is created if omitted.

    Returns:
        result: A dictionary mapping the task name to its metric results.
    """
    if accelerator is None:
        from accelerate import Accelerator

        accelerator = Accelerator()
    eval_args = BigcodeEvalArguments()
    if args is not None:
        eval_args = replace(eval_args, **args)

    if not fnmatch.filter(ALL_TASKS, task):
        raise ValueError(f"Invalid task: {task}")
    # Warm-up forward pass on a dummy batch; for a DmxModel this also triggers
    # the graph transformation specified by dmx_config.
    dummy_input = torch.randint(1, 100, (1, eval_args.max_length_generation)).to("cuda")
    if dmx_config is not None:
        model = DmxModel.from_torch(model).to("cuda")
        model.transform(model.dmx_config, *getattr(config_rules, dmx_config))
    else:
        model = model.to("cuda")
    model(dummy_input)
    if not tokenizer.eos_token:
        if tokenizer.bos_token:
            tokenizer.eos_token = tokenizer.bos_token
            print("bos_token used as eos_token")
        else:
            raise ValueError("No eos_token or bos_token found")
    try:
        tokenizer.pad_token = tokenizer.eos_token
    except AttributeError:
        print("Not setting pad_token to eos_token")
    evaluator = Evaluator(accelerator, model, tokenizer, eval_args)

    try:
        unparsed_result = evaluator.evaluate(task)
    except Exception as e:
        print(f"Error evaluating task {task}: {e}")
        raise  # re-raise: unparsed_result is undefined past this point
    if eval_args.n_samples == 1:
        result = {task: {"pass@1": unparsed_result["pass@1"]}}
    elif eval_args.n_samples == 10:
        result = {task: {"pass@10": unparsed_result["pass@10"]}}
    else:
        result = {task: unparsed_result}

    return result


def evaluate_model(
    model_repo_name,
    revision_name="main",
    dmx_config="BASELINE",
    task_name="humaneval",
    pass_k=1,
):
    model_kwargs = {
        "revision": revision_name,
        "trust_remote_code": True,
    }
    if pass_k == 10:
        # pass@10: draw 10 samples per problem with nucleus sampling.
        eval_args = {
            "max_length_generation": 1024,
            "batch_size": 2,
            "n_samples": 10,
            "temperature": 0.8,
            "top_p": 0.95,
        }
    else:
        # pass@1: a single greedy completion per problem.
        eval_args = {
            "max_length_generation": 1024,
            "batch_size": 1,
            "n_samples": 1,
            "do_sample": False,
            "temperature": None,
            "top_p": None,
            "top_k": None,
        }
    model = AutoModelForCausalLM.from_pretrained(model_repo_name, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(
        model_repo_name,
        **model_kwargs,
        padding_side="right",
    )
    try:
        result = code_eval(model, tokenizer, task_name, dmx_config, args=eval_args)
        return result, None
    except Exception as e:
        error_message = f"Error during evaluation: {str(e)}\n\n{traceback.format_exc()}"
        print(error_message)
        return None, error_message
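

# Minimal usage sketch, not part of the module's API: it assumes a CUDA GPU,
# the bigcode-evaluation-harness task data, and network access to fetch the
# checkpoint. The model name below is an illustrative placeholder; substitute
# any causal LM repository.
if __name__ == "__main__":
    result, error = evaluate_model(
        "bigcode/santacoder",  # hypothetical example checkpoint
        revision_name="main",
        dmx_config="BASELINE",
        task_name="humaneval",
        pass_k=1,
    )
    if error is None:
        print(result)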