import fnmatch
import traceback
from dataclasses import dataclass, replace
from typing import Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from bigcode_eval.tasks import ALL_TASKS
from bigcode_eval.evaluator import Evaluator
from dmx.compressor import config_rules
from dmx.compressor.modeling import DmxModel


@dataclass
class BigcodeEvalArguments:
    prefix: str = ""
    do_sample: bool = True
    temperature: float = 0.8
    top_k: int = 0
    top_p: float = 0.95
    n_samples: int = 10
    eos: str = "<|endoftext|>"
    seed: int = 0
    modeltype: str = "causal"
    instruction_tokens: Optional[str] = None
    batch_size: int = 2
    max_length_generation: int = 1024
    limit: Optional[int] = None
    limit_start: int = 0
    metric_output_path: str = "evaluation_results.json"
    save_every_k_tasks: int = -1
    postprocess: bool = True
    allow_code_execution: bool = True
    generation_only: bool = False
    load_generations_path: Optional[str] = None
    load_data_path: Optional[str] = None
    save_generations: bool = False
    load_generations_intermediate_paths: Optional[str] = None
    save_generations_path: str = "generations.json"
    save_references: bool = False
    save_references_path: str = "references.json"
    prompt: str = "prompt"
    max_memory_per_gpu: Optional[str] = None
    check_references: bool = False


def code_eval(model, tokenizer, task, dmx_config, args=None, accelerator=None):
    """
    Run code evaluation on the provided task using the specified model and tokenizer.

    Args:
        model: The model to use for evaluation.
        tokenizer: The tokenizer to use for evaluation.
        task: The task to evaluate.
        dmx_config: Name of a rule set in dmx.compressor.config_rules to apply via
            DmxModel, or None to evaluate the unmodified model.
        args: Optional dictionary of arguments to override defaults in BigcodeEvalArguments.
        accelerator: Optional Accelerator instance.

    Returns:
        result: A dictionary containing metric and result.
    """
    if accelerator is None:
        from accelerate import Accelerator

        accelerator = Accelerator()

    # Initialize evaluation arguments
    eval_args = BigcodeEvalArguments()
    if args is not None:
        eval_args = replace(eval_args, **args)

    # Validate task
    if not fnmatch.filter(ALL_TASKS, task):
        raise ValueError(f"Invalid task: {task}")

    # Set up model; the dummy forward pass warms up / traces the model once
    if dmx_config is not None:
        model = DmxModel.from_torch(model).to("cuda")
        tensor = torch.randint(1, 100, (1, eval_args.max_length_generation)).to("cuda")
        # Look up the named rule set in dmx.compressor.config_rules and apply it
        model.transform(model.dmx_config, *eval(f"config_rules.{dmx_config}"))
        setup = model(tensor)
    else:
        model = model.to("cuda")
        tensor = torch.randint(1, 100, (1, eval_args.max_length_generation)).to("cuda")
        setup = model(tensor)

    # Set up tokenizer: fall back to bos_token if no eos_token is defined
    if not tokenizer.eos_token:
        if tokenizer.bos_token:
            tokenizer.eos_token = tokenizer.bos_token
            print("bos_token used as eos_token")
        else:
            raise ValueError("No eos_token or bos_token found")
    try:
        tokenizer.pad_token = tokenizer.eos_token
    except AttributeError:
        print("Not setting pad_token to eos_token")

    evaluator = Evaluator(accelerator, model, tokenizer, eval_args)
    try:
        unparsed_result = evaluator.evaluate(task)
    except Exception as e:
        print(f"Error evaluating task {task}: {e}")
        raise  # re-raise so the caller can handle/report the failure

    if eval_args.n_samples == 1:
        result = {task: {"pass@1": unparsed_result["pass@1"]}}
    elif eval_args.n_samples == 10:
        result = {task: {"pass@10": unparsed_result["pass@10"]}}
    else:
        result = {task: unparsed_result}
    return result


def evaluate_model(
    model_repo_name,
    revision_name="main",
    dmx_config="BASELINE",
    task_name="humaneval",
    pass_k=1,
):
    model_kwargs = {
        "revision": revision_name,
        "trust_remote_code": True,
    }
    if pass_k == 10:
        # Sampling configuration for pass@10
        eval_args = {
            "max_length_generation": 1024,
            "batch_size": 2,
            "n_samples": 10,
            "temperature": 0.8,
            "top_p": 0.95,
        }
    else:
        # Greedy decoding for pass@1
        eval_args = {
            "max_length_generation": 1024,
            "batch_size": 1,
            "n_samples": 1,
            "do_sample": False,
            "temperature": None,
            "top_p": None,
            "top_k": None,
        }

    model = AutoModelForCausalLM.from_pretrained(model_repo_name, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(
        model_repo_name,
        **model_kwargs,
        padding_side="right",
    )
    try:
        result = code_eval(model, tokenizer, task_name, dmx_config, args=eval_args)
        return result, None
    except Exception as e:
        error_message = f"Error during evaluation: {str(e)}\n\n{traceback.format_exc()}"
        print(error_message)
        return None, error_message