bmah-dmx committed on
Commit 944c19e · verified · 1 Parent(s): c7bd6ea

Added code_eval.py for convenient evaluation with bigcode-evaluation-harness

Files changed (1)
  1. code_eval.py +149 -0
code_eval.py ADDED
@@ -0,0 +1,149 @@
+ import fnmatch
+ import torch
+ from dataclasses import dataclass, replace
+ from bigcode_eval.tasks import ALL_TASKS
+ from bigcode_eval.evaluator import Evaluator
+ from dmx.compressor import config_rules
+ from dmx.compressor.modeling import DmxModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import traceback
+
+ @dataclass
+ class BigcodeEvalArguments:
+     prefix: str = ""
+     do_sample: bool = True
+     temperature: float = 0.8
+     top_k: int = 0
+     top_p: float = 0.95
+     n_samples: int = 10
+     eos: str = "<|endoftext|>"
+     seed: int = 0
+     modeltype: str = "causal"
+     instruction_tokens: str = None
+     batch_size: int = 2
+     max_length_generation: int = 1024
+     limit: int = None
+     limit_start: int = 0
+     metric_output_path: str = "evaluation_results.json"
+     save_every_k_tasks: int = -1
+     postprocess: bool = True
+     allow_code_execution: bool = True
+     generation_only: bool = False
+     load_generations_path: str = None
+     load_data_path: str = None
+     save_generations: bool = False
+     load_generations_intermediate_paths: str = None
+     save_generations_path: str = "generations.json"
+     save_references: bool = False
+     save_references_path: str = "references.json"
+     prompt: str = "prompt"
+     max_memory_per_gpu: str = None
+     check_references: bool = False
+
+ def code_eval(model, tokenizer, task, dmx_config, args=None, accelerator=None):
+     """
+     Run code evaluation on the provided task using the specified model and tokenizer.
+
+     Args:
+         model: The model to use for evaluation.
+         tokenizer: The tokenizer to use for evaluation.
+         task: The task to evaluate.
+         dmx_config: Name of a config_rules entry to apply via DmxModel, or None to evaluate the model unchanged.
+         args: Optional dictionary of arguments to override defaults in BigcodeEvalArguments.
+         accelerator: Optional Accelerator instance; one is created if not provided.
+
+     Returns:
+         result: A dictionary mapping the task name to its metric results (e.g. pass@1 or pass@10).
+     """
+
+     if accelerator is None:
+         from accelerate import Accelerator
+         accelerator = Accelerator()
+
+     # Initialize evaluation arguments
+     eval_args = BigcodeEvalArguments()
+     if args is not None:
+         eval_args = replace(eval_args, **args)
+
+     # Validate task
+     if not fnmatch.filter(ALL_TASKS, task):
+         raise ValueError(f"Invalid task: {task}")
+
+     # Set up model
+     if dmx_config is not None:
+         model = DmxModel.from_torch(model).to("cuda")
+         tensor = torch.randint(1, 100, (1, eval_args.max_length_generation)).to("cuda")
+         model.transform(model.dmx_config, *eval(f"config_rules.{dmx_config}"))
+         setup = model(tensor)  # dummy forward pass at the target length to finalize the transformed model
+     else:
+         model = model.to("cuda")
+         tensor = torch.randint(1, 100, (1, eval_args.max_length_generation)).to("cuda")
+         setup = model(tensor)  # dummy forward pass at the target length
+
+     # Set up tokenizer
+     if not tokenizer.eos_token:
+         if tokenizer.bos_token:
+             tokenizer.eos_token = tokenizer.bos_token
+             print("bos_token used as eos_token")
+         else:
+             raise ValueError("No eos_token or bos_token found")
+     try:
+         tokenizer.pad_token = tokenizer.eos_token
+     except AttributeError:
+         print("Not setting pad_token to eos_token")
+
+     evaluator = Evaluator(accelerator, model, tokenizer, eval_args)
+
+     try:
+         unparsed_result = evaluator.evaluate(task)
+     except Exception as e:
+         print(f"Error evaluating task {task}: {e}")
+         raise  # re-raise so the caller sees the real failure instead of a NameError below
+
+     if eval_args.n_samples == 1:
+         result = {task: {"pass@1": unparsed_result["pass@1"]}}
+     elif eval_args.n_samples == 10:
+         result = {task: {"pass@10": unparsed_result["pass@10"]}}
+     else:
+         result = {task: unparsed_result}
+
+     return result
+
+ def evaluate_model(model_repo_name, revision_name="main", dmx_config="BASELINE", task_name="humaneval", pass_k=1):
+     model_kwargs = {
+         "revision": revision_name,
+         "trust_remote_code": True,
+     }
+
+     if pass_k == 10:  # sampled generation settings for pass@10
+         eval_args = {
+             "max_length_generation": 1024,
+             "batch_size": 2,
+             "n_samples": 10,
+             "temperature": 0.8,
+             "top_p": 0.95,
+         }
+     else:  # greedy decoding settings for pass@1
+         eval_args = {
+             "max_length_generation": 1024,
+             "batch_size": 1,
+             "n_samples": 1,
+             "do_sample": False,
+             "temperature": None,
+             "top_p": None,
+             "top_k": None,
+         }
+
+     model = AutoModelForCausalLM.from_pretrained(model_repo_name, **model_kwargs)
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_repo_name,
+         **model_kwargs,
+         padding_side="right",
+     )
+
+     try:
+         result = code_eval(model, tokenizer, task_name, dmx_config, args=eval_args)
+         return result, None
+     except Exception as e:
+         error_message = f"Error during evaluation: {str(e)}\n\n{traceback.format_exc()}"
+         print(error_message)
+         return None, error_message
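
For reference, a minimal usage sketch of the helper this commit adds, assuming a CUDA-capable machine with bigcode-evaluation-harness, dmx.compressor, transformers, and accelerate installed; the model repo id below is a placeholder, not one specified by this commit:

from code_eval import evaluate_model

# Greedy pass@1 on HumanEval with the BASELINE dmx config (the function's defaults);
# the repo id is a placeholder and should point at a real causal LM checkpoint.
result, error = evaluate_model(
    "org/causal-lm-checkpoint",
    revision_name="main",
    dmx_config="BASELINE",
    task_name="humaneval",
    pass_k=1,
)
if error is None:
    print(result)  # e.g. {"humaneval": {"pass@1": ...}}
else:
    print(error)  # traceback string returned by evaluate_model

Note that BigcodeEvalArguments sets allow_code_execution=True, so the harness will execute generated code locally during evaluation.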