import fnmatch
import traceback
from dataclasses import dataclass, replace
from typing import Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from bigcode_eval.tasks import ALL_TASKS
from bigcode_eval.evaluator import Evaluator
from dmx.compressor import config_rules
from dmx.compressor.modeling import DmxModel

@dataclass
class BigcodeEvalArguments:
    prefix: str = ""
    do_sample: bool = True
    temperature: float = 0.8
    top_k: int = 0
    top_p: float = 0.95
    n_samples: int = 10
    eos: str = "<|endoftext|>"
    seed: int = 0
    modeltype: str = "causal"
    instruction_tokens: Optional[str] = None
    batch_size: int = 2
    max_length_generation: int = 1024
    limit: Optional[int] = None
    limit_start: int = 0
    metric_output_path: str = "evaluation_results.json"
    save_every_k_tasks: int = -1
    postprocess: bool = True
    allow_code_execution: bool = True
    generation_only: bool = False
    load_generations_path: Optional[str] = None
    load_data_path: Optional[str] = None
    save_generations: bool = False
    load_generations_intermediate_paths: Optional[str] = None
    save_generations_path: str = "generations.json"
    save_references: bool = False
    save_references_path: str = "references.json"
    prompt: str = "prompt"
    max_memory_per_gpu: Optional[str] = None
    check_references: bool = False
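
# The fields above correspond to the arguments consumed by
# bigcode_eval.evaluator.Evaluator (they mirror the bigcode-evaluation-harness
# CLI flags). `code_eval` below applies user overrides from a plain dict via
# dataclasses.replace, e.g. (illustrative):
#   replace(BigcodeEvalArguments(), **{"n_samples": 1, "do_sample": False})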

def code_eval(model, tokenizer, task, dmx_config, args=None, accelerator=None):
    """
    Run code evaluation on the provided task using the specified model and tokenizer.

    Args:
        model: The model to use for evaluation.
        tokenizer: The tokenizer to use for evaluation.
        task: The task to evaluate.
        accelerator: Optional Accelerator instance.
        args: Optional dictionary of arguments to override defaults in BigcodeEvalArguments.

    Returns:
        result: A dictionary containing metric and result.
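    Example (illustrative sketch; assumes `model` and `tokenizer` are already
    loaded and that a CUDA device is available):

        result = code_eval(
            model,
            tokenizer,
            task="humaneval",
            dmx_config="BASELINE",
            args={"n_samples": 1, "do_sample": False, "batch_size": 1},
        )
        # -> {"humaneval": {"pass@1": ...}}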
    """
    
    if accelerator is None:
        from accelerate import Accelerator
        accelerator = Accelerator()

    # Initialize evaluation arguments
    eval_args = BigcodeEvalArguments()
    if args is not None:
        eval_args = replace(eval_args, **args)

    # Validate task
    if not fnmatch.filter(ALL_TASKS, task):
        raise ValueError(f"Invalid task: {task}")

    # Set up model: optionally apply the requested DMX transformation, then run a
    # single dummy forward pass so the (transformed) model is fully instantiated.
    dummy_input = torch.randint(1, 100, (1, eval_args.max_length_generation)).to("cuda")
    if dmx_config is not None:
        model = DmxModel.from_torch(model).to("cuda")
        model.transform(model.dmx_config, *getattr(config_rules, dmx_config))
        model(dummy_input)
    else:
        model = model.to("cuda")
        model(dummy_input)

    # Set up tokenizer
    if not tokenizer.eos_token:
        if tokenizer.bos_token:
            tokenizer.eos_token = tokenizer.bos_token
            print("bos_token used as eos_token")
        else:
            raise ValueError("No eos_token or bos_token found")
    try:
        tokenizer.pad_token = tokenizer.eos_token
    except AttributeError:
        print("Not setting pad_token to eos_token")
        pass

    evaluator = Evaluator(accelerator, model, tokenizer, eval_args)

    try:
        unparsed_result = evaluator.evaluate(task)
    except Exception as e:
        print(f"Error evaluating task {task}: {e}")
        raise
    if eval_args.n_samples == 1:
        result = {task: {"pass@1": unparsed_result["pass@1"]}}
    elif eval_args.n_samples == 10:
        result = {task: {"pass@10": unparsed_result["pass@10"]}}
    else:
        result = {task: unparsed_result}

    return result

def evaluate_model(model_repo_name, revision_name="main", dmx_config="BASELINE", task_name="humaneval", pass_k=1):
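    """
    Load `model_repo_name` at `revision_name` from the Hugging Face Hub, then run
    `task_name` through `code_eval` with the given DMX config and pass@k setting.

    Returns a `(result, error_message)` tuple: on success `error_message` is None;
    on failure `result` is None and `error_message` holds the traceback.
    """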
    model_kwargs = {
        "revision": revision_name,
        "trust_remote_code": True,
    }

    if pass_k == 10:
        eval_args = {
            "max_length_generation": 1024,
            "batch_size": 2,
            "n_samples": 10,
            "temperature": 0.8,
            "top_p": 0.95,
        }
    else:
        eval_args = {
            "max_length_generation": 1024,
            "batch_size": 1,
            "n_samples": 1,
            "do_sample": False,
            "temperature": None,
            "top_p": None,
            "top_k": None,
        }
    
    model = AutoModelForCausalLM.from_pretrained(model_repo_name, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(
        model_repo_name,
        **model_kwargs,
        padding_side="right",
    )

    try:
        result = code_eval(model, tokenizer, task_name, dmx_config, args=eval_args)
        return result, None
    except Exception as e:
        error_message = f"Error during evaluation: {str(e)}\n\n{traceback.format_exc()}"
        print(error_message)
        return None, error_message
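
# Minimal usage sketch (illustrative, not part of the evaluation API): runs a
# HumanEval pass@1 evaluation. The checkpoint name below is a placeholder;
# replace it with the model you want to evaluate. A CUDA device is assumed by
# code_eval above.
if __name__ == "__main__":
    result, error = evaluate_model(
        "bigcode/starcoderbase-1b",  # example checkpoint, swap in your own
        revision_name="main",
        dmx_config="BASELINE",
        task_name="humaneval",
        pass_k=1,
    )
    if error is None:
        print(result)  # e.g. {"humaneval": {"pass@1": ...}}
    else:
        print(error)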