khulnasoft committed (verified)
Commit fd26ab1 · 1 Parent(s): faf6836

Create lighteval_tasks.py

Files changed (1): lighteval_tasks.py (+251, -0)
lighteval_tasks.py ADDED
@@ -0,0 +1,251 @@
import re
from typing import List, Tuple

from lighteval.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES

_TASKS_STRINGS: List[Tuple[LightevalTaskConfig, str]] = []
_TASKS: List[LightevalTaskConfig] = []

## COMMON_SENSE_REASONING_TASKS ##
COMMON_SENSE_REASONING_TASKS = [
    LightevalTaskConfig(
        name="hellaswag",
        prompt_function="hellaswag_prompt",
        hf_repo="hellaswag",
        hf_subset="default",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    LightevalTaskConfig(
        name="winogrande",
        prompt_function="winogrande",
        hf_repo="winogrande",
        hf_subset="winogrande_xl",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    LightevalTaskConfig(
        name="piqa",
        prompt_function="piqa_harness",
        hf_repo="piqa",
        hf_subset="plain_text",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    LightevalTaskConfig(
        name="siqa",
        prompt_function="siqa_prompt",
        hf_repo="lighteval/siqa",
        hf_subset="default",
        hf_avail_splits=["train", "validation"],
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    LightevalTaskConfig(
        name="openbookqa",
        prompt_function="openbookqa",
        hf_repo="openbookqa",
        hf_subset="main",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    LightevalTaskConfig(
        name="arc:easy",
        prompt_function="arc",
        hf_repo="ai2_arc",
        hf_subset="ARC-Easy",
        evaluation_splits=["test"],
        generation_size=1,
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    LightevalTaskConfig(
        name="arc:challenge",
        prompt_function="arc",
        hf_repo="ai2_arc",
        hf_subset="ARC-Challenge",
        evaluation_splits=["test"],
        generation_size=1,
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
    LightevalTaskConfig(
        name="commonsense_qa",
        prompt_function="commonsense_qa_prompt",
        hf_repo="commonsense_qa",
        hf_subset="default",
        metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
    ),
]


def commonsense_qa_prompt(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["question"],
        choices=[f" {c}" for c in line["choices"]["text"]],
        gold_index=LETTER_INDICES.index(line["answerKey"].strip()),
        instruction="",
    )


def siqa_prompt(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["context"] + " " + line["question"],
        choices=[f" {c}" for c in [line["answerA"], line["answerB"], line["answerC"]]],
        gold_index=int(line["label"]) - 1,
        instruction="",
    )


def hellaswag_prompt(line, task_name: str = None):
    def preprocess(text):
        """Comes from AiHarness"""
        # text = text.strip()
        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
        text = text.replace(" [title]", ". ")
        text = re.sub("\\[.*?\\]", "", text)
        text = text.replace("  ", " ")
        return text

    ctx = f"{line['ctx_a']} {line['ctx_b'].capitalize()} "
    return Doc(
        task_name=task_name,
        query=preprocess(line["activity_label"] + ": " + ctx),
        choices=[" " + preprocess(ending) for ending in line["endings"]],
        gold_index=int(line["label"]) if line["label"] != "" else -1,  # -1 for test
        # "metric": "choices_loglikelihood",
    )


# 0-shot for common sense
COMMON_SENSE_REASONING_STRING = [(t, f"custom|{t.name}|0|1") for t in COMMON_SENSE_REASONING_TASKS]
_TASKS_STRINGS.extend(COMMON_SENSE_REASONING_STRING)
_TASKS += COMMON_SENSE_REASONING_TASKS

## MMLU ##
class CustomMMLUEvaluationTask(LightevalTaskConfig):
    def __init__(
        self,
        name,
        prompt_function="mmlu_prompt",
        hf_repo="lighteval/mmlu",
        hf_subset=None,
        # metric=[Metrics.loglikelihood_acc_single_token],
        metric=[Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm_nospace],
        hf_avail_splits=None,
        evaluation_splits=["test"],
        few_shots_split="dev",
        few_shots_select=None,
        suite=None,
        generation_size=-1,
        stop_sequence=None,
        output_regex=None,
        frozen=False,
    ):
        super().__init__(
            name=name,
            prompt_function=prompt_function,
            hf_repo=hf_repo,
            hf_subset=hf_subset,
            metric=metric,
            hf_avail_splits=hf_avail_splits,
            evaluation_splits=evaluation_splits,
            few_shots_split=few_shots_split,
            few_shots_select=few_shots_select,
            suite=suite,
            generation_size=generation_size,
            stop_sequence=stop_sequence,
            output_regex=output_regex,
            frozen=frozen,
        )


MMLU_TASKS = [
    CustomMMLUEvaluationTask(name="mmlu:abstract_algebra", hf_subset="abstract_algebra"),
    CustomMMLUEvaluationTask(name="mmlu:anatomy", hf_subset="anatomy"),
    CustomMMLUEvaluationTask(name="mmlu:astronomy", hf_subset="astronomy"),
    CustomMMLUEvaluationTask(name="mmlu:business_ethics", hf_subset="business_ethics"),
    CustomMMLUEvaluationTask(name="mmlu:clinical_knowledge", hf_subset="clinical_knowledge"),
    CustomMMLUEvaluationTask(name="mmlu:college_biology", hf_subset="college_biology"),
    CustomMMLUEvaluationTask(name="mmlu:college_chemistry", hf_subset="college_chemistry"),
    CustomMMLUEvaluationTask(name="mmlu:college_computer_science", hf_subset="college_computer_science"),
    CustomMMLUEvaluationTask(name="mmlu:college_mathematics", hf_subset="college_mathematics"),
    CustomMMLUEvaluationTask(name="mmlu:college_medicine", hf_subset="college_medicine"),
    CustomMMLUEvaluationTask(name="mmlu:college_physics", hf_subset="college_physics"),
    CustomMMLUEvaluationTask(name="mmlu:computer_security", hf_subset="computer_security"),
    CustomMMLUEvaluationTask(name="mmlu:conceptual_physics", hf_subset="conceptual_physics"),
    CustomMMLUEvaluationTask(name="mmlu:econometrics", hf_subset="econometrics"),
    CustomMMLUEvaluationTask(name="mmlu:electrical_engineering", hf_subset="electrical_engineering"),
    CustomMMLUEvaluationTask(name="mmlu:elementary_mathematics", hf_subset="elementary_mathematics"),
    CustomMMLUEvaluationTask(name="mmlu:formal_logic", hf_subset="formal_logic"),
    CustomMMLUEvaluationTask(name="mmlu:global_facts", hf_subset="global_facts"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_biology", hf_subset="high_school_biology"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_chemistry", hf_subset="high_school_chemistry"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_computer_science", hf_subset="high_school_computer_science"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_european_history", hf_subset="high_school_european_history"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_geography", hf_subset="high_school_geography"),
    CustomMMLUEvaluationTask(
        name="mmlu:high_school_government_and_politics", hf_subset="high_school_government_and_politics"
    ),
    CustomMMLUEvaluationTask(name="mmlu:high_school_macroeconomics", hf_subset="high_school_macroeconomics"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_mathematics", hf_subset="high_school_mathematics"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_microeconomics", hf_subset="high_school_microeconomics"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_physics", hf_subset="high_school_physics"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_psychology", hf_subset="high_school_psychology"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_statistics", hf_subset="high_school_statistics"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_us_history", hf_subset="high_school_us_history"),
    CustomMMLUEvaluationTask(name="mmlu:high_school_world_history", hf_subset="high_school_world_history"),
    CustomMMLUEvaluationTask(name="mmlu:human_aging", hf_subset="human_aging"),
    CustomMMLUEvaluationTask(name="mmlu:human_sexuality", hf_subset="human_sexuality"),
    CustomMMLUEvaluationTask(name="mmlu:international_law", hf_subset="international_law"),
    CustomMMLUEvaluationTask(name="mmlu:jurisprudence", hf_subset="jurisprudence"),
    CustomMMLUEvaluationTask(name="mmlu:logical_fallacies", hf_subset="logical_fallacies"),
    CustomMMLUEvaluationTask(name="mmlu:machine_learning", hf_subset="machine_learning"),
    CustomMMLUEvaluationTask(name="mmlu:management", hf_subset="management"),
    CustomMMLUEvaluationTask(name="mmlu:marketing", hf_subset="marketing"),
    CustomMMLUEvaluationTask(name="mmlu:medical_genetics", hf_subset="medical_genetics"),
    CustomMMLUEvaluationTask(name="mmlu:miscellaneous", hf_subset="miscellaneous"),
    CustomMMLUEvaluationTask(name="mmlu:moral_disputes", hf_subset="moral_disputes"),
    CustomMMLUEvaluationTask(name="mmlu:moral_scenarios", hf_subset="moral_scenarios"),
    CustomMMLUEvaluationTask(name="mmlu:nutrition", hf_subset="nutrition"),
    CustomMMLUEvaluationTask(name="mmlu:philosophy", hf_subset="philosophy"),
    CustomMMLUEvaluationTask(name="mmlu:prehistory", hf_subset="prehistory"),
    CustomMMLUEvaluationTask(name="mmlu:professional_accounting", hf_subset="professional_accounting"),
    CustomMMLUEvaluationTask(name="mmlu:professional_law", hf_subset="professional_law"),
    CustomMMLUEvaluationTask(name="mmlu:professional_medicine", hf_subset="professional_medicine"),
    CustomMMLUEvaluationTask(name="mmlu:professional_psychology", hf_subset="professional_psychology"),
    CustomMMLUEvaluationTask(name="mmlu:public_relations", hf_subset="public_relations"),
    CustomMMLUEvaluationTask(name="mmlu:security_studies", hf_subset="security_studies"),
    CustomMMLUEvaluationTask(name="mmlu:sociology", hf_subset="sociology"),
    CustomMMLUEvaluationTask(name="mmlu:us_foreign_policy", hf_subset="us_foreign_policy"),
    CustomMMLUEvaluationTask(name="mmlu:virology", hf_subset="virology"),
    CustomMMLUEvaluationTask(name="mmlu:world_religions", hf_subset="world_religions"),
]


def mmlu_prompt(line, task_name: str = None):
    """MMLU prompt without letters"""
    topic = line["subject"]
    prompt = f"The following are questions about {topic.replace('_', ' ')}.\nQuestion: "
    prompt += line["question"] + "\nAnswer:"

    return Doc(
        task_name=task_name,
        query=prompt,
        choices=[f" {c}" for c in line["choices"]],
        gold_index=line["answer"],
        instruction=f"The following are questions about {topic.replace('_', ' ')}.\n",
    )


MMLU_STRING = [(t, f"custom|{t.name}|0|1") for t in MMLU_TASKS]
_TASKS_STRINGS.extend(MMLU_STRING)
_TASKS += MMLU_TASKS

# common sense reasoning + mmlu
EARLY_SIGNAL_TASKS = ",".join([t[1] for t in COMMON_SENSE_REASONING_STRING] + [t[1] for t in MMLU_STRING])

# Convert to dict for lighteval
TASKS_TABLE = [task.as_dict() for task in _TASKS]
# You can have a few pre-organised groups of tasks
TASKS_GROUPS = {
    "early-signal": EARLY_SIGNAL_TASKS,
}
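
A quick sanity check (not part of the commit) can confirm what this module exposes before pointing lighteval at it as a custom-tasks file; the exact CLI option for passing such a file varies across lighteval releases, so check the installed version. The sketch below assumes lighteval and its dependencies are importable and that lighteval_tasks.py sits in the working directory; the variable names come from the file above, everything else is illustrative.

# sanity_check.py: load the custom task module and inspect what it registers
import importlib.util

spec = importlib.util.spec_from_file_location("lighteval_tasks", "lighteval_tasks.py")
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)  # runs the file, building all task configs

# 8 common-sense tasks + 57 MMLU subsets
print(len(module.TASKS_TABLE))  # expected: 65

# "early-signal" is a comma-separated list of "custom|<task>|0|1" task strings
print(module.TASKS_GROUPS["early-signal"].split(",")[:3])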