Spaces:
Running
Running
FIX: MATH
#4
by
Cookize
- opened
- .gitignore +1 -5
- README.md +4 -9
- index.html +0 -0
- pyproject.toml +0 -14
- tasks.py +164 -188
- tlem.py +53 -142
- utils.py +278 -112
.gitignore
CHANGED
@@ -1,6 +1,2 @@
|
|
1 |
__pycache__
|
2 |
-
|
3 |
-
tests
|
4 |
-
|
5 |
-
README_files
|
6 |
-
.ipynb_checkpoints
|
|
|
1 |
__pycache__
|
2 |
+
tlem.ju.py
|
|
|
|
|
|
|
|
README.md
CHANGED
@@ -1,10 +1,5 @@
|
|
1 |
-
---
|
2 |
-
title: TLEM
|
3 |
-
emoji: 🐢
|
4 |
-
colorFrom: yellow
|
5 |
-
colorTo: purple
|
6 |
-
sdk: static
|
7 |
-
pinned: false
|
8 |
-
---
|
9 |
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
|
2 |
+
|
3 |
+
# Transparent LLMs Evaluation Metrics
|
4 |
+
|
5 |
+
> LLMs belong to *tout le monde*
|
index.html
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
DELETED
@@ -1,14 +0,0 @@
|
|
1 |
-
[tool.poetry]
|
2 |
-
name = "tlem"
|
3 |
-
version = "0.1.0"
|
4 |
-
description = ""
|
5 |
-
authors = ["fecet <[email protected]>"]
|
6 |
-
readme = "README.md"
|
7 |
-
|
8 |
-
[tool.poetry.dependencies]
|
9 |
-
python = "^3.10"
|
10 |
-
|
11 |
-
|
12 |
-
[build-system]
|
13 |
-
requires = ["poetry-core"]
|
14 |
-
build-backend = "poetry.core.masonry.api"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tasks.py
CHANGED
@@ -14,7 +14,6 @@ from .utils import *
|
|
14 |
from evaluate import load
|
15 |
from collections import defaultdict
|
16 |
import sys
|
17 |
-
from pathlib import Path
|
18 |
|
19 |
|
20 |
# if sys.version_info >= (3, 9):
|
@@ -59,15 +58,13 @@ class Task:
|
|
59 |
dataset_name: str | tuple[str, str] = ("gsm8k", "main")
|
60 |
split: str = "test"
|
61 |
# metrics: list[str] = field(default_factory=list)
|
62 |
-
metric_name: str | tuple[str, str] = ("sustech/tlem", "
|
63 |
input_column: str = "question"
|
64 |
label_column: str = ""
|
65 |
-
output_column: str = "generated_text"
|
66 |
prompt: Optional[Callable | str] = None
|
67 |
few_shot: int = 0
|
68 |
few_shot_from: Optional[str] = None
|
69 |
# results: dict[str, Any] = field(default_factory=dict)
|
70 |
-
# outputs: Optional[list] = field(default_factory=list)
|
71 |
|
72 |
def __post_init__(self):
|
73 |
names = (
|
@@ -75,7 +72,7 @@ class Task:
|
|
75 |
if isinstance(self.dataset_name, str)
|
76 |
else list(self.dataset_name)
|
77 |
)
|
78 |
-
names[0] =
|
79 |
|
80 |
self.name = "-".join(names) + f"-{self.split}"
|
81 |
if isinstance(self.prompt, str):
|
@@ -87,21 +84,10 @@ class Task:
|
|
87 |
}
|
88 |
self.label_column = self.label_column or self.input_column
|
89 |
|
90 |
-
def __eq__(self, __value: object) -> bool:
|
91 |
-
return self.name == __value.name
|
92 |
-
|
93 |
@cached_property
|
94 |
def samples(self):
|
95 |
return self.dataset[self.input_column]
|
96 |
|
97 |
-
@cached_property
|
98 |
-
def labels(self):
|
99 |
-
return self.dataset[self.label_column]
|
100 |
-
|
101 |
-
@cached_property
|
102 |
-
def outputs(self):
|
103 |
-
return self.dataset[self.output_column]
|
104 |
-
|
105 |
@cached_property
|
106 |
def dataset(self):
|
107 |
ds = (
|
@@ -132,7 +118,6 @@ class Task:
|
|
132 |
shots = shots.map(
|
133 |
lambda example: {
|
134 |
self.input_column: example[self.input_column]
|
135 |
-
+ "\n"
|
136 |
+ example[self.label_column],
|
137 |
}
|
138 |
)[self.input_column]
|
@@ -155,37 +140,33 @@ class Task:
|
|
155 |
if isinstance(self.metric_name, str)
|
156 |
else load(*self.metric_name)
|
157 |
)
|
158 |
-
return metric
|
159 |
-
|
160 |
-
@property
|
161 |
-
def result(self) -> dict:
|
162 |
-
assert self.outputs, "Please run the task first."
|
163 |
-
results = self.metric(self.outputs, self.labels)
|
164 |
-
# logging.info(f"{self.name}:{results}")
|
165 |
-
return results
|
166 |
|
|
|
167 |
def run(
|
168 |
self,
|
169 |
pipeline,
|
170 |
):
|
171 |
-
if self.
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
|
|
|
|
|
|
|
|
|
|
184 |
|
185 |
-
|
186 |
-
self.dataset.select_columns(
|
187 |
-
[self.input_column, self.output_column, self.label_column]
|
188 |
-
).save_to_disk(path)
|
189 |
|
190 |
|
191 |
def multichoice(responses: Any, references: list[str]):
|
@@ -209,8 +190,10 @@ def multichoice_zh(responses: Any, references: list[str]):
|
|
209 |
class Metrics:
|
210 |
cmmlu = multichoice_zh
|
211 |
mmlu = multichoice
|
212 |
-
|
213 |
-
ceval
|
|
|
|
|
214 |
|
215 |
def winogrande(responses: list[str], answers: list[str | int]):
|
216 |
responses = [first_option_postprocess(pred, options="AB") for pred in responses]
|
@@ -235,11 +218,18 @@ class Metrics:
|
|
235 |
return responses, answers
|
236 |
|
237 |
def drop(responses: list[str], answers: list[list]):
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
243 |
|
244 |
def bbh_mcq(responses: list[str], answers: list[str | int]):
|
245 |
if len(responses) != len(answers):
|
@@ -270,25 +260,17 @@ class Metrics:
|
|
270 |
|
271 |
return responses, answers
|
272 |
|
273 |
-
def boolq(responses: list[str], answers: list[str | int]):
|
274 |
-
|
275 |
-
responses = [first_capital_postprocess(response) for response in responses]
|
276 |
-
answers = ["A" if answer else "B" for answer in answers]
|
277 |
-
|
278 |
-
return responses, answers
|
279 |
-
|
280 |
def MATH(responses: list[str], answers: list[str]):
|
281 |
-
extract_responses =
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
)
|
289 |
-
|
290 |
-
|
291 |
-
return sync_pipe(is_equiv)(zip(extract_responses, extract_answers))
|
292 |
|
293 |
|
294 |
class CMMLU:
|
@@ -301,7 +283,7 @@ class CMMLU:
|
|
301 |
for choice in list("ABCD"):
|
302 |
prompt += f"\n{choice}. {example[choice]}"
|
303 |
|
304 |
-
|
305 |
return {"prompt": prompt}
|
306 |
|
307 |
subcategories = {
|
@@ -398,6 +380,7 @@ class CMMLU:
|
|
398 |
],
|
399 |
"Other": ["other"],
|
400 |
"China specific": ["china specific"],
|
|
|
401 |
}
|
402 |
|
403 |
@classmethod
|
@@ -624,29 +607,73 @@ class DROP:
|
|
624 |
input_column = "input"
|
625 |
label_column = "answers"
|
626 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
627 |
@classmethod
|
628 |
def prompt_drop(cls, example):
|
629 |
-
prompt =
|
|
|
|
|
630 |
|
631 |
-
|
632 |
-
|
633 |
-
|
634 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
635 |
|
636 |
@classmethod
|
637 |
def suite(
|
638 |
cls,
|
639 |
):
|
640 |
-
|
641 |
-
|
642 |
-
|
643 |
-
|
644 |
-
|
645 |
-
|
646 |
-
|
647 |
-
|
648 |
-
split="validation",
|
649 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
650 |
|
651 |
|
652 |
class HellaSwag:
|
@@ -717,17 +744,28 @@ class ARC:
|
|
717 |
|
718 |
@classmethod
|
719 |
def suite(cls):
|
720 |
-
|
721 |
-
|
722 |
-
|
723 |
-
|
724 |
-
|
725 |
-
|
726 |
-
|
727 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
728 |
)
|
729 |
-
for subset in cls.categories
|
730 |
-
]
|
731 |
|
732 |
return suite
|
733 |
|
@@ -789,107 +827,45 @@ class BBH:
|
|
789 |
def suite(
|
790 |
cls,
|
791 |
):
|
792 |
-
|
793 |
-
|
794 |
-
|
795 |
-
|
796 |
-
|
797 |
-
|
798 |
-
|
799 |
-
|
800 |
-
|
801 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
802 |
)
|
803 |
-
|
804 |
-
|
805 |
-
|
806 |
-
|
807 |
-
|
808 |
-
|
809 |
-
|
810 |
-
|
811 |
-
|
812 |
-
|
813 |
)
|
814 |
-
)
|
815 |
-
|
816 |
-
return suite
|
817 |
-
|
818 |
-
|
819 |
-
class BoolQ:
|
820 |
-
input_column = "input"
|
821 |
-
label_column = "answer"
|
822 |
-
|
823 |
-
@classmethod
|
824 |
-
def prompt_boolq(cls, example, chat=False):
|
825 |
-
|
826 |
-
prompt = f"{example['passage']}\nQuestion: {example['question']}\nA. Yes\nB. No\nAnswer: "
|
827 |
-
|
828 |
-
return {"input": prompt}
|
829 |
-
|
830 |
-
@classmethod
|
831 |
-
def suite(cls, chat: bool):
|
832 |
-
|
833 |
-
suite = [
|
834 |
-
Task(
|
835 |
-
dataset_name="boolq",
|
836 |
-
metric_name=("sustech/tlem", "boolq"),
|
837 |
-
input_column=cls.input_column,
|
838 |
-
label_column=cls.label_column,
|
839 |
-
prompt=partial(cls.prompt_boolq, chat=chat),
|
840 |
-
few_shot=0 if chat else 5,
|
841 |
-
few_shot_from="train",
|
842 |
-
split="validation",
|
843 |
-
)
|
844 |
-
]
|
845 |
|
846 |
return suite
|
847 |
-
|
848 |
-
class TruthfulQAMC1:
|
849 |
-
input_column = "input"
|
850 |
-
label_column = "answer"
|
851 |
-
|
852 |
-
@classmethod
|
853 |
-
def prompt_truthful_qa(cls, example):
|
854 |
-
|
855 |
-
target = example["mc1_targets"]
|
856 |
-
choices = target["choices"]
|
857 |
-
labels = target["labels"]
|
858 |
-
|
859 |
-
prompt = f"The following is a multiple-choice question. Please choose the most suitable one as the answer to this question.\n\n"
|
860 |
-
prompt += example["question"]
|
861 |
-
|
862 |
-
answer = []
|
863 |
-
|
864 |
-
for idx, choice, label in zip(list("ABCDEFGHIJ")[:len(choices)], choices, labels):
|
865 |
-
|
866 |
-
prompt += f"\n{idx}. {choice}"
|
867 |
-
|
868 |
-
if label == 1:
|
869 |
-
answer = idx
|
870 |
-
|
871 |
-
prompt += "\nAnswer: "
|
872 |
-
|
873 |
-
return {
|
874 |
-
"input": prompt,
|
875 |
-
"answer": answer
|
876 |
-
}
|
877 |
-
|
878 |
-
@classmethod
|
879 |
-
def suite(cls):
|
880 |
-
suite = [
|
881 |
-
Task(
|
882 |
-
dataset_name=("truthful_qa", "multiple_choice"),
|
883 |
-
metric_name=("sustech/tlem", "truthful_qa_mc1"),
|
884 |
-
input_column=cls.input_column,
|
885 |
-
label_column=cls.label_column,
|
886 |
-
prompt=partial(cls.prompt_truthful_qa),
|
887 |
-
few_shot=0,
|
888 |
-
split="validation",
|
889 |
-
)
|
890 |
-
]
|
891 |
|
892 |
-
return suite
|
893 |
|
894 |
class CEVAL:
|
895 |
input_column = "input"
|
|
|
14 |
from evaluate import load
|
15 |
from collections import defaultdict
|
16 |
import sys
|
|
|
17 |
|
18 |
|
19 |
# if sys.version_info >= (3, 9):
|
|
|
58 |
dataset_name: str | tuple[str, str] = ("gsm8k", "main")
|
59 |
split: str = "test"
|
60 |
# metrics: list[str] = field(default_factory=list)
|
61 |
+
metric_name: str | tuple[str, str] = ("sustech/tlem", "gsm8k")
|
62 |
input_column: str = "question"
|
63 |
label_column: str = ""
|
|
|
64 |
prompt: Optional[Callable | str] = None
|
65 |
few_shot: int = 0
|
66 |
few_shot_from: Optional[str] = None
|
67 |
# results: dict[str, Any] = field(default_factory=dict)
|
|
|
68 |
|
69 |
def __post_init__(self):
|
70 |
names = (
|
|
|
72 |
if isinstance(self.dataset_name, str)
|
73 |
else list(self.dataset_name)
|
74 |
)
|
75 |
+
names[0] = names[0].split("/")[-1]
|
76 |
|
77 |
self.name = "-".join(names) + f"-{self.split}"
|
78 |
if isinstance(self.prompt, str):
|
|
|
84 |
}
|
85 |
self.label_column = self.label_column or self.input_column
|
86 |
|
|
|
|
|
|
|
87 |
@cached_property
|
88 |
def samples(self):
|
89 |
return self.dataset[self.input_column]
|
90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
@cached_property
|
92 |
def dataset(self):
|
93 |
ds = (
|
|
|
118 |
shots = shots.map(
|
119 |
lambda example: {
|
120 |
self.input_column: example[self.input_column]
|
|
|
121 |
+ example[self.label_column],
|
122 |
}
|
123 |
)[self.input_column]
|
|
|
140 |
if isinstance(self.metric_name, str)
|
141 |
else load(*self.metric_name)
|
142 |
)
|
143 |
+
return metric
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
|
145 |
+
# @cache
|
146 |
def run(
|
147 |
self,
|
148 |
pipeline,
|
149 |
):
|
150 |
+
if (outputs := pipeline(self.samples)) is None:
|
151 |
+
logging.warning("pipeline returns None")
|
152 |
+
return
|
153 |
+
self.outputs = outputs
|
154 |
+
try:
|
155 |
+
try:
|
156 |
+
result = self.metric._compute(
|
157 |
+
responses=outputs, references=self.dataset[self.label_column]
|
158 |
+
)
|
159 |
+
except Exception as e:
|
160 |
+
result = self.metric.compute(
|
161 |
+
responses=outputs, references=self.dataset[self.label_column]
|
162 |
+
)
|
163 |
+
except Exception as e:
|
164 |
+
result = outputs
|
165 |
+
# if log:
|
166 |
+
# name = name or pipeline.__name__
|
167 |
+
# self.results[name] = result
|
168 |
|
169 |
+
return result
|
|
|
|
|
|
|
170 |
|
171 |
|
172 |
def multichoice(responses: Any, references: list[str]):
|
|
|
190 |
class Metrics:
|
191 |
cmmlu = multichoice_zh
|
192 |
mmlu = multichoice
|
193 |
+
|
194 |
+
def ceval(responses: list[str], answers: list[str | int]):
|
195 |
+
responses = [extract_choice_zh(pred) for pred in responses]
|
196 |
+
return responses, answers
|
197 |
|
198 |
def winogrande(responses: list[str], answers: list[str | int]):
|
199 |
responses = [first_option_postprocess(pred, options="AB") for pred in responses]
|
|
|
218 |
return responses, answers
|
219 |
|
220 |
def drop(responses: list[str], answers: list[list]):
|
221 |
+
if len(responses) != len(answers):
|
222 |
+
return {"error": "predictions and references have different " "length"}
|
223 |
+
responses = [general_postprocess(pred) for pred in responses]
|
224 |
+
processed_answers = [[general_postprocess(j) for j in i] for i in answers]
|
225 |
+
matched_answers = []
|
226 |
+
for pred, ans, origin_ans in zip(responses, processed_answers, answers):
|
227 |
+
if pred in ans or pred in origin_ans:
|
228 |
+
matched_answers.append(pred)
|
229 |
+
else:
|
230 |
+
matched_answers.append(ans[0])
|
231 |
+
|
232 |
+
return responses, matched_answers
|
233 |
|
234 |
def bbh_mcq(responses: list[str], answers: list[str | int]):
|
235 |
if len(responses) != len(answers):
|
|
|
260 |
|
261 |
return responses, answers
|
262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
def MATH(responses: list[str], answers: list[str]):
|
264 |
+
extract_responses = []
|
265 |
+
for response in responses:
|
266 |
+
indices = [pos for pos, char in enumerate(response) if char == "$"]
|
267 |
+
if len(indices) <= 2:
|
268 |
+
ans = ""
|
269 |
+
else:
|
270 |
+
ans = response[indices[-2] + 1 : indices[-1]]
|
271 |
+
extract_responses.append(strip_string(ans))
|
272 |
+
extract_answers = [strip_string(get_answer(answer)) for answer in answers]
|
273 |
+
return extract_responses, extract_answers
|
|
|
274 |
|
275 |
|
276 |
class CMMLU:
|
|
|
283 |
for choice in list("ABCD"):
|
284 |
prompt += f"\n{choice}. {example[choice]}"
|
285 |
|
286 |
+
prompt += "\n答案:"
|
287 |
return {"prompt": prompt}
|
288 |
|
289 |
subcategories = {
|
|
|
380 |
],
|
381 |
"Other": ["other"],
|
382 |
"China specific": ["china specific"],
|
383 |
+
"Test": ["computer science"],
|
384 |
}
|
385 |
|
386 |
@classmethod
|
|
|
607 |
input_column = "input"
|
608 |
label_column = "answers"
|
609 |
|
610 |
+
icl_prompt = """\
|
611 |
+
Text: In the county, the population was spread out with 23.50% under the age of 18, 8.70% from 18 to 24, 29.70% from 25 to 44, 24.70% from 45 to 64, and 13.30% who were 65 years of age or older.
|
612 |
+
Question: How many more percent are under the age of 18 compared to the 18 to 24 group?
|
613 |
+
Anawer: According to the text, 23.5% are under the age of 18, and 8.7% are from ages 18 to 24. 23.5%-8.7%=14.8%. So the answer is 14.8.
|
614 |
+
|
615 |
+
Text: Playing in their second straight Thanksgiving game, the Eagles struggled especially on defense, where they were unable to stop the much-hyped Lions offense. The worst of it all was how unproven rookie Eric Rowe was tasked with covering wide receiver Calvin Johnson, leading to Johnson catching 3 touchdowns. Stafford’s five passing touchdowns, including three of them to Johnson was too much for the Eagles to overcome and for the second consecutive time this season, the Eagles gave up 45 points in a game. With the loss, the Eagles drop to 4-7 on the season and 6-1 when playing on Thanksgiving.
|
616 |
+
Question: How many TD passes did Stafford throw other than to Johnson?
|
617 |
+
Anawer: According to the text, Stafford threw 5 TD passes, 3 of which were to Johnson. 5-3=2. So the answer is 2.
|
618 |
+
|
619 |
+
Text: [PROMPT]
|
620 |
+
Question: [QUESTION]
|
621 |
+
Anawer:"""
|
622 |
+
|
623 |
+
categories = ["validation"]
|
624 |
+
|
625 |
@classmethod
|
626 |
def prompt_drop(cls, example):
|
627 |
+
prompt = cls.icl_prompt.replace("[PROMPT]", example["passage"]).replace(
|
628 |
+
"[QUESTION]", example["question"]
|
629 |
+
)
|
630 |
|
631 |
+
validated_answers = example["answers_spans"]["spans"]
|
632 |
+
validated_types = example["answers_spans"]["types"]
|
633 |
+
answers = []
|
634 |
+
for answer_item, answer_type in zip(validated_answers, validated_types):
|
635 |
+
# if answer_type == "number":
|
636 |
+
# answers.append(answer_item)
|
637 |
+
# elif any(answer_item['date'][i] for i in ['day', 'month', 'year']):
|
638 |
+
# d = [answer_item['date'][i] for i in ['day', 'month', 'year']]
|
639 |
+
# answers.append(' '.join(d).strip())
|
640 |
+
# else:
|
641 |
+
# for span in answer_item['spans']:
|
642 |
+
# answers.append(span)
|
643 |
+
answers.append(answer_item)
|
644 |
+
answers = list(set(answers))
|
645 |
+
|
646 |
+
return {cls.input_column: prompt, cls.label_column: answers}
|
647 |
|
648 |
@classmethod
|
649 |
def suite(
|
650 |
cls,
|
651 |
):
|
652 |
+
finer_categories = (
|
653 |
+
pd.Series(cls.categories) # noqa # type: ignore
|
654 |
+
.explode()
|
655 |
+
.reset_index()
|
656 |
+
.set_index(0)
|
657 |
+
.groupby(0)
|
658 |
+
.agg(list)["index"]
|
659 |
+
.to_dict()
|
|
|
660 |
)
|
661 |
+
suite = defaultdict(list)
|
662 |
+
categories = list(finer_categories.keys())
|
663 |
+
for cate in categories:
|
664 |
+
suite[cate].append(
|
665 |
+
Task(
|
666 |
+
("drop", cate),
|
667 |
+
metric_name=("sustech/tlem", "drop"),
|
668 |
+
input_column=cls.input_column,
|
669 |
+
label_column=cls.label_column,
|
670 |
+
prompt=partial(cls.prompt_drop),
|
671 |
+
few_shot=0,
|
672 |
+
split="validation",
|
673 |
+
)
|
674 |
+
)
|
675 |
+
|
676 |
+
return suite
|
677 |
|
678 |
|
679 |
class HellaSwag:
|
|
|
744 |
|
745 |
@classmethod
|
746 |
def suite(cls):
|
747 |
+
finer_categories = (
|
748 |
+
pd.Series(cls.categories) # noqa # type: ignore
|
749 |
+
.explode()
|
750 |
+
.reset_index()
|
751 |
+
.set_index(0)
|
752 |
+
.groupby(0)
|
753 |
+
.agg(list)["index"]
|
754 |
+
.to_dict()
|
755 |
+
)
|
756 |
+
suite = defaultdict(list)
|
757 |
+
categories = list(finer_categories.keys())
|
758 |
+
for cate in categories:
|
759 |
+
suite[cate].append(
|
760 |
+
Task(
|
761 |
+
("ai2_arc", cate),
|
762 |
+
metric_name=("sustech/tlem", "arc"),
|
763 |
+
input_column=cls.input_column,
|
764 |
+
label_column=cls.label_column,
|
765 |
+
prompt=partial(cls.prompt_arc),
|
766 |
+
few_shot=0,
|
767 |
+
)
|
768 |
)
|
|
|
|
|
769 |
|
770 |
return suite
|
771 |
|
|
|
827 |
def suite(
|
828 |
cls,
|
829 |
):
|
830 |
+
finer_categories = (
|
831 |
+
pd.Series(
|
832 |
+
cls.bbh_free_form_sets + cls.bbh_multiple_choice_sets
|
833 |
+
) # noqa # type: ignore
|
834 |
+
.explode()
|
835 |
+
.reset_index()
|
836 |
+
.set_index(0)
|
837 |
+
.groupby(0)
|
838 |
+
.agg(list)["index"]
|
839 |
+
.to_dict()
|
840 |
+
)
|
841 |
+
suite = defaultdict(list)
|
842 |
+
categories = list(finer_categories.keys())
|
843 |
+
for cate in categories:
|
844 |
+
if cate in cls.bbh_multiple_choice_sets:
|
845 |
+
suite[cate].append(
|
846 |
+
Task(
|
847 |
+
("lukaemon/bbh", cate),
|
848 |
+
metric_name=("sustech/tlem", "bbh_mcq"),
|
849 |
+
input_column=cls.input_column,
|
850 |
+
label_column=cls.label_column,
|
851 |
+
prompt=partial(cls.prompt_bbh, category=cate),
|
852 |
+
few_shot=0,
|
853 |
+
)
|
854 |
)
|
855 |
+
else:
|
856 |
+
suite[cate].append(
|
857 |
+
Task(
|
858 |
+
("lukaemon/bbh", cate),
|
859 |
+
metric_name=("sustech/tlem", "bbh_freefrom"),
|
860 |
+
input_column=cls.input_column,
|
861 |
+
label_column=cls.label_column,
|
862 |
+
prompt=partial(cls.prompt_bbh, category=cate),
|
863 |
+
few_shot=0,
|
864 |
+
)
|
865 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
866 |
|
867 |
return suite
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
868 |
|
|
|
869 |
|
870 |
class CEVAL:
|
871 |
input_column = "input"
|
tlem.py
CHANGED
@@ -1,6 +1,12 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
from typing import Any, Optional, Protocol, Iterable, Callable
|
|
|
4 |
from tqdm.auto import tqdm
|
5 |
from evaluate.evaluation_suite import EvaluationSuite
|
6 |
import evaluate
|
@@ -8,10 +14,7 @@ import numpy as np
|
|
8 |
import datasets
|
9 |
import pandas as pd
|
10 |
from .tasks import *
|
11 |
-
from .utils import
|
12 |
-
from itertools import chain
|
13 |
-
from copy import deepcopy
|
14 |
-
from . import utils
|
15 |
|
16 |
|
17 |
class ReasoningMetric(evaluate.Metric):
|
@@ -43,60 +46,32 @@ class ReasoningMetric(evaluate.Metric):
|
|
43 |
reference_urls=["http://path.to.reference.url/new_module"],
|
44 |
)
|
45 |
|
46 |
-
def _compute(self, responses, references):
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
|
|
66 |
return results
|
67 |
|
68 |
|
69 |
class Suite(EvaluationSuite):
|
70 |
task_class = Task
|
71 |
-
utils = utils
|
72 |
-
supported_datasets = [
|
73 |
-
"arc",
|
74 |
-
"hellaswag",
|
75 |
-
"mmlu-chat",
|
76 |
-
"winogrande",
|
77 |
-
"gsm8k",
|
78 |
-
"cmmlu-chat",
|
79 |
-
"ceval-chat",
|
80 |
-
"bbh",
|
81 |
-
"drop",
|
82 |
-
"MATH",
|
83 |
-
]
|
84 |
-
|
85 |
-
def __getitem__(self, key) -> Task:
|
86 |
-
match key:
|
87 |
-
case str():
|
88 |
-
return self.suite[key]
|
89 |
-
case slice() | int():
|
90 |
-
return self.tasks[key]
|
91 |
-
|
92 |
-
def agg(self, suite):
|
93 |
-
for cate, tasks in suite.items():
|
94 |
-
if isinstance(tasks, dict):
|
95 |
-
suite[cate] = self.agg(tasks)
|
96 |
-
else:
|
97 |
-
suite[cate] = np.mean([pd.Series(task.result).mean() for task in tasks])
|
98 |
-
|
99 |
-
return suite
|
100 |
|
101 |
def run(
|
102 |
self,
|
@@ -104,26 +79,28 @@ class Suite(EvaluationSuite):
|
|
104 |
) -> dict[str, float]:
|
105 |
self.assert_suite_nonempty()
|
106 |
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
|
|
113 |
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
|
|
119 |
|
120 |
-
|
121 |
|
122 |
-
|
|
|
123 |
|
124 |
-
def
|
125 |
chat = False
|
126 |
-
suite={}
|
127 |
match name:
|
128 |
case _ if "chat" in name:
|
129 |
chat = True
|
@@ -132,8 +109,6 @@ class Suite(EvaluationSuite):
|
|
132 |
suite = MMLU.suite(chat=chat)
|
133 |
case _ if name.startswith("cmmlu"):
|
134 |
suite = CMMLU.suite(chat=chat)
|
135 |
-
case _ if name.startswith("ceval"):
|
136 |
-
suite = CEVAL.suite(chat=chat)
|
137 |
case "gsm8k":
|
138 |
suite = Task(
|
139 |
dataset_name=("gsm8k", "main"),
|
@@ -151,10 +126,8 @@ class Suite(EvaluationSuite):
|
|
151 |
suite = DROP.suite()
|
152 |
case "winogrande":
|
153 |
suite = Winogrande.suite()
|
154 |
-
case "
|
155 |
-
suite =
|
156 |
-
case _ if name.startswith("boolq"):
|
157 |
-
suite = BoolQ.suite(chat=chat)
|
158 |
case "mt_bench":
|
159 |
suite = Task(
|
160 |
dataset_name="SUSTech/mt_bench_judge",
|
@@ -165,78 +138,16 @@ class Suite(EvaluationSuite):
|
|
165 |
case "MATH" | "competition_math":
|
166 |
suite = Task(
|
167 |
dataset_name="hendrycks/competition_math",
|
168 |
-
|
|
|
169 |
metric_name=("sustech/tlem", "MATH"),
|
170 |
input_column="problem",
|
171 |
label_column="solution",
|
172 |
)
|
173 |
|
174 |
-
|
175 |
-
for name in [
|
176 |
-
"arc",
|
177 |
-
"hellaswag",
|
178 |
-
"mmlu-chat",
|
179 |
-
"winogrande",
|
180 |
-
"gsm8k",
|
181 |
-
# "truthful_qa",
|
182 |
-
"drop",
|
183 |
-
]:
|
184 |
-
suite.update(self.get_suite(name))
|
185 |
-
case "tlem":
|
186 |
-
for name in [
|
187 |
-
"arc",
|
188 |
-
"hellaswag",
|
189 |
-
"mmlu-chat",
|
190 |
-
"winogrande",
|
191 |
-
"gsm8k",
|
192 |
-
# "truthful_qa",
|
193 |
-
"cmmlu-chat",
|
194 |
-
"ceval-chat",
|
195 |
-
"bbh",
|
196 |
-
]:
|
197 |
-
suite.update(self.get_suite(name))
|
198 |
-
|
199 |
-
case "all":
|
200 |
-
for name in self.supported_datasets:
|
201 |
-
suite.update(self.get_suite(name))
|
202 |
-
case _:
|
203 |
-
raise NotImplementedError(
|
204 |
-
f"{name} is not supported in {self.supported_datasets}"
|
205 |
-
)
|
206 |
-
|
207 |
-
if isinstance(suite, Task):
|
208 |
-
suite = [suite]
|
209 |
-
suite = {name: suite}
|
210 |
-
|
211 |
-
return suite
|
212 |
-
|
213 |
-
def singleton(self, task):
|
214 |
-
try:
|
215 |
-
return self.tasks[self.tasks.index(task)]
|
216 |
-
except ValueError:
|
217 |
-
logging.debug(f"add {task.name} to suite.")
|
218 |
-
self.tasks.append(task)
|
219 |
-
logging.debug(self.tasks)
|
220 |
-
return self.tasks[-1]
|
221 |
-
|
222 |
-
def drop_duplicates(self, suite):
|
223 |
-
for category, tasks in suite.items():
|
224 |
-
match tasks:
|
225 |
-
case list():
|
226 |
-
suite[category] = [self.singleton(task) for task in tasks]
|
227 |
-
case dict():
|
228 |
-
suite[category] = self.drop_duplicates(tasks)
|
229 |
-
case _:
|
230 |
-
raise NotImplementedError
|
231 |
-
return suite
|
232 |
-
|
233 |
-
def load(self, name):
|
234 |
-
sub_suite = self.get_suite(name)
|
235 |
-
self.suite.update(sub_suite)
|
236 |
-
self.suite = self.drop_duplicates(self.suite)
|
237 |
-
# return self
|
238 |
|
239 |
def __init__(self, name="tlem"):
|
240 |
super().__init__(name)
|
241 |
-
self.
|
242 |
-
self.suite =
|
|
|
1 |
+
# %%
|
2 |
+
|
3 |
+
try:
|
4 |
+
from ipytorch import logging
|
5 |
+
except Exception as e:
|
6 |
+
import logging
|
7 |
|
8 |
from typing import Any, Optional, Protocol, Iterable, Callable
|
9 |
+
from numpy.lib import extract
|
10 |
from tqdm.auto import tqdm
|
11 |
from evaluate.evaluation_suite import EvaluationSuite
|
12 |
import evaluate
|
|
|
14 |
import datasets
|
15 |
import pandas as pd
|
16 |
from .tasks import *
|
17 |
+
from .utils import is_equiv
|
|
|
|
|
|
|
18 |
|
19 |
|
20 |
class ReasoningMetric(evaluate.Metric):
|
|
|
46 |
reference_urls=["http://path.to.reference.url/new_module"],
|
47 |
)
|
48 |
|
49 |
+
def _compute(self, responses, references, verbose=False):
|
50 |
+
extract_responses, extract_references = getattr(Metrics, self.config_name)(
|
51 |
+
responses, references
|
52 |
+
)
|
53 |
+
df = pd.DataFrame(
|
54 |
+
{
|
55 |
+
"responses": responses,
|
56 |
+
"references": references,
|
57 |
+
}
|
58 |
+
)
|
59 |
+
df["extract_responses"] = extract_responses
|
60 |
+
df["extract_references"] = extract_references
|
61 |
+
# print(df)
|
62 |
+
results = {
|
63 |
+
"Accuracy": (df["extract_references"] == df["extract_responses"])
|
64 |
+
.astype(int)
|
65 |
+
.mean(),
|
66 |
+
}
|
67 |
+
logging.info(results)
|
68 |
+
if verbose:
|
69 |
+
results["df"] = df
|
70 |
return results
|
71 |
|
72 |
|
73 |
class Suite(EvaluationSuite):
|
74 |
task_class = Task
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
def run(
|
77 |
self,
|
|
|
79 |
) -> dict[str, float]:
|
80 |
self.assert_suite_nonempty()
|
81 |
|
82 |
+
def run_tasks(tasks):
|
83 |
+
for task in (bar := tqdm(tasks, leave=False)):
|
84 |
+
bar.desc = f"complete {task.name}."
|
85 |
+
if task.name not in self.cached_result:
|
86 |
+
self.cached_result[task.name] = task.run(model_or_pipeline)
|
87 |
+
results = [self.cached_result[task.name] for task in tasks]
|
88 |
+
return pd.DataFrame(results).mean().to_dict()
|
89 |
|
90 |
+
if isinstance(self.suite, dict):
|
91 |
+
for category, tasks in (bar := tqdm(self.suite.items())):
|
92 |
+
bar.desc = f"complete {category}."
|
93 |
+
logging.warning(f"Combined results {category}: {run_tasks(tasks)}")
|
94 |
+
else:
|
95 |
+
logging.warning(f"Combined results: {run_tasks(self.suite)}")
|
96 |
|
97 |
+
return self.cached_result
|
98 |
|
99 |
+
def add(self, name):
|
100 |
+
self.load(name)
|
101 |
|
102 |
+
def load(self, name):
|
103 |
chat = False
|
|
|
104 |
match name:
|
105 |
case _ if "chat" in name:
|
106 |
chat = True
|
|
|
109 |
suite = MMLU.suite(chat=chat)
|
110 |
case _ if name.startswith("cmmlu"):
|
111 |
suite = CMMLU.suite(chat=chat)
|
|
|
|
|
112 |
case "gsm8k":
|
113 |
suite = Task(
|
114 |
dataset_name=("gsm8k", "main"),
|
|
|
126 |
suite = DROP.suite()
|
127 |
case "winogrande":
|
128 |
suite = Winogrande.suite()
|
129 |
+
case _ if name.startswith("ceval"):
|
130 |
+
suite = CEVAL.suite(chat=chat)
|
|
|
|
|
131 |
case "mt_bench":
|
132 |
suite = Task(
|
133 |
dataset_name="SUSTech/mt_bench_judge",
|
|
|
138 |
case "MATH" | "competition_math":
|
139 |
suite = Task(
|
140 |
dataset_name="hendrycks/competition_math",
|
141 |
+
split="test",
|
142 |
+
prompt="This is a math problem, please think step by step and slove it: {input_column}",
|
143 |
metric_name=("sustech/tlem", "MATH"),
|
144 |
input_column="problem",
|
145 |
label_column="solution",
|
146 |
)
|
147 |
|
148 |
+
self.suite = [suite] if isinstance(suite, Task) else suite
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
|
150 |
def __init__(self, name="tlem"):
|
151 |
super().__init__(name)
|
152 |
+
self.cached_result = {}
|
153 |
+
self.suite = []
|
utils.py
CHANGED
@@ -2,8 +2,6 @@ import logging
|
|
2 |
import re
|
3 |
import numpy as np
|
4 |
from typing import Any
|
5 |
-
from tqdm.auto import tqdm
|
6 |
-
import asyncio
|
7 |
|
8 |
NUMERIC_IN_EN = r"(?:[\s=+-/<>($:\.\*\\])(?=\S)((?:0|(?:\d{1,3}(?:,\d{3})+(?=\D|$))|(?:\d+))(?:\.\d+)?%?)(?:(?![^\s=+-/>)$:\.\*\\])|(?=, ))"
|
9 |
NUMERIC_IN_ZH = (
|
@@ -11,38 +9,6 @@ NUMERIC_IN_ZH = (
|
|
11 |
)
|
12 |
|
13 |
|
14 |
-
def async_pipe(func):
|
15 |
-
async def sync_function(samples):
|
16 |
-
if not isinstance(samples, list):
|
17 |
-
samples = [samples]
|
18 |
-
return await tqdm.gather(*[func(sample) for sample in samples], leave=False)
|
19 |
-
|
20 |
-
def sync_func(samples):
|
21 |
-
return asyncio.run(sync_function(samples))
|
22 |
-
|
23 |
-
return sync_func
|
24 |
-
|
25 |
-
|
26 |
-
def sync_pipe(func, progress=False):
|
27 |
-
def sync_func(samples):
|
28 |
-
return [
|
29 |
-
func(*sample) if isinstance(sample, tuple) else func(sample)
|
30 |
-
for sample in tqdm(samples, disable=not progress, leave=False)
|
31 |
-
]
|
32 |
-
|
33 |
-
return sync_func
|
34 |
-
|
35 |
-
|
36 |
-
def asis_backup(func):
|
37 |
-
def wrapper(sample):
|
38 |
-
try:
|
39 |
-
return func(sample)
|
40 |
-
except Exception:
|
41 |
-
return sample
|
42 |
-
|
43 |
-
return wrapper
|
44 |
-
|
45 |
-
|
46 |
def extract_choice_zh(gen):
|
47 |
# 答案是A | 选项是A | 应该选A选项
|
48 |
res = re.search(
|
@@ -74,27 +40,27 @@ def extract_choice_zh(gen):
|
|
74 |
def extract_choice(gen):
|
75 |
# answer is A | choice is A | choose A
|
76 |
res = re.search(
|
77 |
-
r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^
|
78 |
gen,
|
79 |
)
|
80 |
|
81 |
# A is correct | A is right
|
82 |
if res is None:
|
83 |
res = re.search(
|
84 |
-
r"\b(A|B|C|D
|
85 |
gen,
|
86 |
)
|
87 |
|
88 |
# straight answer: A
|
89 |
if res is None:
|
90 |
-
res = re.search(r"^(A|B|C|D
|
91 |
|
92 |
# simply extract the first appearred letter
|
93 |
if res is None:
|
94 |
-
res = re.search(r"(?<![a-zA-Z])(A|B|C|D
|
95 |
|
96 |
if res is None:
|
97 |
-
res = "
|
98 |
|
99 |
if isinstance(res, str):
|
100 |
return res
|
@@ -136,20 +102,17 @@ def extract_numeric(string, pattern=NUMERIC_IN_EN) -> str:
|
|
136 |
|
137 |
|
138 |
def remove_boxed(s):
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
|
|
|
|
146 |
|
147 |
-
|
148 |
-
assert s[: len(left)] == left, s
|
149 |
-
return s[len(left) : -1]
|
150 |
-
raise ValueError(f"Cannot remove boxed from {s}")
|
151 |
-
except AssertionError:
|
152 |
-
return s
|
153 |
|
154 |
|
155 |
def last_boxed_only_string(string):
|
@@ -182,33 +145,183 @@ def last_boxed_only_string(string):
|
|
182 |
return retval
|
183 |
|
184 |
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
else:
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
196 |
|
197 |
def first_option_postprocess(text: str, options: str) -> str:
|
198 |
"""Find first valid option for text."""
|
199 |
|
200 |
patterns = [
|
201 |
-
f
|
202 |
-
f
|
203 |
-
f
|
204 |
-
f
|
205 |
-
f
|
206 |
-
f
|
207 |
-
f
|
208 |
-
f
|
209 |
-
f
|
210 |
-
f
|
211 |
-
f
|
212 |
]
|
213 |
|
214 |
regexes = [re.compile(pattern) for pattern in patterns]
|
@@ -219,41 +332,44 @@ def first_option_postprocess(text: str, options: str) -> str:
|
|
219 |
for i in options:
|
220 |
if i in outputs:
|
221 |
return i
|
222 |
-
return
|
223 |
|
224 |
|
225 |
def first_capital_postprocess(text: str) -> str:
|
226 |
for t in text:
|
227 |
if t.isupper():
|
228 |
return t
|
229 |
-
return
|
230 |
|
231 |
|
232 |
def general_postprocess(text: str) -> str:
|
233 |
# Cut off the first newline, period, or comma
|
234 |
-
truncated_text = re.split(r
|
235 |
|
236 |
# Remove punctuation
|
237 |
-
no_punctuation = re.sub(r
|
238 |
|
239 |
# Remove article
|
240 |
-
no_articles = re.sub(r
|
|
|
|
|
|
|
241 |
|
242 |
# Remove duplicated blank spaces
|
243 |
-
cleaned_text = re.sub(r
|
244 |
|
245 |
return cleaned_text
|
246 |
|
247 |
-
|
248 |
def bbh_mcq_postprocess(text: str) -> str:
|
249 |
ans = text
|
250 |
-
ans_line = ans.split(
|
251 |
if len(ans_line) != 1:
|
252 |
ans = ans_line[-1].strip()
|
253 |
-
match = re.search(r
|
254 |
if match:
|
255 |
return f"({match.group(1)})"
|
256 |
-
match = re.search(r
|
257 |
if match:
|
258 |
return f"({match.group(1)})"
|
259 |
return f"({ans})"
|
@@ -261,17 +377,18 @@ def bbh_mcq_postprocess(text: str) -> str:
|
|
261 |
|
262 |
def bbh_freeform_postprocess(text: str) -> str:
|
263 |
ans = text
|
264 |
-
ans_line = ans.split(
|
265 |
if len(ans_line) != 1:
|
266 |
ans = ans_line[-1].strip()
|
267 |
-
ans = ans.split(
|
268 |
-
if ans.endswith(
|
269 |
ans = ans[:-1]
|
270 |
return ans
|
271 |
|
272 |
|
|
|
273 |
icl_prompts = {
|
274 |
-
|
275 |
|
276 |
Q: Today, Emily went to the museum. Between what times could they have gone?
|
277 |
We know that:
|
@@ -348,7 +465,9 @@ Wake-up time: 5am.
|
|
348 |
3pm-4pm: free.
|
349 |
The beach closure time: 4pm.
|
350 |
The only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D).""",
|
351 |
-
|
|
|
|
|
352 |
|
353 |
Q: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.
|
354 |
Sentence: The chief told the counselor that they took the day off.
|
@@ -385,7 +504,9 @@ Here we need to determine who the pronoun "his" might be referring to. There are
|
|
385 |
Let's consider Y first: "X will plan to meet Y at Y's office." This case makes sense, because X might want to meet up with Y at Y's office.
|
386 |
Now, consider X: "X will plan to meet Y at X's office." This case also makes sense, because X might want to meet up with Y at X's own office.
|
387 |
Because both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C).""",
|
388 |
-
|
|
|
|
|
389 |
|
390 |
Q: Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?
|
391 |
Options:
|
@@ -418,7 +539,9 @@ Options:
|
|
418 |
(F) 12/03/1960
|
419 |
A: Let's think step by step.
|
420 |
If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B).""",
|
421 |
-
|
|
|
|
|
422 |
|
423 |
Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
|
424 |
As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
|
@@ -458,7 +581,9 @@ A: Let's think step by step.
|
|
458 |
(2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
|
459 |
(3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
|
460 |
At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
|
461 |
-
|
|
|
|
|
462 |
|
463 |
Q: Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. We now add a penguin to the table:
|
464 |
James, 12, 90, 12
|
@@ -499,7 +624,9 @@ This question focuses on the name. We know the following: The names of the pengu
|
|
499 |
When we sort their names alphabetically, we get Bernard, Gwen, Louis, Vincent.
|
500 |
The name of the second penguin sorted by alphabetical order is Gwen.
|
501 |
The name of the second penguin sorted by alphabetic order is Gwen. So the answer is (D).""",
|
502 |
-
|
|
|
|
|
503 |
|
504 |
Q: This SVG path element <path d="M 31.00,73.00 L 32.00,59.00 L 44.00,50.00 L 49.00,41.00 L 64.00,37.00 L 71.00,55.00 L 64.00,76.00 L 52.00,61.00 L 31.00,73.00"/> draws a
|
505 |
Options:
|
@@ -577,7 +704,9 @@ Length of side B: |B| = sqrt((37.00-41.00)^2 + (34.00-33.00)^2)) = sqrt((4)^2 +
|
|
577 |
Length of side C: |C| = sqrt((41.00-45.00)^2 + (33.00-34.00)^2)) = sqrt((-4)^2 + (-1)^2) = sqrt(16 + 1) = sqrt(17).
|
578 |
Length of side D: |D| = sqrt((45.00-41.00)^2 + (34.00-43.00)^2)) = sqrt((4)^2 + (-9)^2) = sqrt(16 + 81) = sqrt(97).
|
579 |
Note that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is "kite". So the answer is (D).""",
|
580 |
-
|
|
|
|
|
581 |
|
582 |
According to Cambridge University Dictionary, sarcasm is "the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way." Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.
|
583 |
|
@@ -607,7 +736,9 @@ A: Let's think step by step.
|
|
607 |
If we look at (A), it likens the consistency in the league's punishments with that in politics. Because politics or political affairs are often not considered to be consistent or dependable, this sentence appears to be satirical.
|
608 |
If we look at (B), it likens the consistency in the league's punishments with that in morality. Discussing the consistency of the league's punishments in the context of morality, ethics, or law makes sense and does not appear to make a satirical point about anything.
|
609 |
Above the above, the sarcastic option is (A). So the answer is (A).""",
|
610 |
-
|
|
|
|
|
611 |
|
612 |
Q: Which of the following is a humorous edit of this artist or movie name: 'whitesnake'?
|
613 |
Options:
|
@@ -651,7 +782,9 @@ The original name is "counting crows". This is the name of an American rock band
|
|
651 |
(C) "courting crows": Here the word "counting" is changed to "courting", and "courting" is an actual word; however, "courting crows" does not sound as humorous as "counting cows".
|
652 |
(D) "coutnting crows": Here the word "counting" is changed to "coutnting", but the word "coutnting" is not an actual word.
|
653 |
Above the above, the only humorous edit is (B). So the answer is (B).""",
|
654 |
-
|
|
|
|
|
655 |
|
656 |
Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
|
657 |
As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
|
@@ -691,7 +824,9 @@ A: Let's think step by step.
|
|
691 |
(2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
|
692 |
(3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
|
693 |
At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
|
694 |
-
|
|
|
|
|
695 |
|
696 |
Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
|
697 |
As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
|
@@ -731,7 +866,9 @@ A: Let's think step by step.
|
|
731 |
(2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
|
732 |
(3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
|
733 |
At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
|
734 |
-
|
|
|
|
|
735 |
|
736 |
Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
|
737 |
Options:
|
@@ -768,7 +905,9 @@ A: Let's think step by step.
|
|
768 |
(3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
|
769 |
According to this ordering, the leftmost book is the white book.
|
770 |
The white book is the leftmost. So the answer is (C).""",
|
771 |
-
|
|
|
|
|
772 |
|
773 |
Q: Which sentence has the correct adjective order:
|
774 |
Options:
|
@@ -796,7 +935,9 @@ A: Let's think step by step.
|
|
796 |
When there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: "[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun".
|
797 |
Option (A): "blue gold wonderful square shoe". (1) "blue" falls into the color category. (2) "gold" falls into the material category. (3) "wonderful" falls into the opinion category. (4) "square" falls into the shape category. The adjective order that Option (A) has is [5. color] [7. material] [1. opinion] [4. shape] (or, in numeric terms, 5 7 1 4). Because 5 < 7 < 1 < 4 is not correct, (A) does not have the correct ordering.
|
798 |
Option (B): "wonderful square blue gold shoe". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. color] [7. material] (or, in numeric terms, 1 4 5 7 ). Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B).""",
|
799 |
-
|
|
|
|
|
800 |
|
801 |
Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
|
802 |
Options:
|
@@ -833,7 +974,9 @@ A: Let's think step by step.
|
|
833 |
(3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
|
834 |
According to this ordering, the leftmost book is the white book.
|
835 |
The white book is the leftmost. So the answer is (C).""",
|
836 |
-
|
|
|
|
|
837 |
|
838 |
Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
|
839 |
Options:
|
@@ -870,7 +1013,9 @@ A: Let's think step by step.
|
|
870 |
(3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
|
871 |
According to this ordering, the leftmost book is the white book.
|
872 |
The white book is the leftmost. So the answer is (C).""",
|
873 |
-
|
|
|
|
|
874 |
|
875 |
Q: Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big Lebowski:
|
876 |
Options:
|
@@ -912,7 +1057,9 @@ A: Let's think step by step.
|
|
912 |
- Inside Out (animation, family, comedy; 2015)
|
913 |
- Forrest Gump (comedy, drama, romance; 1994)
|
914 |
These are all famous movies produced in the past few decades.Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. So the answer is (D).""",
|
915 |
-
|
|
|
|
|
916 |
|
917 |
Q: The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. Source: In der Liste der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet.
|
918 |
Translation: In the list of architectural monuments in Lenzen all architectural monuments of the Brandenburg city of Lenzen and its districts are listed.
|
@@ -952,7 +1099,9 @@ Options:
|
|
952 |
(F) Facts
|
953 |
A: Let's think step by step.
|
954 |
We solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is "Łeba is a small town and seaside resort in the Powiat Lęborski of the Polish Pomeranian Voivodeship." On the other hand, the provided translation is "Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland." Note that the provided sentence says, "Łeba is not a small town ..." However, the translation should have been "Łeba is a small town ..." Because a negation is introduced at the beginning of the sentence and has fundamentally changed the meaning of the original source, the translation contains an error pertaining to Negation or Antonyms. So the answer is (C).""",
|
955 |
-
|
|
|
|
|
956 |
|
957 |
Q: On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the stress ball?
|
958 |
Options:
|
@@ -1015,7 +1164,8 @@ A: Let's think step by step.
|
|
1015 |
According to this question, the objects are arranged in a row, from left to right, as follows: (1) a teal plate, (2) a burgundy keychain, (3) a yellow scrunchiephone charger, (4) an orange mug, (5) a pink notebook, (6) a grey cup.
|
1016 |
The teal plate is the first item, namely (1). There is no item to the left of the teal item.
|
1017 |
The number of non-orange items to the left of the teal item is zero. So the answer is (A).""",
|
1018 |
-
|
|
|
1019 |
|
1020 |
Q: ((-5 + 9 * -4 - 0) * (4 + -7 + 0 * -5)) =
|
1021 |
A: Let's think step by step.
|
@@ -1040,7 +1190,9 @@ This equation can be written as "A - B", where A = (-3 + 5 * 8 * -4) and B = (9
|
|
1040 |
Let's calculate A = (-3 + 5 * 8 * -4) = (-3 + (5 * 8) * -4) = (-3 + (40) * -4) = (-3 + (40 * -4)) = (-3 + -160) = -163.
|
1041 |
Let's calculate B = (9 - 8 * -7 + -9) = (9 - (8 * -7) + -9) = (9 - (-56) + -9) = ((9 - (-56)) + -9) = ((65) + -9)= (65 - 9) = 56.
|
1042 |
Then, the final equation is A - B = -163 - 56 = -219. So the answer is -219.""",
|
1043 |
-
|
|
|
|
|
1044 |
|
1045 |
Q: If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.
|
1046 |
Options:
|
@@ -1083,7 +1235,9 @@ We start at the origin (0, 0), facing the positive y-axis.
|
|
1083 |
(3) Take 9 steps left: (0, -7), facing the positive y-axis.
|
1084 |
(4) Take 7 steps right: (0, 7), facing the positive y-axis.
|
1085 |
Since (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes.""",
|
1086 |
-
|
|
|
|
|
1087 |
|
1088 |
Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [
|
1089 |
A: Let's think step by step.
|
@@ -1155,7 +1309,9 @@ We should process each input one by one and keep track of the stack configuratio
|
|
1155 |
Now, we have reached the end. The final stack is "< [ {".
|
1156 |
We will need to pop out "{", "[", "<" one by one in that order.
|
1157 |
So, we need "}", "]", ">". So the answer is } ] >.""",
|
1158 |
-
|
|
|
|
|
1159 |
|
1160 |
Q: Sort the following words alphabetically: List: oven costume counterpart
|
1161 |
A: Let's think step by step.
|
@@ -1172,7 +1328,9 @@ Q: Sort the following words alphabetically: List: newt arson parthia seismograph
|
|
1172 |
A: Let's think step by step.
|
1173 |
The first letter: "newt": "n" (14). "arson": "a" (1). "parthia": "p" (16). "seismography": "s" (19). "mugho": "m" (13). "aspect": "a" (1). "census": "c" (3). We now have: (1) ["arson" ? "aspect"] < (3) "census" < (13) "mugho" < (14) "newt" < (16) "parthia" < (19) "seismography". Now let's sort this subpart ["arson" ? "aspect"] by looking at their second letters.
|
1174 |
The second letter: "arson": "r" (18). "aspect": "s" (19). We now have: (18) "arson" < (19) "aspect". Hence, we have ["arson" < "aspect"] < "census" < "mugho" < "newt" < "parthia" < "seismography". So the answer is arson aspect census mugho newt parthia seismography.""",
|
1175 |
-
|
|
|
|
|
1176 |
|
1177 |
Q: Is the following sentence plausible? "Bam Adebayo scored a reverse layup in the Western Conference Finals."
|
1178 |
A: Let's think step by step. Bam Adebayo is an American basketball player. Scoring a reverse layup in the Western Conference Finals is part of the NBA Finals. So the answer is yes.
|
@@ -1182,7 +1340,9 @@ A: Let's think step by step. Santi Cazorla is a soccer player. Touchdown is part
|
|
1182 |
|
1183 |
Q: Is the following sentence plausible? "DeMar DeRozan was called for the goal tend."
|
1184 |
A: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. So the answer is yes.""",
|
1185 |
-
|
|
|
|
|
1186 |
|
1187 |
Q: not ( ( not not True ) ) is
|
1188 |
A: Let's think step by step.
|
@@ -1205,7 +1365,9 @@ Remember that (i) expressions inside brackets are always evaluated first and tha
|
|
1205 |
We first simplify this expression "Z" as follows: "Z = not not ( not ( False ) ) = not not ( A )" where "A = not ( False )".
|
1206 |
Let's evaluate A: A = not ( False ) = not False = True.
|
1207 |
Plugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True.""",
|
1208 |
-
|
|
|
|
|
1209 |
|
1210 |
Q: I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana, a flute, an orange, and a violin. How many fruits do I have?
|
1211 |
A: Let's think step by step.
|
@@ -1242,7 +1404,9 @@ We first identify the vegetables on the list and include their quantity in paren
|
|
1242 |
- garlic (1)
|
1243 |
- yam (1)
|
1244 |
Now, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8. So the answer is 8.""",
|
1245 |
-
|
|
|
|
|
1246 |
|
1247 |
Q: "It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: To begin with, Lesley is a close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy. It follows that Lesley is a great-grandfather of Leroy."
|
1248 |
Is the argument, given the explicitly stated premises, deductively valid or invalid?
|
@@ -1286,7 +1450,9 @@ By (1), we have if X = infrequent-user(Paul Mitchell), then X = rare-consumer(Ni
|
|
1286 |
The case X = rare-consumer(Nioxin) does not appear in (2).
|
1287 |
The case X = loyal-buyer(Caress) does not appear in (2), either.
|
1288 |
So, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid.""",
|
1289 |
-
|
|
|
|
|
1290 |
|
1291 |
Q: How would a typical person answer each of the following questions about causation?
|
1292 |
Frank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?
|
@@ -1311,7 +1477,7 @@ Options:
|
|
1311 |
- No
|
1312 |
A: Let's think step by step.
|
1313 |
Here in this question, we are told that "He aims the dart at the low point region." A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.""",
|
1314 |
-
|
1315 |
|
1316 |
Q: Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent lies. Does Raymond tell the truth?
|
1317 |
A: Let's think step by step.
|
|
|
2 |
import re
|
3 |
import numpy as np
|
4 |
from typing import Any
|
|
|
|
|
5 |
|
6 |
NUMERIC_IN_EN = r"(?:[\s=+-/<>($:\.\*\\])(?=\S)((?:0|(?:\d{1,3}(?:,\d{3})+(?=\D|$))|(?:\d+))(?:\.\d+)?%?)(?:(?![^\s=+-/>)$:\.\*\\])|(?=, ))"
|
7 |
NUMERIC_IN_ZH = (
|
|
|
9 |
)
|
10 |
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
def extract_choice_zh(gen):
|
13 |
# 答案是A | 选项是A | 应该选A选项
|
14 |
res = re.search(
|
|
|
40 |
def extract_choice(gen):
|
41 |
# answer is A | choice is A | choose A
|
42 |
res = re.search(
|
43 |
+
r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCD]{0,20}?(?:n't|not))[^ABCD]{0,10}?\b(?:|is|:|be))\b)[^ABCD]{0,20}?\b(A|B|C|D)\b",
|
44 |
gen,
|
45 |
)
|
46 |
|
47 |
# A is correct | A is right
|
48 |
if res is None:
|
49 |
res = re.search(
|
50 |
+
r"\b(A|B|C|D)\b(?![^ABCD]{0,8}?(?:n't|not)[^ABCD]{0,5}?(?:correct|right))[^ABCD]{0,10}?\b(?:correct|right)\b",
|
51 |
gen,
|
52 |
)
|
53 |
|
54 |
# straight answer: A
|
55 |
if res is None:
|
56 |
+
res = re.search(r"^(A|B|C|D)(?:\.|,|:|$)", gen)
|
57 |
|
58 |
# simply extract the first letter that appears
|
59 |
if res is None:
|
60 |
+
res = re.search(r"(?<![a-zA-Z])(A|B|C|D)(?![a-zA-Z=])", gen)
|
61 |
|
62 |
if res is None:
|
63 |
+
res = "A"
|
64 |
|
65 |
if isinstance(res, str):
|
66 |
return res
|
|
|
102 |
|
103 |
|
104 |
def remove_boxed(s):
    r"""Return the content wrapped by a leading ``\boxed`` command.

    Two spellings are accepted: the space-delimited ``\boxed <content>`` and
    the brace-delimited ``\boxed{<content>}``.

    Args:
        s: String expected to start with one of the two wrappers.

    Returns:
        The text after ``\boxed `` or between ``\boxed{`` and the final ``}``.

    Raises:
        AssertionError: If *s* does not start with the wrapper, or the brace
            form does not end with ``}``. The exception is raised explicitly
            (not via ``assert``) so the check still runs under ``python -O``.
    """
    spaced = "\\boxed "
    if spaced in s:
        if not s.startswith(spaced):
            raise AssertionError(s)
        return s[len(spaced):]

    braced = "\\boxed{"
    if not s.startswith(braced) or not s.endswith("}"):
        raise AssertionError(s)
    return s[len(braced):-1]
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
|
118 |
def last_boxed_only_string(string):
|
|
|
145 |
return retval
|
146 |
|
147 |
|
148 |
+
def fix_sqrt(string):
    r"""Brace the single-character argument of every bare ``\sqrt``.

    ``\sqrt3`` becomes ``\sqrt{3}``; occurrences already followed by ``{``
    are left alone.

    Args:
        string: LaTeX-like text.

    Returns:
        The text with bare ``\sqrt`` arguments wrapped in braces.
    """
    if "\\sqrt" not in string:
        return string

    head, *tails = string.split("\\sqrt")
    pieces = [head]
    for tail in tails:
        # An empty tail means "\sqrt" ended the string or was immediately
        # followed by another "\sqrt": there is no argument to brace, so the
        # command is kept verbatim instead of indexing into "".
        if tail and tail[0] != "{":
            pieces.append("\\sqrt{" + tail[0] + "}" + tail[1:])
        else:
            pieces.append("\\sqrt" + tail)
    return "".join(pieces)
|
161 |
+
|
162 |
+
|
163 |
+
def remove_right_units(string):
    r"""Drop a trailing unit annotation introduced by ``\text{ ``.

    Per the original note, ``\text{ `` only ever occurs (at least in the
    validation set) when describing units.

    Args:
        string: LaTeX-like text.

    Returns:
        The text before the ``\text{ `` marker, or *string* unchanged when
        the marker is absent.

    Raises:
        AssertionError: If the marker occurs more than once. Raised
            explicitly so the check is not stripped under ``python -O``.
    """
    marker = "\\text{ "
    if marker not in string:
        return string

    splits = string.split(marker)
    if len(splits) != 2:
        raise AssertionError(string)
    return splits[0]
|
171 |
+
|
172 |
+
|
173 |
+
def fix_fracs(string):
    r"""Add the missing braces to shorthand ``\frac`` arguments.

    ``\frac12`` becomes ``\frac{1}{2}`` and ``\frac1{72}`` becomes
    ``\frac{1}{72}``. Occurrences whose first argument is already braced are
    copied through untouched (so ``\frac{72}1`` is not repaired).

    Args:
        string: LaTeX-like text.

    Returns:
        The rewritten text, or the original *string* unchanged when any
        ``\frac`` is followed by fewer than two characters (malformed input).
    """
    head, *tails = string.split("\\frac")
    pieces = [head]
    for tail in tails:
        pieces.append("\\frac")
        if tail.startswith("{"):
            pieces.append(tail)
            continue
        if len(tail) < 2:
            # Not enough characters for numerator and denominator (this also
            # covers a "\frac" at the very end): give up on the rewrite.
            return string
        numerator, second, rest = tail[0], tail[1], tail[2:]
        if second != "{":
            pieces.append("{" + numerator + "}{" + second + "}" + rest)
        else:
            # Denominator is already braced; only the numerator needs braces.
            pieces.append("{" + numerator + "}" + second + rest)
    return "".join(pieces)
|
203 |
+
|
204 |
+
|
205 |
+
def fix_a_slash_b(string):
    r"""Rewrite a plain integer ratio ``a/b`` as ``\frac{a}{b}``.

    Args:
        string: Candidate text.

    Returns:
        ``\frac{a}{b}`` when *string* is exactly two canonical integers
        separated by a single ``/``; otherwise *string* unchanged.
    """
    parts = string.split("/")
    if len(parts) != 2:
        return string

    try:
        a = int(parts[0])
        b = int(parts[1])
    except ValueError:
        return string

    # Round-trip check rejects padded or decorated forms such as "01/2" or
    # " 1/2". An explicit comparison is used so it still runs under
    # ``python -O``.
    if string != "{}/{}".format(a, b):
        return string
    return "\\frac{" + str(a) + "}{" + str(b) + "}"
|
218 |
+
|
219 |
+
|
220 |
+
def strip_string(string):
    r"""Normalize a LaTeX-like answer string for string comparison.

    The steps are applied in order: drop line breaks and ``\!``; collapse
    ``\\`` to ``\``; map ``tfrac``/``dfrac`` to ``frac``; drop ``\left`` and
    ``\right``, degree marks, ``\$``, trailing units and ``\%``; add a
    leading zero to bare decimals; drop a short ``x =`` prefix; brace
    ``\sqrt`` arguments; remove spaces; brace ``\frac`` arguments; and map
    ``0.5`` to ``\frac{1}{2}``.

    Args:
        string: Raw answer text.

    Returns:
        The normalized text (the empty string stays empty).

    Raises:
        AssertionError: Propagated from ``remove_right_units``.
    """
    # linebreaks
    string = string.replace("\n", "")

    # remove inverse spaces
    string = string.replace("\\!", "")

    # replace \\ with \
    string = string.replace("\\\\", "\\")

    # replace tfrac and dfrac with frac
    string = string.replace("tfrac", "frac")
    string = string.replace("dfrac", "frac")

    # remove \left and \right
    string = string.replace("\\left", "")
    string = string.replace("\\right", "")

    # remove circ (degrees)
    string = string.replace("^{\\circ}", "")
    string = string.replace("^\\circ", "")

    # remove dollar signs
    string = string.replace("\\$", "")

    # remove units (on the right)
    string = remove_right_units(string)

    # remove percentage; the former second call used the invalid escape
    # "\%", which is the same two characters as "\\%", so one call suffices
    string = string.replace("\\%", "")

    # " 0." is equivalent to " ." and "{0." to "{."
    string = string.replace(" .", " 0.")
    string = string.replace("{.", "{0.")

    # if empty, return empty string
    if len(string) == 0:
        return string
    # add "0" if "." is the start of the string
    if string[0] == ".":
        string = "0" + string

    # get rid of a short left-hand side such as "k = " or "q = "
    sides = string.split("=")
    if len(sides) == 2 and len(sides[0]) <= 2:
        string = sides[1]

    # fix sqrt3 --> sqrt{3}
    string = fix_sqrt(string)

    # remove spaces
    string = string.replace(" ", "")

    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc.
    string = fix_fracs(string)

    # manually change 0.5 --> \frac{1}{2}
    if string == "0.5":
        string = "\\frac{1}{2}"

    # NOTE: X/Y is changed to \frac{X}{Y} in the dataset; the matching
    # fix_a_slash_b conversion of model output is intentionally not applied.
    return string
|
283 |
+
|
284 |
+
|
285 |
+
def get_answer(string):
    """Extract the content of the last boxed expression in *string*.

    Args:
        string: Text to search.

    Returns:
        ``remove_boxed(last_boxed_only_string(string))``, or *string* itself
        when that extraction raises any ``Exception``.
    """
    try:
        return remove_boxed(last_boxed_only_string(string))
    except Exception:
        # Deliberate best-effort: fall back to the raw text when no boxed
        # answer can be extracted.
        return string
|
292 |
+
|
293 |
+
|
294 |
+
def is_equiv(str1, str2, verbose=False):
    """Compare two answer strings after normalization.

    Args:
        str1: First answer, or ``None``.
        str2: Second answer, or ``None``.
        verbose: When true, print both normalized strings before comparing.

    Returns:
        ``False`` if either argument is ``None`` (a warning is printed when
        both are). Otherwise whether the ``strip_string``-normalized values
        are equal, falling back to comparing the raw values when
        normalization raises.
    """
    if str1 is None or str2 is None:
        if str1 is None and str2 is None:
            print("WARNING: Both None")
        return False

    try:
        ss1 = strip_string(str1)
        ss2 = strip_string(str2)
        if verbose:
            print(ss1, ss2)
        return ss1 == ss2
    except Exception:
        # Deliberate best-effort: any normalization failure degrades to a
        # plain equality check on the inputs.
        return str1 == str2
|
309 |
|
310 |
def first_option_postprocess(text: str, options: str) -> str:
|
311 |
"""Find first valid option for text."""
|
312 |
|
313 |
patterns = [
|
314 |
+
f'[Tt]he answer is [{options}]',
|
315 |
+
f'[Tt]he correct answer\s?(?:option)?\s?is [{options}]', # noqa
|
316 |
+
f'答案(?:选项)?是(.*?)[{options}]',
|
317 |
+
f'答案(?:选项)?为(.*?)[{options}]',
|
318 |
+
f'答案(?:选项)?选(.*?)[{options}]',
|
319 |
+
f'选项[{options}]是?正确',
|
320 |
+
f'选项[{options}]为?正确',
|
321 |
+
f'固选(.*?)[{options}]',
|
322 |
+
f'答案应该是(.*?)[{options}]',
|
323 |
+
f'(\s|^)[{options}][\s。,,\.$]', # noqa
|
324 |
+
f'[{options}]',
|
325 |
]
|
326 |
|
327 |
regexes = [re.compile(pattern) for pattern in patterns]
|
|
|
332 |
for i in options:
|
333 |
if i in outputs:
|
334 |
return i
|
335 |
+
return ''
|
336 |
|
337 |
|
338 |
def first_capital_postprocess(text: str) -> str:
    """Return the first uppercase character of *text*, or ``''`` if none."""
    return next((char for char in text if char.isupper()), "")
|
343 |
|
344 |
|
345 |
def general_postprocess(text: str) -> str:
    """Reduce free-form text to a short normalized answer.

    Keeps only the part before the first newline, period, or comma, then
    removes punctuation, the articles "a"/"an"/"the" (case-insensitive), and
    redundant whitespace.

    Args:
        text: Raw generated text.

    Returns:
        The cleaned first segment of *text*.
    """
    # Cut off at the first newline, period, or comma. ``maxsplit`` is passed
    # by keyword because passing it positionally is deprecated.
    truncated_text = re.split(r"[\n.,]", text, maxsplit=1)[0]

    # Remove punctuation
    no_punctuation = re.sub(r"[^\w\s]", "", truncated_text)

    # Remove articles
    no_articles = re.sub(
        r"\b(a|an|the)\b", "", no_punctuation, flags=re.IGNORECASE
    )

    # Collapse duplicated blank spaces
    return re.sub(r"\s+", " ", no_articles).strip()
|
362 |
|
363 |
+
|
364 |
def bbh_mcq_postprocess(text: str) -> str:
    """Extract a multiple-choice answer and format it as ``"(X)"``.

    If *text* contains ``"answer is "``, only the part after its last
    occurrence is examined. A parenthesized capital such as ``(C)`` is
    preferred, then the first capital letter; if neither is found the
    remaining text itself is wrapped in parentheses.

    Args:
        text: Raw generated text.

    Returns:
        The chosen option in the form ``"(X)"``.
    """
    ans = text
    ans_line = ans.split("answer is ")
    if len(ans_line) != 1:
        ans = ans_line[-1].strip()

    # First pattern: "(" + capital, with any number of closing ")".
    # Second pattern: any bare capital letter.
    for pattern in (r"\(([A-Z])\)*", r"([A-Z])"):
        match = re.search(pattern, ans)
        if match:
            return f"({match.group(1)})"
    return f"({ans})"
|
|
|
377 |
|
378 |
def bbh_freeform_postprocess(text: str) -> str:
    """Extract a free-form answer from ``text``.

    If ``text`` contains ``'answer is '``, the stripped text after its last
    occurrence is used; otherwise ``text`` is used unchanged. The result is
    then cut at the first newline, and one trailing period is removed if
    present.
    """
    _, marker, tail = text.rpartition('answer is ')
    answer = tail.strip() if marker else text

    first_line = answer.partition('\n')[0]
    if first_line.endswith('.'):
        return first_line[:-1]
    return first_line
|
387 |
|
388 |
|
389 |
+
|
390 |
icl_prompts = {
|
391 |
+
'temporal_sequences': """Task description: Answer questions about which times certain events could have occurred.
|
392 |
|
393 |
Q: Today, Emily went to the museum. Between what times could they have gone?
|
394 |
We know that:
|
|
|
465 |
3pm-4pm: free.
|
466 |
The beach closure time: 4pm.
|
467 |
The only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D).""",
|
468 |
+
|
469 |
+
|
470 |
+
'disambiguation_qa': """Clarify the meaning of sentences with ambiguous pronouns.
|
471 |
|
472 |
Q: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.
|
473 |
Sentence: The chief told the counselor that they took the day off.
|
|
|
504 |
Let's consider Y first: "X will plan to meet Y at Y's office." This case makes sense, because X might want to meet up with Y at Y's office.
|
505 |
Now, consider X: "X will plan to meet Y at X's office." This case also makes sense, because X might want to meet up with Y at X's own office.
|
506 |
Because both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C).""",
|
507 |
+
|
508 |
+
|
509 |
+
'date_understanding': """Infer the date from context.
|
510 |
|
511 |
Q: Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?
|
512 |
Options:
|
|
|
539 |
(F) 12/03/1960
|
540 |
A: Let's think step by step.
|
541 |
If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B).""",
|
542 |
+
|
543 |
+
|
544 |
+
'tracking_shuffled_objects_three_objects': """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
|
545 |
|
546 |
Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
|
547 |
As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
|
|
|
581 |
(2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
|
582 |
(3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
|
583 |
At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
|
584 |
+
|
585 |
+
|
586 |
+
'penguins_in_a_table': """Answer questions about a table of penguins and their attributes.
|
587 |
|
588 |
Q: Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. We now add a penguin to the table:
|
589 |
James, 12, 90, 12
|
|
|
624 |
When we sort their names alphabetically, we get Bernard, Gwen, Louis, Vincent.
|
625 |
The name of the second penguin sorted by alphabetical order is Gwen.
|
626 |
The name of the second penguin sorted by alphabetic order is Gwen. So the answer is (D).""",
|
627 |
+
|
628 |
+
|
629 |
+
'geometric_shapes': """Name geometric shapes from their SVG paths.
|
630 |
|
631 |
Q: This SVG path element <path d="M 31.00,73.00 L 32.00,59.00 L 44.00,50.00 L 49.00,41.00 L 64.00,37.00 L 71.00,55.00 L 64.00,76.00 L 52.00,61.00 L 31.00,73.00"/> draws a
|
632 |
Options:
|
|
|
704 |
Length of side C: |C| = sqrt((41.00-45.00)^2 + (33.00-34.00)^2)) = sqrt((-4)^2 + (-1)^2) = sqrt(16 + 1) = sqrt(17).
|
705 |
Length of side D: |D| = sqrt((45.00-41.00)^2 + (34.00-43.00)^2)) = sqrt((4)^2 + (-9)^2) = sqrt(16 + 81) = sqrt(97).
|
706 |
Note that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is "kite". So the answer is (D).""",
|
707 |
+
|
708 |
+
|
709 |
+
'snarks': """Determine which of two sentences is sarcastic.
|
710 |
|
711 |
According to Cambridge University Dictionary, sarcasm is "the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way." Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.
|
712 |
|
|
|
736 |
If we look at (A), it likens the consistency in the league's punishments with that in politics. Because politics or political affairs are often not considered to be consistent or dependable, this sentence appears to be satirical.
|
737 |
If we look at (B), it likens the consistency in the league's punishments with that in morality. Discussing the consistency of the league's punishments in the context of morality, ethics, or law makes sense and does not appear to make a satirical point about anything.
|
738 |
Above the above, the sarcastic option is (A). So the answer is (A).""",
|
739 |
+
|
740 |
+
|
741 |
+
'ruin_names': """Select the humorous edit that 'ruins' the input movie or musical artist name.
|
742 |
|
743 |
Q: Which of the following is a humorous edit of this artist or movie name: 'whitesnake'?
|
744 |
Options:
|
|
|
782 |
(C) "courting crows": Here the word "counting" is changed to "courting", and "courting" is an actual word; however, "courting crows" does not sound as humorous as "counting cows".
|
783 |
(D) "coutnting crows": Here the word "counting" is changed to "coutnting", but the word "coutnting" is not an actual word.
|
784 |
Above the above, the only humorous edit is (B). So the answer is (B).""",
|
785 |
+
|
786 |
+
|
787 |
+
'tracking_shuffled_objects_seven_objects': """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
|
788 |
|
789 |
Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
|
790 |
As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
|
|
|
824 |
(2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
|
825 |
(3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
|
826 |
At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
|
827 |
+
|
828 |
+
|
829 |
+
'tracking_shuffled_objects_five_objects': """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
|
830 |
|
831 |
Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
|
832 |
As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
|
|
|
866 |
(2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
|
867 |
(3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
|
868 |
At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
|
869 |
+
|
870 |
+
|
871 |
+
'logical_deduction_three_objects': """A logical deduction task which requires deducing the order of a sequence of objects.
|
872 |
|
873 |
Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
|
874 |
Options:
|
|
|
905 |
(3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
|
906 |
According to this ordering, the leftmost book is the white book.
|
907 |
The white book is the leftmost. So the answer is (C).""",
|
908 |
+
|
909 |
+
|
910 |
+
'hyperbaton': """Order adjectives correctly in English sentences.
|
911 |
|
912 |
Q: Which sentence has the correct adjective order:
|
913 |
Options:
|
|
|
935 |
When there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: "[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun".
|
936 |
Option (A): "blue gold wonderful square shoe". (1) "blue" falls into the color category. (2) "gold" falls into the material category. (3) "wonderful" falls into the opinion category. (4) "square" falls into the shape category. The adjective order that Option (A) has is [5. color] [7. material] [1. opinion] [4. shape] (or, in numeric terms, 5 7 1 4). Because 5 < 7 < 1 < 4 is not correct, (A) does not have the correct ordering.
|
937 |
Option (B): "wonderful square blue gold shoe". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. color] [7. material] (or, in numeric terms, 1 4 5 7 ). Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B).""",
|
938 |
+
|
939 |
+
|
940 |
+
'logical_deduction_five_objects': """A logical deduction task which requires deducing the order of a sequence of objects.
|
941 |
|
942 |
Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
|
943 |
Options:
|
|
|
974 |
(3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
|
975 |
According to this ordering, the leftmost book is the white book.
|
976 |
The white book is the leftmost. So the answer is (C).""",
|
977 |
+
|
978 |
+
|
979 |
+
'logical_deduction_seven_objects': """A logical deduction task which requires deducing the order of a sequence of objects.
|
980 |
|
981 |
Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
|
982 |
Options:
|
|
|
1013 |
(3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
|
1014 |
According to this ordering, the leftmost book is the white book.
|
1015 |
The white book is the leftmost. So the answer is (C).""",
|
1016 |
+
|
1017 |
+
|
1018 |
+
'movie_recommendation': """Recommend movies similar to the given list of movies.
|
1019 |
|
1020 |
Q: Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big Lebowski:
|
1021 |
Options:
|
|
|
1057 |
- Inside Out (animation, family, comedy; 2015)
|
1058 |
- Forrest Gump (comedy, drama, romance; 1994)
|
1059 |
These are all famous movies produced in the past few decades.Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. So the answer is (D).""",
|
1060 |
+
|
1061 |
+
|
1062 |
+
'salient_translation_error_detection': """Detect the type of error in an English translation of a German source sentence.
|
1063 |
|
1064 |
Q: The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. Source: In der Liste der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet.
|
1065 |
Translation: In the list of architectural monuments in Lenzen all architectural monuments of the Brandenburg city of Lenzen and its districts are listed.
|
|
|
1099 |
(F) Facts
|
1100 |
A: Let's think step by step.
|
1101 |
We solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is "Łeba is a small town and seaside resort in the Powiat Lęborski of the Polish Pomeranian Voivodeship." On the other hand, the provided translation is "Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland." Note that the provided sentence says, "Łeba is not a small town ..." However, the translation should have been "Łeba is a small town ..." Because a negation is introduced at the beginning of the sentence and has fundamentally changed the meaning of the original source, the translation contains an error pertaining to Negation or Antonyms. So the answer is (C).""",
|
1102 |
+
|
1103 |
+
|
1104 |
+
'reasoning_about_colored_objects': """Answer extremely simple questions about the colors of objects on a surface.
|
1105 |
|
1106 |
Q: On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the stress ball?
|
1107 |
Options:
|
|
|
1164 |
According to this question, the objects are arranged in a row, from left to right, as follows: (1) a teal plate, (2) a burgundy keychain, (3) a yellow scrunchiephone charger, (4) an orange mug, (5) a pink notebook, (6) a grey cup.
|
1165 |
The teal plate is the first item, namely (1). There is no item to the left of the teal item.
|
1166 |
The number of non-orange items to the left of the teal item is zero. So the answer is (A).""",
|
1167 |
+
|
1168 |
+
'multistep_arithmetic_two': """Solve multi-step arithmetic problems.
|
1169 |
|
1170 |
Q: ((-5 + 9 * -4 - 0) * (4 + -7 + 0 * -5)) =
|
1171 |
A: Let's think step by step.
|
|
|
1190 |
Let's calculate A = (-3 + 5 * 8 * -4) = (-3 + (5 * 8) * -4) = (-3 + (40) * -4) = (-3 + (40 * -4)) = (-3 + -160) = -163.
|
1191 |
Let's calculate B = (9 - 8 * -7 + -9) = (9 - (8 * -7) + -9) = (9 - (-56) + -9) = ((9 - (-56)) + -9) = ((65) + -9)= (65 - 9) = 56.
|
1192 |
Then, the final equation is A - B = -163 - 56 = -219. So the answer is -219.""",
|
1193 |
+
|
1194 |
+
|
1195 |
+
'navigate': """Given a series of navigation instructions, determine whether one would end up back at the starting point.
|
1196 |
|
1197 |
Q: If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.
|
1198 |
Options:
|
|
|
1235 |
(3) Take 9 steps left: (0, -7), facing the positive y-axis.
|
1236 |
(4) Take 7 steps right: (0, 7), facing the positive y-axis.
|
1237 |
Since (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes.""",
|
1238 |
+
|
1239 |
+
|
1240 |
+
'dyck_languages': """Correctly close a Dyck-n word.
|
1241 |
|
1242 |
Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [
|
1243 |
A: Let's think step by step.
|
|
|
1309 |
Now, we have reached the end. The final stack is "< [ {".
|
1310 |
We will need to pop out "{", "[", "<" one by one in that order.
|
1311 |
So, we need "}", "]", ">". So the answer is } ] >.""",
|
1312 |
+
|
1313 |
+
|
1314 |
+
'word_sorting': """Sort a list of words.
|
1315 |
|
1316 |
Q: Sort the following words alphabetically: List: oven costume counterpart
|
1317 |
A: Let's think step by step.
|
|
|
1328 |
A: Let's think step by step.
|
1329 |
The first letter: "newt": "n" (14). "arson": "a" (1). "parthia": "p" (16). "seismography": "s" (19). "mugho": "m" (13). "aspect": "a" (1). "census": "c" (3). We now have: (1) ["arson" ? "aspect"] < (3) "census" < (13) "mugho" < (14) "newt" < (16) "parthia" < (19) "seismography". Now let's sort this subpart ["arson" ? "aspect"] by looking at their second letters.
|
1330 |
The second letter: "arson": "r" (18). "aspect": "s" (19). We now have: (18) "arson" < (19) "aspect". Hence, we have ["arson" < "aspect"] < "census" < "mugho" < "newt" < "parthia" < "seismography". So the answer is arson aspect census mugho newt parthia seismography.""",
|
1331 |
+
|
1332 |
+
|
1333 |
+
'sports_understanding': """Determine whether an artificially constructed sentence relating to sports is plausible or not.
|
1334 |
|
1335 |
Q: Is the following sentence plausible? "Bam Adebayo scored a reverse layup in the Western Conference Finals."
|
1336 |
A: Let's think step by step. Bam Adebayo is an American basketball player. Scoring a reverse layup in the Western Conference Finals is part of the NBA Finals. So the answer is yes.
|
|
|
1340 |
|
1341 |
Q: Is the following sentence plausible? "DeMar DeRozan was called for the goal tend."
|
1342 |
A: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. So the answer is yes.""",
|
1343 |
+
|
1344 |
+
|
1345 |
+
'boolean_expressions': """Evaluate the result of a random Boolean expression.
|
1346 |
|
1347 |
Q: not ( ( not not True ) ) is
|
1348 |
A: Let's think step by step.
|
|
|
1365 |
We first simplify this expression "Z" as follows: "Z = not not ( not ( False ) ) = not not ( A )" where "A = not ( False )".
|
1366 |
Let's evaluate A: A = not ( False ) = not False = True.
|
1367 |
Plugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True.""",
|
1368 |
+
|
1369 |
+
|
1370 |
+
'object_counting': """Questions that involve enumerating objects and asking the model to count them.
|
1371 |
|
1372 |
Q: I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana, a flute, an orange, and a violin. How many fruits do I have?
|
1373 |
A: Let's think step by step.
|
|
|
1404 |
- garlic (1)
|
1405 |
- yam (1)
|
1406 |
Now, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8. So the answer is 8.""",
|
1407 |
+
|
1408 |
+
|
1409 |
+
'formal_fallacies': """Distinguish deductively valid arguments from formal fallacies.
|
1410 |
|
1411 |
Q: "It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: To begin with, Lesley is a close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy. It follows that Lesley is a great-grandfather of Leroy."
|
1412 |
Is the argument, given the explicitly stated premises, deductively valid or invalid?
|
|
|
1450 |
The case X = rare-consumer(Nioxin) does not appear in (2).
|
1451 |
The case X = loyal-buyer(Caress) does not appear in (2), either.
|
1452 |
So, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid.""",
|
1453 |
+
|
1454 |
+
|
1455 |
+
'causal_judgement': """Answer questions about causal attribution.
|
1456 |
|
1457 |
Q: How would a typical person answer each of the following questions about causation?
|
1458 |
Frank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?
|
|
|
1477 |
- No
|
1478 |
A: Let's think step by step.
|
1479 |
Here in this question, we are told that "He aims the dart at the low point region." A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.""",
|
1480 |
+
'web_of_lies': """Evaluate a random boolean function expressed as a word problem.
|
1481 |
|
1482 |
Q: Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent lies. Does Raymond tell the truth?
|
1483 |
A: Let's think step by step.
|