Files changed (7) hide show
  1. .gitignore +1 -5
  2. README.md +4 -9
  3. index.html +0 -0
  4. pyproject.toml +0 -14
  5. tasks.py +164 -188
  6. tlem.py +53 -142
  7. utils.py +278 -112
.gitignore CHANGED
@@ -1,6 +1,2 @@
1
  __pycache__
2
- *.ju.py
3
- tests
4
-
5
- README_files
6
- .ipynb_checkpoints
 
1
  __pycache__
2
+ tlem.ju.py
 
 
 
 
README.md CHANGED
@@ -1,10 +1,5 @@
1
- ---
2
- title: TLEM
3
- emoji: 🐢
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: static
7
- pinned: false
8
- ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
+
3
+ # Transparent LLMs Evaluation Metrics
4
+
5
+ > LLMs belong to *tout le monde*
index.html CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml DELETED
@@ -1,14 +0,0 @@
1
- [tool.poetry]
2
- name = "tlem"
3
- version = "0.1.0"
4
- description = ""
5
- authors = ["fecet <[email protected]>"]
6
- readme = "README.md"
7
-
8
- [tool.poetry.dependencies]
9
- python = "^3.10"
10
-
11
-
12
- [build-system]
13
- requires = ["poetry-core"]
14
- build-backend = "poetry.core.masonry.api"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tasks.py CHANGED
@@ -14,7 +14,6 @@ from .utils import *
14
  from evaluate import load
15
  from collections import defaultdict
16
  import sys
17
- from pathlib import Path
18
 
19
 
20
  # if sys.version_info >= (3, 9):
@@ -59,15 +58,13 @@ class Task:
59
  dataset_name: str | tuple[str, str] = ("gsm8k", "main")
60
  split: str = "test"
61
  # metrics: list[str] = field(default_factory=list)
62
- metric_name: str | tuple[str, str] = ("sustech/tlem", "mmlu")
63
  input_column: str = "question"
64
  label_column: str = ""
65
- output_column: str = "generated_text"
66
  prompt: Optional[Callable | str] = None
67
  few_shot: int = 0
68
  few_shot_from: Optional[str] = None
69
  # results: dict[str, Any] = field(default_factory=dict)
70
- # outputs: Optional[list] = field(default_factory=list)
71
 
72
  def __post_init__(self):
73
  names = (
@@ -75,7 +72,7 @@ class Task:
75
  if isinstance(self.dataset_name, str)
76
  else list(self.dataset_name)
77
  )
78
- names[0] = Path(names[0]).name
79
 
80
  self.name = "-".join(names) + f"-{self.split}"
81
  if isinstance(self.prompt, str):
@@ -87,21 +84,10 @@ class Task:
87
  }
88
  self.label_column = self.label_column or self.input_column
89
 
90
- def __eq__(self, __value: object) -> bool:
91
- return self.name == __value.name
92
-
93
  @cached_property
94
  def samples(self):
95
  return self.dataset[self.input_column]
96
 
97
- @cached_property
98
- def labels(self):
99
- return self.dataset[self.label_column]
100
-
101
- @cached_property
102
- def outputs(self):
103
- return self.dataset[self.output_column]
104
-
105
  @cached_property
106
  def dataset(self):
107
  ds = (
@@ -132,7 +118,6 @@ class Task:
132
  shots = shots.map(
133
  lambda example: {
134
  self.input_column: example[self.input_column]
135
- + "\n"
136
  + example[self.label_column],
137
  }
138
  )[self.input_column]
@@ -155,37 +140,33 @@ class Task:
155
  if isinstance(self.metric_name, str)
156
  else load(*self.metric_name)
157
  )
158
- return metric._compute
159
-
160
- @property
161
- def result(self) -> dict:
162
- assert self.outputs, "Please run the task first."
163
- results = self.metric(self.outputs, self.labels)
164
- # logging.info(f"{self.name}:{results}")
165
- return results
166
 
 
167
  def run(
168
  self,
169
  pipeline,
170
  ):
171
- if self.output_column not in self.dataset.column_names:
172
- self.dataset = self.dataset.add_column(
173
- self.output_column, pipeline(self.samples)
174
- )
175
-
176
- return self.result
177
-
178
- async def arun(self, pipeline):
179
- self.dataset = self.dataset.add_column(
180
- self.output_column, await pipeline(self.samples)
181
- )
182
-
183
- return self.result
 
 
 
 
 
184
 
185
- def save(self, path):
186
- self.dataset.select_columns(
187
- [self.input_column, self.output_column, self.label_column]
188
- ).save_to_disk(path)
189
 
190
 
191
  def multichoice(responses: Any, references: list[str]):
@@ -209,8 +190,10 @@ def multichoice_zh(responses: Any, references: list[str]):
209
  class Metrics:
210
  cmmlu = multichoice_zh
211
  mmlu = multichoice
212
- truthful_qa_mc1 = multichoice
213
- ceval = multichoice_zh
 
 
214
 
215
  def winogrande(responses: list[str], answers: list[str | int]):
216
  responses = [first_option_postprocess(pred, options="AB") for pred in responses]
@@ -235,11 +218,18 @@ class Metrics:
235
  return responses, answers
236
 
237
  def drop(responses: list[str], answers: list[list]):
238
- scores = []
239
- for pred, ans in zip(responses, answers):
240
- score = np.mean([1 if a in pred else 0 for a in ans])
241
- scores.append(score)
242
- return {"em": np.mean(scores)}
 
 
 
 
 
 
 
243
 
244
  def bbh_mcq(responses: list[str], answers: list[str | int]):
245
  if len(responses) != len(answers):
@@ -270,25 +260,17 @@ class Metrics:
270
 
271
  return responses, answers
272
 
273
- def boolq(responses: list[str], answers: list[str | int]):
274
-
275
- responses = [first_capital_postprocess(response) for response in responses]
276
- answers = ["A" if answer else "B" for answer in answers]
277
-
278
- return responses, answers
279
-
280
  def MATH(responses: list[str], answers: list[str]):
281
- extract_responses = sync_pipe(get_answer)(responses)
282
- extract_answers = sync_pipe(get_answer)(answers)
283
- try:
284
- from math_equivalence import is_equiv
285
- except ImportError as e:
286
- logging.error(
287
- "math_equivalence not installed, pip install git+https://github.com/hendrycks/math.git"
288
- )
289
- raise e
290
-
291
- return sync_pipe(is_equiv)(zip(extract_responses, extract_answers))
292
 
293
 
294
  class CMMLU:
@@ -301,7 +283,7 @@ class CMMLU:
301
  for choice in list("ABCD"):
302
  prompt += f"\n{choice}. {example[choice]}"
303
 
304
- prompt += "\n答案:"
305
  return {"prompt": prompt}
306
 
307
  subcategories = {
@@ -398,6 +380,7 @@ class CMMLU:
398
  ],
399
  "Other": ["other"],
400
  "China specific": ["china specific"],
 
401
  }
402
 
403
  @classmethod
@@ -624,29 +607,73 @@ class DROP:
624
  input_column = "input"
625
  label_column = "answers"
626
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
627
  @classmethod
628
  def prompt_drop(cls, example):
629
- prompt = f"Read the following passage and answer the question.\n\n{example['passage']}\n\nQuestion: {example['question']}"
 
 
630
 
631
- return {
632
- cls.input_column: prompt,
633
- cls.label_column: ",".join(example["answers_spans"]["spans"]),
634
- }
 
 
 
 
 
 
 
 
 
 
 
 
635
 
636
  @classmethod
637
  def suite(
638
  cls,
639
  ):
640
- return Task(
641
- "drop",
642
- metric_name=("sustech/tlem", "drop"),
643
- input_column=cls.input_column,
644
- label_column=cls.label_column,
645
- prompt=partial(cls.prompt_drop),
646
- few_shot=3,
647
- few_shot_from="train",
648
- split="validation",
649
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
650
 
651
 
652
  class HellaSwag:
@@ -717,17 +744,28 @@ class ARC:
717
 
718
  @classmethod
719
  def suite(cls):
720
- suite = [
721
- Task(
722
- ("ai2_arc", subset),
723
- metric_name=("sustech/tlem", "arc"),
724
- input_column=cls.input_column,
725
- label_column=cls.label_column,
726
- prompt=partial(cls.prompt_arc),
727
- few_shot=0,
 
 
 
 
 
 
 
 
 
 
 
 
 
728
  )
729
- for subset in cls.categories
730
- ]
731
 
732
  return suite
733
 
@@ -789,107 +827,45 @@ class BBH:
789
  def suite(
790
  cls,
791
  ):
792
- suite = []
793
- for cate in cls.bbh_multiple_choice_sets:
794
- suite.append(
795
- Task(
796
- ("lukaemon/bbh", cate),
797
- metric_name=("sustech/tlem", "bbh_mcq"),
798
- input_column=cls.input_column,
799
- label_column=cls.label_column,
800
- prompt=partial(cls.prompt_bbh, category=cate),
801
- few_shot=0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
802
  )
803
- )
804
- for cate in cls.bbh_free_form_sets:
805
- suite.append(
806
- Task(
807
- ("lukaemon/bbh", cate),
808
- metric_name=("sustech/tlem", "bbh_freefrom"),
809
- input_column=cls.input_column,
810
- label_column=cls.label_column,
811
- prompt=partial(cls.prompt_bbh, category=cate),
812
- few_shot=0,
813
  )
814
- )
815
-
816
- return suite
817
-
818
-
819
- class BoolQ:
820
- input_column = "input"
821
- label_column = "answer"
822
-
823
- @classmethod
824
- def prompt_boolq(cls, example, chat=False):
825
-
826
- prompt = f"{example['passage']}\nQuestion: {example['question']}\nA. Yes\nB. No\nAnswer: "
827
-
828
- return {"input": prompt}
829
-
830
- @classmethod
831
- def suite(cls, chat: bool):
832
-
833
- suite = [
834
- Task(
835
- dataset_name="boolq",
836
- metric_name=("sustech/tlem", "boolq"),
837
- input_column=cls.input_column,
838
- label_column=cls.label_column,
839
- prompt=partial(cls.prompt_boolq, chat=chat),
840
- few_shot=0 if chat else 5,
841
- few_shot_from="train",
842
- split="validation",
843
- )
844
- ]
845
 
846
  return suite
847
-
848
- class TruthfulQAMC1:
849
- input_column = "input"
850
- label_column = "answer"
851
-
852
- @classmethod
853
- def prompt_truthful_qa(cls, example):
854
-
855
- target = example["mc1_targets"]
856
- choices = target["choices"]
857
- labels = target["labels"]
858
-
859
- prompt = f"The following is a multiple-choice question. Please choose the most suitable one as the answer to this question.\n\n"
860
- prompt += example["question"]
861
-
862
- answer = []
863
-
864
- for idx, choice, label in zip(list("ABCDEFGHIJ")[:len(choices)], choices, labels):
865
-
866
- prompt += f"\n{idx}. {choice}"
867
-
868
- if label == 1:
869
- answer = idx
870
-
871
- prompt += "\nAnswer: "
872
-
873
- return {
874
- "input": prompt,
875
- "answer": answer
876
- }
877
-
878
- @classmethod
879
- def suite(cls):
880
- suite = [
881
- Task(
882
- dataset_name=("truthful_qa", "multiple_choice"),
883
- metric_name=("sustech/tlem", "truthful_qa_mc1"),
884
- input_column=cls.input_column,
885
- label_column=cls.label_column,
886
- prompt=partial(cls.prompt_truthful_qa),
887
- few_shot=0,
888
- split="validation",
889
- )
890
- ]
891
 
892
- return suite
893
 
894
  class CEVAL:
895
  input_column = "input"
 
14
  from evaluate import load
15
  from collections import defaultdict
16
  import sys
 
17
 
18
 
19
  # if sys.version_info >= (3, 9):
 
58
  dataset_name: str | tuple[str, str] = ("gsm8k", "main")
59
  split: str = "test"
60
  # metrics: list[str] = field(default_factory=list)
61
+ metric_name: str | tuple[str, str] = ("sustech/tlem", "gsm8k")
62
  input_column: str = "question"
63
  label_column: str = ""
 
64
  prompt: Optional[Callable | str] = None
65
  few_shot: int = 0
66
  few_shot_from: Optional[str] = None
67
  # results: dict[str, Any] = field(default_factory=dict)
 
68
 
69
  def __post_init__(self):
70
  names = (
 
72
  if isinstance(self.dataset_name, str)
73
  else list(self.dataset_name)
74
  )
75
+ names[0] = names[0].split("/")[-1]
76
 
77
  self.name = "-".join(names) + f"-{self.split}"
78
  if isinstance(self.prompt, str):
 
84
  }
85
  self.label_column = self.label_column or self.input_column
86
 
 
 
 
87
  @cached_property
88
  def samples(self):
89
  return self.dataset[self.input_column]
90
 
 
 
 
 
 
 
 
 
91
  @cached_property
92
  def dataset(self):
93
  ds = (
 
118
  shots = shots.map(
119
  lambda example: {
120
  self.input_column: example[self.input_column]
 
121
  + example[self.label_column],
122
  }
123
  )[self.input_column]
 
140
  if isinstance(self.metric_name, str)
141
  else load(*self.metric_name)
142
  )
143
+ return metric
 
 
 
 
 
 
 
144
 
145
+ # @cache
146
  def run(
147
  self,
148
  pipeline,
149
  ):
150
+ if (outputs := pipeline(self.samples)) is None:
151
+ logging.warning("pipeline returns None")
152
+ return
153
+ self.outputs = outputs
154
+ try:
155
+ try:
156
+ result = self.metric._compute(
157
+ responses=outputs, references=self.dataset[self.label_column]
158
+ )
159
+ except Exception as e:
160
+ result = self.metric.compute(
161
+ responses=outputs, references=self.dataset[self.label_column]
162
+ )
163
+ except Exception as e:
164
+ result = outputs
165
+ # if log:
166
+ # name = name or pipeline.__name__
167
+ # self.results[name] = result
168
 
169
+ return result
 
 
 
170
 
171
 
172
  def multichoice(responses: Any, references: list[str]):
 
190
  class Metrics:
191
  cmmlu = multichoice_zh
192
  mmlu = multichoice
193
+
194
+ def ceval(responses: list[str], answers: list[str | int]):
195
+ responses = [extract_choice_zh(pred) for pred in responses]
196
+ return responses, answers
197
 
198
  def winogrande(responses: list[str], answers: list[str | int]):
199
  responses = [first_option_postprocess(pred, options="AB") for pred in responses]
 
218
  return responses, answers
219
 
220
  def drop(responses: list[str], answers: list[list]):
221
+ if len(responses) != len(answers):
222
+ return {"error": "predictions and references have different " "length"}
223
+ responses = [general_postprocess(pred) for pred in responses]
224
+ processed_answers = [[general_postprocess(j) for j in i] for i in answers]
225
+ matched_answers = []
226
+ for pred, ans, origin_ans in zip(responses, processed_answers, answers):
227
+ if pred in ans or pred in origin_ans:
228
+ matched_answers.append(pred)
229
+ else:
230
+ matched_answers.append(ans[0])
231
+
232
+ return responses, matched_answers
233
 
234
  def bbh_mcq(responses: list[str], answers: list[str | int]):
235
  if len(responses) != len(answers):
 
260
 
261
  return responses, answers
262
 
 
 
 
 
 
 
 
263
  def MATH(responses: list[str], answers: list[str]):
264
+ extract_responses = []
265
+ for response in responses:
266
+ indices = [pos for pos, char in enumerate(response) if char == "$"]
267
+ if len(indices) <= 2:
268
+ ans = ""
269
+ else:
270
+ ans = response[indices[-2] + 1 : indices[-1]]
271
+ extract_responses.append(strip_string(ans))
272
+ extract_answers = [strip_string(get_answer(answer)) for answer in answers]
273
+ return extract_responses, extract_answers
 
274
 
275
 
276
  class CMMLU:
 
283
  for choice in list("ABCD"):
284
  prompt += f"\n{choice}. {example[choice]}"
285
 
286
+ prompt += "\n答案:"
287
  return {"prompt": prompt}
288
 
289
  subcategories = {
 
380
  ],
381
  "Other": ["other"],
382
  "China specific": ["china specific"],
383
+ "Test": ["computer science"],
384
  }
385
 
386
  @classmethod
 
607
  input_column = "input"
608
  label_column = "answers"
609
 
610
+ icl_prompt = """\
611
+ Text: In the county, the population was spread out with 23.50% under the age of 18, 8.70% from 18 to 24, 29.70% from 25 to 44, 24.70% from 45 to 64, and 13.30% who were 65 years of age or older.
612
+ Question: How many more percent are under the age of 18 compared to the 18 to 24 group?
613
+ Anawer: According to the text, 23.5% are under the age of 18, and 8.7% are from ages 18 to 24. 23.5%-8.7%=14.8%. So the answer is 14.8.
614
+
615
+ Text: Playing in their second straight Thanksgiving game, the Eagles struggled especially on defense, where they were unable to stop the much-hyped Lions offense. The worst of it all was how unproven rookie Eric Rowe was tasked with covering wide receiver Calvin Johnson, leading to Johnson catching 3 touchdowns. Stafford’s five passing touchdowns, including three of them to Johnson was too much for the Eagles to overcome and for the second consecutive time this season, the Eagles gave up 45 points in a game. With the loss, the Eagles drop to 4-7 on the season and 6-1 when playing on Thanksgiving.
616
+ Question: How many TD passes did Stafford throw other than to Johnson?
617
+ Anawer: According to the text, Stafford threw 5 TD passes, 3 of which were to Johnson. 5-3=2. So the answer is 2.
618
+
619
+ Text: [PROMPT]
620
+ Question: [QUESTION]
621
+ Anawer:"""
622
+
623
+ categories = ["validation"]
624
+
625
  @classmethod
626
  def prompt_drop(cls, example):
627
+ prompt = cls.icl_prompt.replace("[PROMPT]", example["passage"]).replace(
628
+ "[QUESTION]", example["question"]
629
+ )
630
 
631
+ validated_answers = example["answers_spans"]["spans"]
632
+ validated_types = example["answers_spans"]["types"]
633
+ answers = []
634
+ for answer_item, answer_type in zip(validated_answers, validated_types):
635
+ # if answer_type == "number":
636
+ # answers.append(answer_item)
637
+ # elif any(answer_item['date'][i] for i in ['day', 'month', 'year']):
638
+ # d = [answer_item['date'][i] for i in ['day', 'month', 'year']]
639
+ # answers.append(' '.join(d).strip())
640
+ # else:
641
+ # for span in answer_item['spans']:
642
+ # answers.append(span)
643
+ answers.append(answer_item)
644
+ answers = list(set(answers))
645
+
646
+ return {cls.input_column: prompt, cls.label_column: answers}
647
 
648
  @classmethod
649
  def suite(
650
  cls,
651
  ):
652
+ finer_categories = (
653
+ pd.Series(cls.categories) # noqa # type: ignore
654
+ .explode()
655
+ .reset_index()
656
+ .set_index(0)
657
+ .groupby(0)
658
+ .agg(list)["index"]
659
+ .to_dict()
 
660
  )
661
+ suite = defaultdict(list)
662
+ categories = list(finer_categories.keys())
663
+ for cate in categories:
664
+ suite[cate].append(
665
+ Task(
666
+ ("drop", cate),
667
+ metric_name=("sustech/tlem", "drop"),
668
+ input_column=cls.input_column,
669
+ label_column=cls.label_column,
670
+ prompt=partial(cls.prompt_drop),
671
+ few_shot=0,
672
+ split="validation",
673
+ )
674
+ )
675
+
676
+ return suite
677
 
678
 
679
  class HellaSwag:
 
744
 
745
  @classmethod
746
  def suite(cls):
747
+ finer_categories = (
748
+ pd.Series(cls.categories) # noqa # type: ignore
749
+ .explode()
750
+ .reset_index()
751
+ .set_index(0)
752
+ .groupby(0)
753
+ .agg(list)["index"]
754
+ .to_dict()
755
+ )
756
+ suite = defaultdict(list)
757
+ categories = list(finer_categories.keys())
758
+ for cate in categories:
759
+ suite[cate].append(
760
+ Task(
761
+ ("ai2_arc", cate),
762
+ metric_name=("sustech/tlem", "arc"),
763
+ input_column=cls.input_column,
764
+ label_column=cls.label_column,
765
+ prompt=partial(cls.prompt_arc),
766
+ few_shot=0,
767
+ )
768
  )
 
 
769
 
770
  return suite
771
 
 
827
  def suite(
828
  cls,
829
  ):
830
+ finer_categories = (
831
+ pd.Series(
832
+ cls.bbh_free_form_sets + cls.bbh_multiple_choice_sets
833
+ ) # noqa # type: ignore
834
+ .explode()
835
+ .reset_index()
836
+ .set_index(0)
837
+ .groupby(0)
838
+ .agg(list)["index"]
839
+ .to_dict()
840
+ )
841
+ suite = defaultdict(list)
842
+ categories = list(finer_categories.keys())
843
+ for cate in categories:
844
+ if cate in cls.bbh_multiple_choice_sets:
845
+ suite[cate].append(
846
+ Task(
847
+ ("lukaemon/bbh", cate),
848
+ metric_name=("sustech/tlem", "bbh_mcq"),
849
+ input_column=cls.input_column,
850
+ label_column=cls.label_column,
851
+ prompt=partial(cls.prompt_bbh, category=cate),
852
+ few_shot=0,
853
+ )
854
  )
855
+ else:
856
+ suite[cate].append(
857
+ Task(
858
+ ("lukaemon/bbh", cate),
859
+ metric_name=("sustech/tlem", "bbh_freefrom"),
860
+ input_column=cls.input_column,
861
+ label_column=cls.label_column,
862
+ prompt=partial(cls.prompt_bbh, category=cate),
863
+ few_shot=0,
864
+ )
865
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
866
 
867
  return suite
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
868
 
 
869
 
870
  class CEVAL:
871
  input_column = "input"
tlem.py CHANGED
@@ -1,6 +1,12 @@
1
- import logging
 
 
 
 
 
2
 
3
  from typing import Any, Optional, Protocol, Iterable, Callable
 
4
  from tqdm.auto import tqdm
5
  from evaluate.evaluation_suite import EvaluationSuite
6
  import evaluate
@@ -8,10 +14,7 @@ import numpy as np
8
  import datasets
9
  import pandas as pd
10
  from .tasks import *
11
- from .utils import *
12
- from itertools import chain
13
- from copy import deepcopy
14
- from . import utils
15
 
16
 
17
  class ReasoningMetric(evaluate.Metric):
@@ -43,60 +46,32 @@ class ReasoningMetric(evaluate.Metric):
43
  reference_urls=["http://path.to.reference.url/new_module"],
44
  )
45
 
46
- def _compute(self, responses, references):
47
- return_value = getattr(Metrics, self.config_name)(responses, references)
48
- match return_value:
49
- case extract_responses, extract_references:
50
- results = {
51
- self.config_name: np.mean(
52
- sync_pipe(lambda x, y: x == y)(
53
- zip(extract_responses, extract_references)
54
- )
55
- )
56
- }
57
- case dict():
58
- results = return_value
59
-
60
- case list():
61
- results = {self.config_name: np.mean(return_value)}
62
-
63
- case _:
64
- raise NotImplementedError
65
-
 
66
  return results
67
 
68
 
69
  class Suite(EvaluationSuite):
70
  task_class = Task
71
- utils = utils
72
- supported_datasets = [
73
- "arc",
74
- "hellaswag",
75
- "mmlu-chat",
76
- "winogrande",
77
- "gsm8k",
78
- "cmmlu-chat",
79
- "ceval-chat",
80
- "bbh",
81
- "drop",
82
- "MATH",
83
- ]
84
-
85
- def __getitem__(self, key) -> Task:
86
- match key:
87
- case str():
88
- return self.suite[key]
89
- case slice() | int():
90
- return self.tasks[key]
91
-
92
- def agg(self, suite):
93
- for cate, tasks in suite.items():
94
- if isinstance(tasks, dict):
95
- suite[cate] = self.agg(tasks)
96
- else:
97
- suite[cate] = np.mean([pd.Series(task.result).mean() for task in tasks])
98
-
99
- return suite
100
 
101
  def run(
102
  self,
@@ -104,26 +79,28 @@ class Suite(EvaluationSuite):
104
  ) -> dict[str, float]:
105
  self.assert_suite_nonempty()
106
 
107
- self.suite: dict[str, list[Task]]
108
- for task in (bar := tqdm(self.tasks)):
109
- bar.desc = f"complete {task.name}."
110
- _ = task.run(model_or_pipeline)
111
- logging.info(f"{task.name} {task.result=}")
112
- return self.agg(deepcopy(self.suite))
 
113
 
114
- def arun(self, model_or_pipeline):
115
- async def sync_function():
116
- return await tqdm.gather(
117
- *[task.arun(model_or_pipeline) for task in self.tasks], leave=False
118
- )
 
119
 
120
- asyncio.run(sync_function())
121
 
122
- return self.agg(deepcopy(self.suite))
 
123
 
124
- def get_suite(self, name) -> dict[str, Task]:
125
  chat = False
126
- suite={}
127
  match name:
128
  case _ if "chat" in name:
129
  chat = True
@@ -132,8 +109,6 @@ class Suite(EvaluationSuite):
132
  suite = MMLU.suite(chat=chat)
133
  case _ if name.startswith("cmmlu"):
134
  suite = CMMLU.suite(chat=chat)
135
- case _ if name.startswith("ceval"):
136
- suite = CEVAL.suite(chat=chat)
137
  case "gsm8k":
138
  suite = Task(
139
  dataset_name=("gsm8k", "main"),
@@ -151,10 +126,8 @@ class Suite(EvaluationSuite):
151
  suite = DROP.suite()
152
  case "winogrande":
153
  suite = Winogrande.suite()
154
- case "truthfulqa_mc1":
155
- suite = TruthfulQAMC1.suite()
156
- case _ if name.startswith("boolq"):
157
- suite = BoolQ.suite(chat=chat)
158
  case "mt_bench":
159
  suite = Task(
160
  dataset_name="SUSTech/mt_bench_judge",
@@ -165,78 +138,16 @@ class Suite(EvaluationSuite):
165
  case "MATH" | "competition_math":
166
  suite = Task(
167
  dataset_name="hendrycks/competition_math",
168
- prompt="This is a math problem, please think step by step and slove it: {input_column}. Simplify your final answer as much as possible and surround them with '$' in TeX form.",
 
169
  metric_name=("sustech/tlem", "MATH"),
170
  input_column="problem",
171
  label_column="solution",
172
  )
173
 
174
- case "open-leaderboard":
175
- for name in [
176
- "arc",
177
- "hellaswag",
178
- "mmlu-chat",
179
- "winogrande",
180
- "gsm8k",
181
- # "truthful_qa",
182
- "drop",
183
- ]:
184
- suite.update(self.get_suite(name))
185
- case "tlem":
186
- for name in [
187
- "arc",
188
- "hellaswag",
189
- "mmlu-chat",
190
- "winogrande",
191
- "gsm8k",
192
- # "truthful_qa",
193
- "cmmlu-chat",
194
- "ceval-chat",
195
- "bbh",
196
- ]:
197
- suite.update(self.get_suite(name))
198
-
199
- case "all":
200
- for name in self.supported_datasets:
201
- suite.update(self.get_suite(name))
202
- case _:
203
- raise NotImplementedError(
204
- f"{name} is not supported in {self.supported_datasets}"
205
- )
206
-
207
- if isinstance(suite, Task):
208
- suite = [suite]
209
- suite = {name: suite}
210
-
211
- return suite
212
-
213
- def singleton(self, task):
214
- try:
215
- return self.tasks[self.tasks.index(task)]
216
- except ValueError:
217
- logging.debug(f"add {task.name} to suite.")
218
- self.tasks.append(task)
219
- logging.debug(self.tasks)
220
- return self.tasks[-1]
221
-
222
- def drop_duplicates(self, suite):
223
- for category, tasks in suite.items():
224
- match tasks:
225
- case list():
226
- suite[category] = [self.singleton(task) for task in tasks]
227
- case dict():
228
- suite[category] = self.drop_duplicates(tasks)
229
- case _:
230
- raise NotImplementedError
231
- return suite
232
-
233
- def load(self, name):
234
- sub_suite = self.get_suite(name)
235
- self.suite.update(sub_suite)
236
- self.suite = self.drop_duplicates(self.suite)
237
- # return self
238
 
239
  def __init__(self, name="tlem"):
240
  super().__init__(name)
241
- self.tasks = []
242
- self.suite = {}
 
1
+ # %%
2
+
3
+ try:
4
+ from ipytorch import logging
5
+ except Exception as e:
6
+ import logging
7
 
8
  from typing import Any, Optional, Protocol, Iterable, Callable
9
+ from numpy.lib import extract
10
  from tqdm.auto import tqdm
11
  from evaluate.evaluation_suite import EvaluationSuite
12
  import evaluate
 
14
  import datasets
15
  import pandas as pd
16
  from .tasks import *
17
+ from .utils import is_equiv
 
 
 
18
 
19
 
20
  class ReasoningMetric(evaluate.Metric):
 
46
  reference_urls=["http://path.to.reference.url/new_module"],
47
  )
48
 
49
+ def _compute(self, responses, references, verbose=False):
50
+ extract_responses, extract_references = getattr(Metrics, self.config_name)(
51
+ responses, references
52
+ )
53
+ df = pd.DataFrame(
54
+ {
55
+ "responses": responses,
56
+ "references": references,
57
+ }
58
+ )
59
+ df["extract_responses"] = extract_responses
60
+ df["extract_references"] = extract_references
61
+ # print(df)
62
+ results = {
63
+ "Accuracy": (df["extract_references"] == df["extract_responses"])
64
+ .astype(int)
65
+ .mean(),
66
+ }
67
+ logging.info(results)
68
+ if verbose:
69
+ results["df"] = df
70
  return results
71
 
72
 
73
  class Suite(EvaluationSuite):
74
  task_class = Task
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  def run(
77
  self,
 
79
  ) -> dict[str, float]:
80
  self.assert_suite_nonempty()
81
 
82
+ def run_tasks(tasks):
83
+ for task in (bar := tqdm(tasks, leave=False)):
84
+ bar.desc = f"complete {task.name}."
85
+ if task.name not in self.cached_result:
86
+ self.cached_result[task.name] = task.run(model_or_pipeline)
87
+ results = [self.cached_result[task.name] for task in tasks]
88
+ return pd.DataFrame(results).mean().to_dict()
89
 
90
+ if isinstance(self.suite, dict):
91
+ for category, tasks in (bar := tqdm(self.suite.items())):
92
+ bar.desc = f"complete {category}."
93
+ logging.warning(f"Combined results {category}: {run_tasks(tasks)}")
94
+ else:
95
+ logging.warning(f"Combined results: {run_tasks(self.suite)}")
96
 
97
+ return self.cached_result
98
 
99
+ def add(self, name):
100
+ self.load(name)
101
 
102
+ def load(self, name):
103
  chat = False
 
104
  match name:
105
  case _ if "chat" in name:
106
  chat = True
 
109
  suite = MMLU.suite(chat=chat)
110
  case _ if name.startswith("cmmlu"):
111
  suite = CMMLU.suite(chat=chat)
 
 
112
  case "gsm8k":
113
  suite = Task(
114
  dataset_name=("gsm8k", "main"),
 
126
  suite = DROP.suite()
127
  case "winogrande":
128
  suite = Winogrande.suite()
129
+ case _ if name.startswith("ceval"):
130
+ suite = CEVAL.suite(chat=chat)
 
 
131
  case "mt_bench":
132
  suite = Task(
133
  dataset_name="SUSTech/mt_bench_judge",
 
138
  case "MATH" | "competition_math":
139
  suite = Task(
140
  dataset_name="hendrycks/competition_math",
141
+ split="test",
142
+ prompt="This is a math problem, please think step by step and slove it: {input_column}",
143
  metric_name=("sustech/tlem", "MATH"),
144
  input_column="problem",
145
  label_column="solution",
146
  )
147
 
148
+ self.suite = [suite] if isinstance(suite, Task) else suite
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  def __init__(self, name="tlem"):
151
  super().__init__(name)
152
+ self.cached_result = {}
153
+ self.suite = []
utils.py CHANGED
@@ -2,8 +2,6 @@ import logging
2
  import re
3
  import numpy as np
4
  from typing import Any
5
- from tqdm.auto import tqdm
6
- import asyncio
7
 
8
  NUMERIC_IN_EN = r"(?:[\s=+-/<>($:\.\*\\])(?=\S)((?:0|(?:\d{1,3}(?:,\d{3})+(?=\D|$))|(?:\d+))(?:\.\d+)?%?)(?:(?![^\s=+-/>)$:\.\*\\])|(?=, ))"
9
  NUMERIC_IN_ZH = (
@@ -11,38 +9,6 @@ NUMERIC_IN_ZH = (
11
  )
12
 
13
 
14
- def async_pipe(func):
15
- async def sync_function(samples):
16
- if not isinstance(samples, list):
17
- samples = [samples]
18
- return await tqdm.gather(*[func(sample) for sample in samples], leave=False)
19
-
20
- def sync_func(samples):
21
- return asyncio.run(sync_function(samples))
22
-
23
- return sync_func
24
-
25
-
26
- def sync_pipe(func, progress=False):
27
- def sync_func(samples):
28
- return [
29
- func(*sample) if isinstance(sample, tuple) else func(sample)
30
- for sample in tqdm(samples, disable=not progress, leave=False)
31
- ]
32
-
33
- return sync_func
34
-
35
-
36
- def asis_backup(func):
37
- def wrapper(sample):
38
- try:
39
- return func(sample)
40
- except Exception:
41
- return sample
42
-
43
- return wrapper
44
-
45
-
46
  def extract_choice_zh(gen):
47
  # 答案是A | 选项是A | 应该选A选项
48
  res = re.search(
@@ -74,27 +40,27 @@ def extract_choice_zh(gen):
74
  def extract_choice(gen):
75
  # answer is A | choice is A | choose A
76
  res = re.search(
77
- r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCDEFGHIJKL]{0,20}?(?:n't|not))[^ABCDEFGHIJKL]{0,10}?\b(?:|is|:|be))\b)[^ABCDEFGHIJKL]{0,20}?\b(A|B|C|D|E|F|G|H|I|J|K|L)\b",
78
  gen,
79
  )
80
 
81
  # A is correct | A is right
82
  if res is None:
83
  res = re.search(
84
- r"\b(A|B|C|D|E|F|G|H|I|J|K|L)\b(?![^ABCDEFGHIJKL]{0,8}?(?:n't|not)[^ABCDEFGHIJKL]{0,5}?(?:correct|right))[^ABCDEFGHIJKL]{0,10}?\b(?:correct|right)\b",
85
  gen,
86
  )
87
 
88
  # straight answer: A
89
  if res is None:
90
- res = re.search(r"^(A|B|C|D|E|F|G|H|I|J|K|L)(?:\.|,|:|$)", gen)
91
 
92
  # simply extract the first appearred letter
93
  if res is None:
94
- res = re.search(r"(?<![a-zA-Z])(A|B|C|D|E|F|G|H|I|J|K|L)(?![a-zA-Z=])", gen)
95
 
96
  if res is None:
97
- res = "L"
98
 
99
  if isinstance(res, str):
100
  return res
@@ -136,20 +102,17 @@ def extract_numeric(string, pattern=NUMERIC_IN_EN) -> str:
136
 
137
 
138
  def remove_boxed(s):
139
- try:
140
- if (left := "\\boxed ") in s:
141
- assert s[: len(left)] == left, s
142
- return s[len(left) :]
143
- elif (left := "\\boxed{") in s:
144
- assert s[: len(left)] == left, s
145
- return s[len(left) : -1]
 
 
146
 
147
- elif (left := "\\fbox{") in s:
148
- assert s[: len(left)] == left, s
149
- return s[len(left) : -1]
150
- raise ValueError(f"Cannot remove boxed from {s}")
151
- except AssertionError:
152
- return s
153
 
154
 
155
  def last_boxed_only_string(string):
@@ -182,33 +145,183 @@ def last_boxed_only_string(string):
182
  return retval
183
 
184
 
185
- @asis_backup
186
- def get_answer(string):
187
- if boxed := last_boxed_only_string(string):
188
- return remove_boxed(boxed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  else:
190
- indices = [pos for pos, char in enumerate(string) if char == "$"]
191
- if len(indices) < 2:
192
- return extract_numeric(string)
193
- string = string[indices[-2] + 1 : indices[-1]]
194
- return string.split("=")[-1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
  def first_option_postprocess(text: str, options: str) -> str:
198
  """Find first valid option for text."""
199
 
200
  patterns = [
201
- f"[Tt]he answer is [{options}]",
202
- f"[Tt]he correct answer\s?(?:option)?\s?is [{options}]", # noqa
203
- f"答案(?:选项)?是(.*?)[{options}]",
204
- f"答案(?:选项)?为(.*?)[{options}]",
205
- f"答案(?:选项)?选(.*?)[{options}]",
206
- f"选项[{options}]是?正确",
207
- f"选项[{options}]为?正确",
208
- f"固选(.*?)[{options}]",
209
- f"答案应该是(.*?)[{options}]",
210
- f"(\s|^)[{options}][\s。,,\.$]", # noqa
211
- f"[{options}]",
212
  ]
213
 
214
  regexes = [re.compile(pattern) for pattern in patterns]
@@ -219,41 +332,44 @@ def first_option_postprocess(text: str, options: str) -> str:
219
  for i in options:
220
  if i in outputs:
221
  return i
222
- return ""
223
 
224
 
225
  def first_capital_postprocess(text: str) -> str:
226
  for t in text:
227
  if t.isupper():
228
  return t
229
- return ""
230
 
231
 
232
  def general_postprocess(text: str) -> str:
233
  # Cut off the first newline, period, or comma
234
- truncated_text = re.split(r"[\n.,]", text, 1)[0]
235
 
236
  # Remove punctuation
237
- no_punctuation = re.sub(r"[^\w\s]", "", truncated_text)
238
 
239
  # Remove article
240
- no_articles = re.sub(r"\b(a|an|the)\b", "", no_punctuation, flags=re.IGNORECASE)
 
 
 
241
 
242
  # Remove duplicated blank spaces
243
- cleaned_text = re.sub(r"\s+", " ", no_articles).strip()
244
 
245
  return cleaned_text
246
 
247
-
248
  def bbh_mcq_postprocess(text: str) -> str:
249
  ans = text
250
- ans_line = ans.split("answer is ")
251
  if len(ans_line) != 1:
252
  ans = ans_line[-1].strip()
253
- match = re.search(r"\(([A-Z])\)*", ans)
254
  if match:
255
  return f"({match.group(1)})"
256
- match = re.search(r"([A-Z])", ans)
257
  if match:
258
  return f"({match.group(1)})"
259
  return f"({ans})"
@@ -261,17 +377,18 @@ def bbh_mcq_postprocess(text: str) -> str:
261
 
262
  def bbh_freeform_postprocess(text: str) -> str:
263
  ans = text
264
- ans_line = ans.split("answer is ")
265
  if len(ans_line) != 1:
266
  ans = ans_line[-1].strip()
267
- ans = ans.split("\n")[0]
268
- if ans.endswith("."):
269
  ans = ans[:-1]
270
  return ans
271
 
272
 
 
273
  icl_prompts = {
274
- "temporal_sequences": """Task description: Answer questions about which times certain events could have occurred.
275
 
276
  Q: Today, Emily went to the museum. Between what times could they have gone?
277
  We know that:
@@ -348,7 +465,9 @@ Wake-up time: 5am.
348
  3pm-4pm: free.
349
  The beach closure time: 4pm.
350
  The only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D).""",
351
- "disambiguation_qa": """Clarify the meaning of sentences with ambiguous pronouns.
 
 
352
 
353
  Q: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.
354
  Sentence: The chief told the counselor that they took the day off.
@@ -385,7 +504,9 @@ Here we need to determine who the pronoun "his" might be referring to. There are
385
  Let's consider Y first: "X will plan to meet Y at Y's office." This case makes sense, because X might want to meet up with Y at Y's office.
386
  Now, consider X: "X will plan to meet Y at X's office." This case also makes sense, because X might want to meet up with Y at X's own office.
387
  Because both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C).""",
388
- "date_understanding": """Infer the date from context.
 
 
389
 
390
  Q: Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?
391
  Options:
@@ -418,7 +539,9 @@ Options:
418
  (F) 12/03/1960
419
  A: Let's think step by step.
420
  If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B).""",
421
- "tracking_shuffled_objects_three_objects": """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
 
 
422
 
423
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
424
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
@@ -458,7 +581,9 @@ A: Let's think step by step.
458
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
459
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
460
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
461
- "penguins_in_a_table": """Answer questions about a table of penguins and their attributes.
 
 
462
 
463
  Q: Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. We now add a penguin to the table:
464
  James, 12, 90, 12
@@ -499,7 +624,9 @@ This question focuses on the name. We know the following: The names of the pengu
499
  When we sort their names alphabetically, we get Bernard, Gwen, Louis, Vincent.
500
  The name of the second penguin sorted by alphabetical order is Gwen.
501
  The name of the second penguin sorted by alphabetic order is Gwen. So the answer is (D).""",
502
- "geometric_shapes": """Name geometric shapes from their SVG paths.
 
 
503
 
504
  Q: This SVG path element <path d="M 31.00,73.00 L 32.00,59.00 L 44.00,50.00 L 49.00,41.00 L 64.00,37.00 L 71.00,55.00 L 64.00,76.00 L 52.00,61.00 L 31.00,73.00"/> draws a
505
  Options:
@@ -577,7 +704,9 @@ Length of side B: |B| = sqrt((37.00-41.00)^2 + (34.00-33.00)^2)) = sqrt((4)^2 +
577
  Length of side C: |C| = sqrt((41.00-45.00)^2 + (33.00-34.00)^2)) = sqrt((-4)^2 + (-1)^2) = sqrt(16 + 1) = sqrt(17).
578
  Length of side D: |D| = sqrt((45.00-41.00)^2 + (34.00-43.00)^2)) = sqrt((4)^2 + (-9)^2) = sqrt(16 + 81) = sqrt(97).
579
  Note that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is "kite". So the answer is (D).""",
580
- "snarks": """Determine which of two sentences is sarcastic.
 
 
581
 
582
  According to Cambridge University Dictionary, sarcasm is "the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way." Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.
583
 
@@ -607,7 +736,9 @@ A: Let's think step by step.
607
  If we look at (A), it likens the consistency in the league's punishments with that in politics. Because politics or political affairs are often not considered to be consistent or dependable, this sentence appears to be satirical.
608
  If we look at (B), it likens the consistency in the league's punishments with that in morality. Discussing the consistency of the league's punishments in the context of morality, ethics, or law makes sense and does not appear to make a satirical point about anything.
609
  Above the above, the sarcastic option is (A). So the answer is (A).""",
610
- "ruin_names": """Select the humorous edit that 'ruins' the input movie or musical artist name.
 
 
611
 
612
  Q: Which of the following is a humorous edit of this artist or movie name: 'whitesnake'?
613
  Options:
@@ -651,7 +782,9 @@ The original name is "counting crows". This is the name of an American rock band
651
  (C) "courting crows": Here the word "counting" is changed to "courting", and "courting" is an actual word; however, "courting crows" does not sound as humorous as "counting cows".
652
  (D) "coutnting crows": Here the word "counting" is changed to "coutnting", but the word "coutnting" is not an actual word.
653
  Above the above, the only humorous edit is (B). So the answer is (B).""",
654
- "tracking_shuffled_objects_seven_objects": """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
 
 
655
 
656
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
657
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
@@ -691,7 +824,9 @@ A: Let's think step by step.
691
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
692
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
693
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
694
- "tracking_shuffled_objects_five_objects": """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
 
 
695
 
696
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
697
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
@@ -731,7 +866,9 @@ A: Let's think step by step.
731
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
732
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
733
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
734
- "logical_deduction_three_objects": """A logical deduction task which requires deducing the order of a sequence of objects.
 
 
735
 
736
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
737
  Options:
@@ -768,7 +905,9 @@ A: Let's think step by step.
768
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
769
  According to this ordering, the leftmost book is the white book.
770
  The white book is the leftmost. So the answer is (C).""",
771
- "hyperbaton": """Order adjectives correctly in English sentences.
 
 
772
 
773
  Q: Which sentence has the correct adjective order:
774
  Options:
@@ -796,7 +935,9 @@ A: Let's think step by step.
796
  When there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: "[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun".
797
  Option (A): "blue gold wonderful square shoe". (1) "blue" falls into the color category. (2) "gold" falls into the material category. (3) "wonderful" falls into the opinion category. (4) "square" falls into the shape category. The adjective order that Option (A) has is [5. color] [7. material] [1. opinion] [4. shape] (or, in numeric terms, 5 7 1 4). Because 5 < 7 < 1 < 4 is not correct, (A) does not have the correct ordering.
798
  Option (B): "wonderful square blue gold shoe". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. color] [7. material] (or, in numeric terms, 1 4 5 7 ). Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B).""",
799
- "logical_deduction_five_objects": """A logical deduction task which requires deducing the order of a sequence of objects.
 
 
800
 
801
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
802
  Options:
@@ -833,7 +974,9 @@ A: Let's think step by step.
833
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
834
  According to this ordering, the leftmost book is the white book.
835
  The white book is the leftmost. So the answer is (C).""",
836
- "logical_deduction_seven_objects": """A logical deduction task which requires deducing the order of a sequence of objects.
 
 
837
 
838
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
839
  Options:
@@ -870,7 +1013,9 @@ A: Let's think step by step.
870
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
871
  According to this ordering, the leftmost book is the white book.
872
  The white book is the leftmost. So the answer is (C).""",
873
- "movie_recommendation": """Recommend movies similar to the given list of movies.
 
 
874
 
875
  Q: Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big Lebowski:
876
  Options:
@@ -912,7 +1057,9 @@ A: Let's think step by step.
912
  - Inside Out (animation, family, comedy; 2015)
913
  - Forrest Gump (comedy, drama, romance; 1994)
914
  These are all famous movies produced in the past few decades.Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. So the answer is (D).""",
915
- "salient_translation_error_detection": """Detect the type of error in an English translation of a German source sentence.
 
 
916
 
917
  Q: The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. Source: In der Liste der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet.
918
  Translation: In the list of architectural monuments in Lenzen all architectural monuments of the Brandenburg city of Lenzen and its districts are listed.
@@ -952,7 +1099,9 @@ Options:
952
  (F) Facts
953
  A: Let's think step by step.
954
  We solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is "Łeba is a small town and seaside resort in the Powiat Lęborski of the Polish Pomeranian Voivodeship." On the other hand, the provided translation is "Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland." Note that the provided sentence says, "Łeba is not a small town ..." However, the translation should have been "Łeba is a small town ..." Because a negation is introduced at the beginning of the sentence and has fundamentally changed the meaning of the original source, the translation contains an error pertaining to Negation or Antonyms. So the answer is (C).""",
955
- "reasoning_about_colored_objects": """Answer extremely simple questions about the colors of objects on a surface.
 
 
956
 
957
  Q: On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the stress ball?
958
  Options:
@@ -1015,7 +1164,8 @@ A: Let's think step by step.
1015
  According to this question, the objects are arranged in a row, from left to right, as follows: (1) a teal plate, (2) a burgundy keychain, (3) a yellow scrunchiephone charger, (4) an orange mug, (5) a pink notebook, (6) a grey cup.
1016
  The teal plate is the first item, namely (1). There is no item to the left of the teal item.
1017
  The number of non-orange items to the left of the teal item is zero. So the answer is (A).""",
1018
- "multistep_arithmetic_two": """Solve multi-step arithmetic problems.
 
1019
 
1020
  Q: ((-5 + 9 * -4 - 0) * (4 + -7 + 0 * -5)) =
1021
  A: Let's think step by step.
@@ -1040,7 +1190,9 @@ This equation can be written as "A - B", where A = (-3 + 5 * 8 * -4) and B = (9
1040
  Let's calculate A = (-3 + 5 * 8 * -4) = (-3 + (5 * 8) * -4) = (-3 + (40) * -4) = (-3 + (40 * -4)) = (-3 + -160) = -163.
1041
  Let's calculate B = (9 - 8 * -7 + -9) = (9 - (8 * -7) + -9) = (9 - (-56) + -9) = ((9 - (-56)) + -9) = ((65) + -9)= (65 - 9) = 56.
1042
  Then, the final equation is A - B = -163 - 56 = -219. So the answer is -219.""",
1043
- "navigate": """Given a series of navigation instructions, determine whether one would end up back at the starting point.
 
 
1044
 
1045
  Q: If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.
1046
  Options:
@@ -1083,7 +1235,9 @@ We start at the origin (0, 0), facing the positive y-axis.
1083
  (3) Take 9 steps left: (0, -7), facing the positive y-axis.
1084
  (4) Take 7 steps right: (0, 7), facing the positive y-axis.
1085
  Since (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes.""",
1086
- "dyck_languages": """Correctly close a Dyck-n word.
 
 
1087
 
1088
  Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [
1089
  A: Let's think step by step.
@@ -1155,7 +1309,9 @@ We should process each input one by one and keep track of the stack configuratio
1155
  Now, we have reached the end. The final stack is "< [ {".
1156
  We will need to pop out "{", "[", "<" one by one in that order.
1157
  So, we need "}", "]", ">". So the answer is } ] >.""",
1158
- "word_sorting": """Sort a list of words.
 
 
1159
 
1160
  Q: Sort the following words alphabetically: List: oven costume counterpart
1161
  A: Let's think step by step.
@@ -1172,7 +1328,9 @@ Q: Sort the following words alphabetically: List: newt arson parthia seismograph
1172
  A: Let's think step by step.
1173
  The first letter: "newt": "n" (14). "arson": "a" (1). "parthia": "p" (16). "seismography": "s" (19). "mugho": "m" (13). "aspect": "a" (1). "census": "c" (3). We now have: (1) ["arson" ? "aspect"] < (3) "census" < (13) "mugho" < (14) "newt" < (16) "parthia" < (19) "seismography". Now let's sort this subpart ["arson" ? "aspect"] by looking at their second letters.
1174
  The second letter: "arson": "r" (18). "aspect": "s" (19). We now have: (18) "arson" < (19) "aspect". Hence, we have ["arson" < "aspect"] < "census" < "mugho" < "newt" < "parthia" < "seismography". So the answer is arson aspect census mugho newt parthia seismography.""",
1175
- "sports_understanding": """Determine whether an artificially constructed sentence relating to sports is plausible or not.
 
 
1176
 
1177
  Q: Is the following sentence plausible? "Bam Adebayo scored a reverse layup in the Western Conference Finals."
1178
  A: Let's think step by step. Bam Adebayo is an American basketball player. Scoring a reverse layup in the Western Conference Finals is part of the NBA Finals. So the answer is yes.
@@ -1182,7 +1340,9 @@ A: Let's think step by step. Santi Cazorla is a soccer player. Touchdown is part
1182
 
1183
  Q: Is the following sentence plausible? "DeMar DeRozan was called for the goal tend."
1184
  A: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. So the answer is yes.""",
1185
- "boolean_expressions": """Evaluate the result of a random Boolean expression.
 
 
1186
 
1187
  Q: not ( ( not not True ) ) is
1188
  A: Let's think step by step.
@@ -1205,7 +1365,9 @@ Remember that (i) expressions inside brackets are always evaluated first and tha
1205
  We first simplify this expression "Z" as follows: "Z = not not ( not ( False ) ) = not not ( A )" where "A = not ( False )".
1206
  Let's evaluate A: A = not ( False ) = not False = True.
1207
  Plugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True.""",
1208
- "object_counting": """Questions that involve enumerating objects and asking the model to count them.
 
 
1209
 
1210
  Q: I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana, a flute, an orange, and a violin. How many fruits do I have?
1211
  A: Let's think step by step.
@@ -1242,7 +1404,9 @@ We first identify the vegetables on the list and include their quantity in paren
1242
  - garlic (1)
1243
  - yam (1)
1244
  Now, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8. So the answer is 8.""",
1245
- "formal_fallacies": """Distinguish deductively valid arguments from formal fallacies.
 
 
1246
 
1247
  Q: "It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: To begin with, Lesley is a close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy. It follows that Lesley is a great-grandfather of Leroy."
1248
  Is the argument, given the explicitly stated premises, deductively valid or invalid?
@@ -1286,7 +1450,9 @@ By (1), we have if X = infrequent-user(Paul Mitchell), then X = rare-consumer(Ni
1286
  The case X = rare-consumer(Nioxin) does not appear in (2).
1287
  The case X = loyal-buyer(Caress) does not appear in (2), either.
1288
  So, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid.""",
1289
- "causal_judgement": """Answer questions about causal attribution.
 
 
1290
 
1291
  Q: How would a typical person answer each of the following questions about causation?
1292
  Frank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?
@@ -1311,7 +1477,7 @@ Options:
1311
  - No
1312
  A: Let's think step by step.
1313
  Here in this question, we are told that "He aims the dart at the low point region." A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.""",
1314
- "web_of_lies": """Evaluate a random boolean function expressed as a word problem.
1315
 
1316
  Q: Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent lies. Does Raymond tell the truth?
1317
  A: Let's think step by step.
 
2
  import re
3
  import numpy as np
4
  from typing import Any
 
 
5
 
6
  NUMERIC_IN_EN = r"(?:[\s=+-/<>($:\.\*\\])(?=\S)((?:0|(?:\d{1,3}(?:,\d{3})+(?=\D|$))|(?:\d+))(?:\.\d+)?%?)(?:(?![^\s=+-/>)$:\.\*\\])|(?=, ))"
7
  NUMERIC_IN_ZH = (
 
9
  )
10
 
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def extract_choice_zh(gen):
13
  # 答案是A | 选项是A | 应该选A选项
14
  res = re.search(
 
40
  def extract_choice(gen):
41
  # answer is A | choice is A | choose A
42
  res = re.search(
43
+ r"(?:(?:[Cc]hoose)|(?:(?:[Aa]nswer|[Cc]hoice)(?![^ABCD]{0,20}?(?:n't|not))[^ABCD]{0,10}?\b(?:|is|:|be))\b)[^ABCD]{0,20}?\b(A|B|C|D)\b",
44
  gen,
45
  )
46
 
47
  # A is correct | A is right
48
  if res is None:
49
  res = re.search(
50
+ r"\b(A|B|C|D)\b(?![^ABCD]{0,8}?(?:n't|not)[^ABCD]{0,5}?(?:correct|right))[^ABCD]{0,10}?\b(?:correct|right)\b",
51
  gen,
52
  )
53
 
54
  # straight answer: A
55
  if res is None:
56
+ res = re.search(r"^(A|B|C|D)(?:\.|,|:|$)", gen)
57
 
58
  # simply extract the first appearred letter
59
  if res is None:
60
+ res = re.search(r"(?<![a-zA-Z])(A|B|C|D)(?![a-zA-Z=])", gen)
61
 
62
  if res is None:
63
+ res = "A"
64
 
65
  if isinstance(res, str):
66
  return res
 
102
 
103
 
104
  def remove_boxed(s):
105
+ if "\\boxed " in s:
106
+ left = "\\boxed "
107
+ assert s[: len(left)] == left
108
+ return s[len(left) :]
109
+
110
+ left = "\\boxed{"
111
+
112
+ assert s[: len(left)] == left
113
+ assert s[-1] == "}"
114
 
115
+ return s[len(left) : -1]
 
 
 
 
 
116
 
117
 
118
  def last_boxed_only_string(string):
 
145
  return retval
146
 
147
 
148
+ def fix_sqrt(string):
149
+ if "\\sqrt" not in string:
150
+ return string
151
+ splits = string.split("\\sqrt")
152
+ new_string = splits[0]
153
+ for split in splits[1:]:
154
+ if split[0] != "{":
155
+ a = split[0]
156
+ new_substr = "\\sqrt{" + a + "}" + split[1:]
157
+ else:
158
+ new_substr = "\\sqrt" + split
159
+ new_string += new_substr
160
+ return new_string
161
+
162
+
163
+ def remove_right_units(string):
164
+ # "\\text{ " only ever occurs (at least in the val set) when describing units
165
+ if "\\text{ " in string:
166
+ splits = string.split("\\text{ ")
167
+ assert len(splits) == 2
168
+ return splits[0]
169
  else:
170
+ return string
171
+
172
+
173
+ def fix_fracs(string):
174
+ substrs = string.split("\\frac")
175
+ new_str = substrs[0]
176
+ if len(substrs) > 1:
177
+ substrs = substrs[1:]
178
+ for substr in substrs:
179
+ new_str += "\\frac"
180
+ if substr[0] == "{":
181
+ new_str += substr
182
+ else:
183
+ try:
184
+ assert len(substr) >= 2
185
+ except AssertionError:
186
+ return string
187
+ a = substr[0]
188
+ b = substr[1]
189
+ if b != "{":
190
+ if len(substr) > 2:
191
+ post_substr = substr[2:]
192
+ new_str += "{" + a + "}{" + b + "}" + post_substr
193
+ else:
194
+ new_str += "{" + a + "}{" + b + "}"
195
+ else:
196
+ if len(substr) > 2:
197
+ post_substr = substr[2:]
198
+ new_str += "{" + a + "}" + b + post_substr
199
+ else:
200
+ new_str += "{" + a + "}" + b
201
+ string = new_str
202
+ return string
203
+
204
+
205
+ def fix_a_slash_b(string):
206
+ if len(string.split("/")) != 2:
207
+ return string
208
+ a = string.split("/")[0]
209
+ b = string.split("/")[1]
210
+ try:
211
+ a = int(a)
212
+ b = int(b)
213
+ assert string == "{}/{}".format(a, b)
214
+ new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
215
+ return new_string
216
+ except Exception as e:
217
+ return string
218
+
219
+
220
+ def strip_string(string):
221
+ # linebreaks
222
+ string = string.replace("\n", "")
223
+
224
+ # remove inverse spaces
225
+ string = string.replace("\\!", "")
226
+
227
+ # replace \\ with \
228
+ string = string.replace("\\\\", "\\")
229
+
230
+ # replace tfrac and dfrac with frac
231
+ string = string.replace("tfrac", "frac")
232
+ string = string.replace("dfrac", "frac")
233
+
234
+ # remove \left and \right
235
+ string = string.replace("\\left", "")
236
+ string = string.replace("\\right", "")
237
+
238
+ # Remove circ (degrees)
239
+ string = string.replace("^{\\circ}", "")
240
+ string = string.replace("^\\circ", "")
241
 
242
+ # remove dollar signs
243
+ string = string.replace("\\$", "")
244
+
245
+ # remove units (on the right)
246
+ string = remove_right_units(string)
247
+
248
+ # remove percentage
249
+ string = string.replace("\\%", "")
250
+ string = string.replace("\%", "") # noqa: W605
251
+
252
+ # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
253
+ string = string.replace(" .", " 0.")
254
+ string = string.replace("{.", "{0.")
255
+ # if empty, return empty string
256
+ if len(string) == 0:
257
+ return string
258
+ if string[0] == ".":
259
+ string = "0" + string
260
+
261
+ # to consider: get rid of e.g. "k = " or "q = " at beginning
262
+ if len(string.split("=")) == 2:
263
+ if len(string.split("=")[0]) <= 2:
264
+ string = string.split("=")[1]
265
+
266
+ # fix sqrt3 --> sqrt{3}
267
+ string = fix_sqrt(string)
268
+
269
+ # remove spaces
270
+ string = string.replace(" ", "")
271
+
272
+ # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
273
+ string = fix_fracs(string)
274
+
275
+ # manually change 0.5 --> \frac{1}{2}
276
+ if string == "0.5":
277
+ string = "\\frac{1}{2}"
278
+
279
+ # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
280
+ # string = fix_a_slash_b(string)
281
+
282
+ return string
283
+
284
+
285
+ def get_answer(string):
286
+ try:
287
+ answer = remove_boxed(last_boxed_only_string(string))
288
+ # answer = strip_string(answer)
289
+ except Exception:
290
+ answer = string
291
+ return answer
292
+
293
+
294
+ def is_equiv(str1, str2, verbose=False):
295
+ if str1 is None and str2 is None:
296
+ print("WARNING: Both None")
297
+ return False
298
+ if str1 is None or str2 is None:
299
+ return False
300
+
301
+ try:
302
+ ss1 = strip_string(str1)
303
+ ss2 = strip_string(str2)
304
+ if verbose:
305
+ print(ss1, ss2)
306
+ return ss1 == ss2
307
+ except Exception:
308
+ return str1 == str2
309
 
310
  def first_option_postprocess(text: str, options: str) -> str:
311
  """Find first valid option for text."""
312
 
313
  patterns = [
314
+ f'[Tt]he answer is [{options}]',
315
+ f'[Tt]he correct answer\s?(?:option)?\s?is [{options}]', # noqa
316
+ f'答案(?:选项)?是(.*?)[{options}]',
317
+ f'答案(?:选项)?为(.*?)[{options}]',
318
+ f'答案(?:选项)?选(.*?)[{options}]',
319
+ f'选项[{options}]是?正确',
320
+ f'选项[{options}]为?正确',
321
+ f'固选(.*?)[{options}]',
322
+ f'答案应该是(.*?)[{options}]',
323
+ f'(\s|^)[{options}][\s。,,\.$]', # noqa
324
+ f'[{options}]',
325
  ]
326
 
327
  regexes = [re.compile(pattern) for pattern in patterns]
 
332
  for i in options:
333
  if i in outputs:
334
  return i
335
+ return ''
336
 
337
 
338
  def first_capital_postprocess(text: str) -> str:
339
  for t in text:
340
  if t.isupper():
341
  return t
342
+ return ''
343
 
344
 
345
  def general_postprocess(text: str) -> str:
346
  # Cut off the first newline, period, or comma
347
+ truncated_text = re.split(r'[\n.,]', text, 1)[0]
348
 
349
  # Remove punctuation
350
+ no_punctuation = re.sub(r'[^\w\s]', '', truncated_text)
351
 
352
  # Remove article
353
+ no_articles = re.sub(r'\b(a|an|the)\b',
354
+ '',
355
+ no_punctuation,
356
+ flags=re.IGNORECASE)
357
 
358
  # Remove duplicated blank spaces
359
+ cleaned_text = re.sub(r'\s+', ' ', no_articles).strip()
360
 
361
  return cleaned_text
362
 
363
+
364
  def bbh_mcq_postprocess(text: str) -> str:
365
  ans = text
366
+ ans_line = ans.split('answer is ')
367
  if len(ans_line) != 1:
368
  ans = ans_line[-1].strip()
369
+ match = re.search(r'\(([A-Z])\)*', ans)
370
  if match:
371
  return f"({match.group(1)})"
372
+ match = re.search(r'([A-Z])', ans)
373
  if match:
374
  return f"({match.group(1)})"
375
  return f"({ans})"
 
377
 
378
  def bbh_freeform_postprocess(text: str) -> str:
379
  ans = text
380
+ ans_line = ans.split('answer is ')
381
  if len(ans_line) != 1:
382
  ans = ans_line[-1].strip()
383
+ ans = ans.split('\n')[0]
384
+ if ans.endswith('.'):
385
  ans = ans[:-1]
386
  return ans
387
 
388
 
389
+
390
  icl_prompts = {
391
+ 'temporal_sequences': """Task description: Answer questions about which times certain events could have occurred.
392
 
393
  Q: Today, Emily went to the museum. Between what times could they have gone?
394
  We know that:
 
465
  3pm-4pm: free.
466
  The beach closure time: 4pm.
467
  The only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D).""",
468
+
469
+
470
+ 'disambiguation_qa': """Clarify the meaning of sentences with ambiguous pronouns.
471
 
472
  Q: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.
473
  Sentence: The chief told the counselor that they took the day off.
 
504
  Let's consider Y first: "X will plan to meet Y at Y's office." This case makes sense, because X might want to meet up with Y at Y's office.
505
  Now, consider X: "X will plan to meet Y at X's office." This case also makes sense, because X might want to meet up with Y at X's own office.
506
  Because both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C).""",
507
+
508
+
509
+ 'date_understanding': """Infer the date from context.
510
 
511
  Q: Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?
512
  Options:
 
539
  (F) 12/03/1960
540
  A: Let's think step by step.
541
  If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B).""",
542
+
543
+
544
+ 'tracking_shuffled_objects_three_objects': """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
545
 
546
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
547
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
 
581
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
582
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
583
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
584
+
585
+
586
+ 'penguins_in_a_table': """Answer questions about a table of penguins and their attributes.
587
 
588
  Q: Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. We now add a penguin to the table:
589
  James, 12, 90, 12
 
624
  When we sort their names alphabetically, we get Bernard, Gwen, Louis, Vincent.
625
  The name of the second penguin sorted by alphabetical order is Gwen.
626
  The name of the second penguin sorted by alphabetic order is Gwen. So the answer is (D).""",
627
+
628
+
629
+ 'geometric_shapes': """Name geometric shapes from their SVG paths.
630
 
631
  Q: This SVG path element <path d="M 31.00,73.00 L 32.00,59.00 L 44.00,50.00 L 49.00,41.00 L 64.00,37.00 L 71.00,55.00 L 64.00,76.00 L 52.00,61.00 L 31.00,73.00"/> draws a
632
  Options:
 
704
  Length of side C: |C| = sqrt((41.00-45.00)^2 + (33.00-34.00)^2)) = sqrt((-4)^2 + (-1)^2) = sqrt(16 + 1) = sqrt(17).
705
  Length of side D: |D| = sqrt((45.00-41.00)^2 + (34.00-43.00)^2)) = sqrt((4)^2 + (-9)^2) = sqrt(16 + 81) = sqrt(97).
706
  Note that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is "kite". So the answer is (D).""",
707
+
708
+
709
+ 'snarks': """Determine which of two sentences is sarcastic.
710
 
711
  According to Cambridge University Dictionary, sarcasm is "the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way." Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.
712
 
 
736
  If we look at (A), it likens the consistency in the league's punishments with that in politics. Because politics or political affairs are often not considered to be consistent or dependable, this sentence appears to be satirical.
737
  If we look at (B), it likens the consistency in the league's punishments with that in morality. Discussing the consistency of the league's punishments in the context of morality, ethics, or law makes sense and does not appear to make a satirical point about anything.
738
  Above the above, the sarcastic option is (A). So the answer is (A).""",
739
+
740
+
741
+ 'ruin_names': """Select the humorous edit that 'ruins' the input movie or musical artist name.
742
 
743
  Q: Which of the following is a humorous edit of this artist or movie name: 'whitesnake'?
744
  Options:
 
782
  (C) "courting crows": Here the word "counting" is changed to "courting", and "courting" is an actual word; however, "courting crows" does not sound as humorous as "counting cows".
783
  (D) "coutnting crows": Here the word "counting" is changed to "coutnting", but the word "coutnting" is not an actual word.
784
  Above the above, the only humorous edit is (B). So the answer is (B).""",
785
+
786
+
787
+ 'tracking_shuffled_objects_seven_objects': """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
788
 
789
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
790
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
 
824
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
825
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
826
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
827
+
828
+
829
+ 'tracking_shuffled_objects_five_objects': """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
830
 
831
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
832
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
 
866
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
867
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
868
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
869
+
870
+
871
+ 'logical_deduction_three_objects': """A logical deduction task which requires deducing the order of a sequence of objects.
872
 
873
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
874
  Options:
 
905
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
906
  According to this ordering, the leftmost book is the white book.
907
  The white book is the leftmost. So the answer is (C).""",
908
+
909
+
910
+ 'hyperbaton': """Order adjectives correctly in English sentences.
911
 
912
  Q: Which sentence has the correct adjective order:
913
  Options:
 
935
  When there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: "[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun".
936
  Option (A): "blue gold wonderful square shoe". (1) "blue" falls into the color category. (2) "gold" falls into the material category. (3) "wonderful" falls into the opinion category. (4) "square" falls into the shape category. The adjective order that Option (A) has is [5. color] [7. material] [1. opinion] [4. shape] (or, in numeric terms, 5 7 1 4). Because 5 < 7 < 1 < 4 is not correct, (A) does not have the correct ordering.
937
  Option (B): "wonderful square blue gold shoe". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. color] [7. material] (or, in numeric terms, 1 4 5 7 ). Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B).""",
938
+
939
+
940
+ 'logical_deduction_five_objects': """A logical deduction task which requires deducing the order of a sequence of objects.
941
 
942
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
943
  Options:
 
974
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
975
  According to this ordering, the leftmost book is the white book.
976
  The white book is the leftmost. So the answer is (C).""",
977
+
978
+
979
+ 'logical_deduction_seven_objects': """A logical deduction task which requires deducing the order of a sequence of objects.
980
 
981
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
982
  Options:
 
1013
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
1014
  According to this ordering, the leftmost book is the white book.
1015
  The white book is the leftmost. So the answer is (C).""",
1016
+
1017
+
1018
+ 'movie_recommendation': """Recommend movies similar to the given list of movies.
1019
 
1020
  Q: Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big Lebowski:
1021
  Options:
 
1057
  - Inside Out (animation, family, comedy; 2015)
1058
  - Forrest Gump (comedy, drama, romance; 1994)
1059
  These are all famous movies produced in the past few decades.Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. So the answer is (D).""",
1060
+
1061
+
1062
+ 'salient_translation_error_detection': """Detect the type of error in an English translation of a German source sentence.
1063
 
1064
  Q: The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. Source: In der Liste der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet.
1065
  Translation: In the list of architectural monuments in Lenzen all architectural monuments of the Brandenburg city of Lenzen and its districts are listed.
 
1099
  (F) Facts
1100
  A: Let's think step by step.
1101
  We solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is "Łeba is a small town and seaside resort in the Powiat Lęborski of the Polish Pomeranian Voivodeship." On the other hand, the provided translation is "Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland." Note that the provided sentence says, "Łeba is not a small town ..." However, the translation should have been "Łeba is a small town ..." Because a negation is introduced at the beginning of the sentence and has fundamentally changed the meaning of the original source, the translation contains an error pertaining to Negation or Antonyms. So the answer is (C).""",
1102
+
1103
+
1104
+ 'reasoning_about_colored_objects': """Answer extremely simple questions about the colors of objects on a surface.
1105
 
1106
  Q: On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the stress ball?
1107
  Options:
 
1164
  According to this question, the objects are arranged in a row, from left to right, as follows: (1) a teal plate, (2) a burgundy keychain, (3) a yellow scrunchiephone charger, (4) an orange mug, (5) a pink notebook, (6) a grey cup.
1165
  The teal plate is the first item, namely (1). There is no item to the left of the teal item.
1166
  The number of non-orange items to the left of the teal item is zero. So the answer is (A).""",
1167
+
1168
+ 'multistep_arithmetic_two': """Solve multi-step arithmetic problems.
1169
 
1170
  Q: ((-5 + 9 * -4 - 0) * (4 + -7 + 0 * -5)) =
1171
  A: Let's think step by step.
 
1190
  Let's calculate A = (-3 + 5 * 8 * -4) = (-3 + (5 * 8) * -4) = (-3 + (40) * -4) = (-3 + (40 * -4)) = (-3 + -160) = -163.
1191
  Let's calculate B = (9 - 8 * -7 + -9) = (9 - (8 * -7) + -9) = (9 - (-56) + -9) = ((9 - (-56)) + -9) = ((65) + -9)= (65 - 9) = 56.
1192
  Then, the final equation is A - B = -163 - 56 = -219. So the answer is -219.""",
1193
+
1194
+
1195
+ 'navigate': """Given a series of navigation instructions, determine whether one would end up back at the starting point.
1196
 
1197
  Q: If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.
1198
  Options:
 
1235
  (3) Take 9 steps left: (0, -7), facing the positive y-axis.
1236
  (4) Take 7 steps right: (0, 7), facing the positive y-axis.
1237
  Since (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes.""",
1238
+
1239
+
1240
+ 'dyck_languages': """Correctly close a Dyck-n word.
1241
 
1242
  Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [
1243
  A: Let's think step by step.
 
1309
  Now, we have reached the end. The final stack is "< [ {".
1310
  We will need to pop out "{", "[", "<" one by one in that order.
1311
  So, we need "}", "]", ">". So the answer is } ] >.""",
1312
+
1313
+
1314
+ 'word_sorting': """Sort a list of words.
1315
 
1316
  Q: Sort the following words alphabetically: List: oven costume counterpart
1317
  A: Let's think step by step.
 
1328
  A: Let's think step by step.
1329
  The first letter: "newt": "n" (14). "arson": "a" (1). "parthia": "p" (16). "seismography": "s" (19). "mugho": "m" (13). "aspect": "a" (1). "census": "c" (3). We now have: (1) ["arson" ? "aspect"] < (3) "census" < (13) "mugho" < (14) "newt" < (16) "parthia" < (19) "seismography". Now let's sort this subpart ["arson" ? "aspect"] by looking at their second letters.
1330
  The second letter: "arson": "r" (18). "aspect": "s" (19). We now have: (18) "arson" < (19) "aspect". Hence, we have ["arson" < "aspect"] < "census" < "mugho" < "newt" < "parthia" < "seismography". So the answer is arson aspect census mugho newt parthia seismography.""",
1331
+
1332
+
1333
+ 'sports_understanding': """Determine whether an artificially constructed sentence relating to sports is plausible or not.
1334
 
1335
  Q: Is the following sentence plausible? "Bam Adebayo scored a reverse layup in the Western Conference Finals."
1336
  A: Let's think step by step. Bam Adebayo is an American basketball player. Scoring a reverse layup in the Western Conference Finals is part of the NBA Finals. So the answer is yes.
 
1340
 
1341
  Q: Is the following sentence plausible? "DeMar DeRozan was called for the goal tend."
1342
  A: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. So the answer is yes.""",
1343
+
1344
+
1345
+ 'boolean_expressions': """Evaluate the result of a random Boolean expression.
1346
 
1347
  Q: not ( ( not not True ) ) is
1348
  A: Let's think step by step.
 
1365
  We first simplify this expression "Z" as follows: "Z = not not ( not ( False ) ) = not not ( A )" where "A = not ( False )".
1366
  Let's evaluate A: A = not ( False ) = not False = True.
1367
  Plugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True.""",
1368
+
1369
+
1370
+ 'object_counting': """Questions that involve enumerating objects and asking the model to count them.
1371
 
1372
  Q: I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana, a flute, an orange, and a violin. How many fruits do I have?
1373
  A: Let's think step by step.
 
1404
  - garlic (1)
1405
  - yam (1)
1406
  Now, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8. So the answer is 8.""",
1407
+
1408
+
1409
+ 'formal_fallacies': """Distinguish deductively valid arguments from formal fallacies.
1410
 
1411
  Q: "It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: To begin with, Lesley is a close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy. It follows that Lesley is a great-grandfather of Leroy."
1412
  Is the argument, given the explicitly stated premises, deductively valid or invalid?
 
1450
  The case X = rare-consumer(Nioxin) does not appear in (2).
1451
  The case X = loyal-buyer(Caress) does not appear in (2), either.
1452
  So, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid.""",
1453
+
1454
+
1455
+ 'causal_judgement': """Answer questions about causal attribution.
1456
 
1457
  Q: How would a typical person answer each of the following questions about causation?
1458
  Frank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?
 
1477
  - No
1478
  A: Let's think step by step.
1479
  Here in this question, we are told that "He aims the dart at the low point region." A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.""",
1480
+ 'web_of_lies': """Evaluate a random boolean function expressed as a word problem.
1481
 
1482
  Q: Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent lies. Does Raymond tell the truth?
1483
  A: Let's think step by step.