Weyaxi committed on
Commit
1148fdd
1 Parent(s): 9a140a8
Files changed (1) hide show
  1. functions.py +45 -45
functions.py CHANGED
@@ -38,69 +38,69 @@ def get_query_url(repo):
38
 
39
  def get_task_summary(results):
40
  return {
41
- "ARC":
42
- {"dataset_type":"ai2_arc",
43
- "dataset_name":"AI2 Reasoning Challenge (25-Shot)",
44
- "metric_type":"acc_norm",
45
- "metric_value":results["ARC"],
46
- "dataset_config":"ARC-Challenge",
47
- "dataset_split":"test",
48
  "dataset_revision":None,
49
- "dataset_args":{"num_few_shot": 25},
50
- "metric_name":"normalized accuracy"
51
  },
52
- "HellaSwag":
53
- {"dataset_type":"hellaswag",
54
- "dataset_name":"HellaSwag (10-Shot)",
55
  "metric_type":"acc_norm",
56
- "metric_value":results["HellaSwag"],
57
- "dataset_config":None,
58
- "dataset_split":"validation",
59
  "dataset_revision":None,
60
- "dataset_args":{"num_few_shot": 10},
61
  "metric_name":"normalized accuracy"
62
  },
63
- "MMLU":
64
  {
65
- "dataset_type":"cais/mmlu",
66
- "dataset_name":"MMLU (5-Shot)",
67
- "metric_type":"acc",
68
- "metric_value":results["MMLU"],
69
- "dataset_config":"all",
70
- "dataset_split":"test",
71
  "dataset_revision":None,
72
- "dataset_args":{"num_few_shot": 5},
73
- "metric_name":"accuracy"
74
  },
75
- "TruthfulQA":
76
  {
77
- "dataset_type":"truthful_qa",
78
- "dataset_name":"TruthfulQA (0-shot)",
79
- "metric_type":"mc2",
80
- "metric_value":results["TruthfulQA"],
81
- "dataset_config":"multiple_choice",
82
- "dataset_split":"validation",
83
  "dataset_revision":None,
84
  "dataset_args":{"num_few_shot": 0},
85
- "metric_name":None
86
  },
87
- "Winogrande":
88
  {
89
- "dataset_type":"winogrande",
90
  "dataset_name":"Winogrande (5-shot)",
91
- "metric_type":"acc",
92
- "metric_value":results["Winogrande"],
93
- "dataset_config":"winogrande_xl",
94
- "dataset_split":"validation",
95
- "dataset_args":{"num_few_shot": 5},
96
- "metric_name":"accuracy"
97
  },
98
- "GSM8K":
99
  {
100
- "dataset_type":"gsm8k",
101
- "dataset_name":"GSM8k (5-shot)",
102
  "metric_type":"acc",
103
- "metric_value":results["GSM8K"],
104
  "dataset_config":"main",
105
  "dataset_split":"test",
106
  "dataset_args":{"num_few_shot": 5},
 
38
 
39
  def get_task_summary(results):
40
  return {
41
+ "IFEval":
42
+ {"dataset_type":"HuggingFaceH4/ifeval",
43
+ "dataset_name":"IFEval (0-Shot)",
44
+ "metric_type":"inst_level_strict_acc",
45
+ "metric_value":results["IFEval"],
46
+ "dataset_config": None, # don't know
47
+ "dataset_split": None, # don't know
48
  "dataset_revision":None,
49
+ "dataset_args":{"num_few_shot": 0},
50
+ "metric_name":"strict accuracy"
51
  },
52
+ "BBH":
53
+ {"dataset_type":"BBH",
54
+ "dataset_name":"BBH (3-Shot)",
55
  "metric_type":"acc_norm",
56
+ "metric_value":results["BBH"],
57
+ "dataset_config": None, # don't know
58
+ "dataset_split": None, # don't know
59
  "dataset_revision":None,
60
+ "dataset_args":{"num_few_shot": 3},
61
  "metric_name":"normalized accuracy"
62
  },
63
+ "MATH Lvl 5":
64
  {
65
+ "dataset_type":"hendrycks/competition_math",
66
+ "dataset_name":"MATH Lvl 5 (4-Shot)",
67
+ "metric_type":"exact_match",
68
+ "metric_value":results["MATH Lvl 5"],
69
+ "dataset_config": None, # don't know
70
+ "dataset_split": None, # don't know
71
  "dataset_revision":None,
72
+ "dataset_args":{"num_few_shot": 4},
73
+ "metric_name":"exact match"
74
  },
75
+ "GPQA":
76
  {
77
+ "dataset_type":"Idavidrein/gpqa",
78
+ "dataset_name":"GPQA (0-shot)",
79
+ "metric_type":"acc_norm",
80
+ "metric_value":results["GPQA"],
81
+ "dataset_config": None, # don't know
82
+ "dataset_split": None, # don't know
83
  "dataset_revision":None,
84
  "dataset_args":{"num_few_shot": 0},
85
+ "metric_name":"acc_norm"
86
  },
87
+ "MUSR":
88
  {
89
+ "dataset_type":"TAUR-Lab/MuSR",
90
  "dataset_name":"Winogrande (5-shot)",
91
+ "metric_type":"acc_norm",
92
+ "metric_value":results["MUSR"],
93
+ "dataset_config": None, # don't know
94
+ "dataset_split": None, # don't know
95
+ "dataset_args":{"num_few_shot": 0},
96
+ "metric_name":"acc_norm"
97
  },
98
+ "MMLU-PRO":
99
  {
100
+ "dataset_type":"TIGER-Lab/MMLU-Pro",
101
+ "dataset_name":"MMLU-PRO (5-shot)",
102
  "metric_type":"acc",
103
+ "metric_value":results["MMLU-PRO"],
104
  "dataset_config":"main",
105
  "dataset_split":"test",
106
  "dataset_args":{"num_few_shot": 5},