AdamLucek committed on
Commit
5633370
·
verified ·
1 Parent(s): a0cb545

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +4 -4
README.md CHANGED
@@ -77,12 +77,12 @@ print(outputs[0]["generated_text"])
77
  </div>
78
  </div>
79
 
80
- ## OpenLLM Leaderboard Metrics (WIP)
81
 
82
  | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
83
  |-----------------------------------------------------------|-------|------|-----:|-----------------------|---|-----:|---|------|
84
  |leaderboard | N/A| | | | | | | |
85
- | - leaderboard_bbh | N/A| | | | | | | |
86
  | - leaderboard_bbh_boolean_expressions | 1|none | 3|acc_norm |↑ |0.6840|± |0.0295|
87
  | - leaderboard_bbh_causal_judgement | 1|none | 3|acc_norm |↑ |0.5134|± |0.0366|
88
  | - leaderboard_bbh_date_understanding | 1|none | 3|acc_norm |↑ |0.1920|± |0.0250|
@@ -107,7 +107,7 @@ print(outputs[0]["generated_text"])
107
  | - leaderboard_bbh_tracking_shuffled_objects_seven_objects| 1|none | 3|acc_norm |↑ |0.1400|± |0.0220|
108
  | - leaderboard_bbh_tracking_shuffled_objects_three_objects| 1|none | 3|acc_norm |↑ |0.3520|± |0.0303|
109
  | - leaderboard_bbh_web_of_lies | 1|none | 3|acc_norm |↑ |0.4880|± |0.0317|
110
- | - leaderboard_gpqa | N/A| | | | | | | |
111
  | - leaderboard_gpqa_diamond | 1|none | 0|acc_norm |↑ |0.2576|± |0.0312|
112
  | - leaderboard_gpqa_extended | 1|none | 0|acc_norm |↑ |0.2436|± |0.0184|
113
  | - leaderboard_gpqa_main | 1|none | 0|acc_norm |↑ |0.2433|± |0.0203|
@@ -124,7 +124,7 @@ print(outputs[0]["generated_text"])
124
  | - leaderboard_math_prealgebra_hard | 2|none | 4|exact_match |↑ |0.0000|± | 0|
125
  | - leaderboard_math_precalculus_hard | 2|none | 4|exact_match |↑ |0.0000|± | 0|
126
  | - leaderboard_mmlu_pro | 0.1|none | 5|acc |↑ |0.1222|± |0.0030|
127
- | - leaderboard_musr | N/A| | | | | | | |
128
  | - leaderboard_musr_murder_mysteries | 1|none | 0|acc_norm |↑ |0.5120|± |0.0317|
129
  | - leaderboard_musr_object_placements | 1|none | 0|acc_norm |↑ |0.2500|± |0.0271|
130
  | - leaderboard_musr_team_allocation | 1|none | 0|acc_norm |↑ |0.2680|± |0.0281|
 
77
  </div>
78
  </div>
79
 
80
+ ## OpenLLM Leaderboard Metrics
81
 
82
  | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
83
  |-----------------------------------------------------------|-------|------|-----:|-----------------------|---|-----:|---|------|
84
  |leaderboard | N/A| | | | | | | |
85
+ | - leaderboard_bbh | N/A| | |avg acc_norm | |0.3290| | |
86
  | - leaderboard_bbh_boolean_expressions | 1|none | 3|acc_norm |↑ |0.6840|± |0.0295|
87
  | - leaderboard_bbh_causal_judgement | 1|none | 3|acc_norm |↑ |0.5134|± |0.0366|
88
  | - leaderboard_bbh_date_understanding | 1|none | 3|acc_norm |↑ |0.1920|± |0.0250|
 
107
  | - leaderboard_bbh_tracking_shuffled_objects_seven_objects| 1|none | 3|acc_norm |↑ |0.1400|± |0.0220|
108
  | - leaderboard_bbh_tracking_shuffled_objects_three_objects| 1|none | 3|acc_norm |↑ |0.3520|± |0.0303|
109
  | - leaderboard_bbh_web_of_lies | 1|none | 3|acc_norm |↑ |0.4880|± |0.0317|
110
+ | - leaderboard_gpqa | N/A| | |avg acc_norm | |0.2482| | |
111
  | - leaderboard_gpqa_diamond | 1|none | 0|acc_norm |↑ |0.2576|± |0.0312|
112
  | - leaderboard_gpqa_extended | 1|none | 0|acc_norm |↑ |0.2436|± |0.0184|
113
  | - leaderboard_gpqa_main | 1|none | 0|acc_norm |↑ |0.2433|± |0.0203|
 
124
  | - leaderboard_math_prealgebra_hard | 2|none | 4|exact_match |↑ |0.0000|± | 0|
125
  | - leaderboard_math_precalculus_hard | 2|none | 4|exact_match |↑ |0.0000|± | 0|
126
  | - leaderboard_mmlu_pro | 0.1|none | 5|acc |↑ |0.1222|± |0.0030|
127
+ | - leaderboard_musr | N/A| | |avg acc_norm | |0.3433| | |
128
  | - leaderboard_musr_murder_mysteries | 1|none | 0|acc_norm |↑ |0.5120|± |0.0317|
129
  | - leaderboard_musr_object_placements | 1|none | 0|acc_norm |↑ |0.2500|± |0.0271|
130
  | - leaderboard_musr_team_allocation | 1|none | 0|acc_norm |↑ |0.2680|± |0.0281|