Update README.md
README.md CHANGED
@@ -77,12 +77,12 @@ print(outputs[0]["generated_text"])
 </div>
 </div>
 
-## OpenLLM Leaderboard Metrics
+## OpenLLM Leaderboard Metrics
 
 | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
 |-----------------------------------------------------------|-------|------|-----:|-----------------------|---|-----:|---|------|
 |leaderboard | N/A| | | | | | | |
-| - leaderboard_bbh | N/A| | | | |
+| - leaderboard_bbh | N/A| | | | |0.3290| | |
 | - leaderboard_bbh_boolean_expressions | 1|none | 3|acc_norm |↑ |0.6840|± |0.0295|
 | - leaderboard_bbh_causal_judgement | 1|none | 3|acc_norm |↑ |0.5134|± |0.0366|
 | - leaderboard_bbh_date_understanding | 1|none | 3|acc_norm |↑ |0.1920|± |0.0250|
@@ -107,7 +107,7 @@ print(outputs[0]["generated_text"])
 | - leaderboard_bbh_tracking_shuffled_objects_seven_objects| 1|none | 3|acc_norm |↑ |0.1400|± |0.0220|
 | - leaderboard_bbh_tracking_shuffled_objects_three_objects| 1|none | 3|acc_norm |↑ |0.3520|± |0.0303|
 | - leaderboard_bbh_web_of_lies | 1|none | 3|acc_norm |↑ |0.4880|± |0.0317|
-| - leaderboard_gpqa | N/A| | | | |
+| - leaderboard_gpqa | N/A| | | | |0.2482| | |
 | - leaderboard_gpqa_diamond | 1|none | 0|acc_norm |↑ |0.2576|± |0.0312|
 | - leaderboard_gpqa_extended | 1|none | 0|acc_norm |↑ |0.2436|± |0.0184|
 | - leaderboard_gpqa_main | 1|none | 0|acc_norm |↑ |0.2433|± |0.0203|
@@ -124,7 +124,7 @@ print(outputs[0]["generated_text"])
 | - leaderboard_math_prealgebra_hard | 2|none | 4|exact_match |↑ |0.0000|± | 0|
 | - leaderboard_math_precalculus_hard | 2|none | 4|exact_match |↑ |0.0000|± | 0|
 | - leaderboard_mmlu_pro | 0.1|none | 5|acc |↑ |0.1222|± |0.0030|
-| - leaderboard_musr | N/A| | |
+| - leaderboard_musr | N/A| | |avg acc_norm | |0.3433| | |
 | - leaderboard_musr_murder_mysteries | 1|none | 0|acc_norm |↑ |0.5120|± |0.0317|
 | - leaderboard_musr_object_placements | 1|none | 0|acc_norm |↑ |0.2500|± |0.0271|
 | - leaderboard_musr_team_allocation | 1|none | 0|acc_norm |↑ |0.2680|± |0.0281|
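The aggregate values added in this commit (0.3290 for leaderboard_bbh, 0.2482 for leaderboard_gpqa, 0.3433 for leaderboard_musr) are consistent with a plain unweighted mean of the sub-task acc_norm scores shown in the table. The sketch below reproduces the GPQA and MuSR figures under that assumption; the diff does not state how the BBH value was computed, and its sub-tasks are only partially visible here, so treat this as a plausible reading rather than the author's documented method.

```python
# Hypothetical sketch: reproduce the added aggregate rows as unweighted means of
# the sub-task acc_norm scores listed in the table. The averaging rule is an
# assumption inferred from the numbers, not something stated in the diff.

musr_subtasks = {
    "leaderboard_musr_murder_mysteries": 0.5120,
    "leaderboard_musr_object_placements": 0.2500,
    "leaderboard_musr_team_allocation": 0.2680,
}
gpqa_subtasks = {
    "leaderboard_gpqa_diamond": 0.2576,
    "leaderboard_gpqa_extended": 0.2436,
    "leaderboard_gpqa_main": 0.2433,
}

def unweighted_mean(scores: dict[str, float]) -> float:
    """Average sub-task scores without weighting by sample count."""
    return sum(scores.values()) / len(scores)

print(f"leaderboard_musr ~ {unweighted_mean(musr_subtasks):.4f}")  # 0.3433
print(f"leaderboard_gpqa ~ {unweighted_mean(gpqa_subtasks):.4f}")  # 0.2482
```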