self-improving-leaderboard/eval-results/BioMistral/BioMistral-7B/BioMistral_BioMistral-7B_result_2024-05-30 01_33_58.json
{
    "results": {
        "daily": {
            "daily": 9
        },
        "quarterly": {
            "quarterly": 9
        },
        "harness|arc_challenge|25": {
            "acc": 0.257679180887372,
            "acc_stderr": 0.012780770562768416,
            "acc_norm": 0.3122866894197952,
            "acc_norm_stderr": 0.013542598541688065
        },
        "harness|hellaswag|10": {
            "acc": 0.3229436367257518,
            "acc_stderr": 0.004666457279979418,
            "acc_norm": 0.39255128460466043,
            "acc_norm_stderr": 0.004873203269366306
        },
        "harness|mmlu_world_religions|5": {
            "acc": 0.34502923976608185,
            "acc_stderr": 0.036459813773888065,
            "acc_norm": 0.34502923976608185,
            "acc_norm_stderr": 0.036459813773888065
        },
        "harness|mmlu_management|5": {
            "acc": 0.4368932038834951,
            "acc_stderr": 0.04911147107365778,
            "acc_norm": 0.4368932038834951,
            "acc_norm_stderr": 0.04911147107365778
        },
        "harness|mmlu_miscellaneous|5": {
            "acc": 0.3780332056194125,
            "acc_stderr": 0.017339844462104625,
            "acc_norm": 0.3780332056194125,
            "acc_norm_stderr": 0.017339844462104625
        },
        "harness|mmlu_anatomy|5": {
            "acc": 0.3037037037037037,
            "acc_stderr": 0.039725528847851355,
            "acc_norm": 0.3037037037037037,
            "acc_norm_stderr": 0.039725528847851355
        },
        "harness|mmlu_abstract_algebra|5": {
            "acc": 0.37,
            "acc_stderr": 0.04852365870939099,
            "acc_norm": 0.37,
            "acc_norm_stderr": 0.04852365870939099
        },
        "harness|mmlu_conceptual_physics|5": {
            "acc": 0.28085106382978725,
            "acc_stderr": 0.02937917046412482,
            "acc_norm": 0.28085106382978725,
            "acc_norm_stderr": 0.02937917046412482
        },
        "harness|mmlu_virology|5": {
            "acc": 0.3373493975903614,
            "acc_stderr": 0.03680783690727581,
            "acc_norm": 0.3373493975903614,
            "acc_norm_stderr": 0.03680783690727581
        },
        "harness|mmlu_philosophy|5": {
            "acc": 0.3954983922829582,
            "acc_stderr": 0.027770918531427838,
            "acc_norm": 0.3954983922829582,
            "acc_norm_stderr": 0.027770918531427838
        },
        "harness|mmlu_human_aging|5": {
            "acc": 0.34977578475336324,
            "acc_stderr": 0.03200736719484503,
            "acc_norm": 0.34977578475336324,
            "acc_norm_stderr": 0.03200736719484503
        },
        "harness|mmlu_human_sexuality|5": {
            "acc": 0.3969465648854962,
            "acc_stderr": 0.04291135671009224,
            "acc_norm": 0.3969465648854962,
            "acc_norm_stderr": 0.04291135671009224
        },
        "harness|mmlu_medical_genetics|5": {
            "acc": 0.42,
            "acc_stderr": 0.049604496374885836,
            "acc_norm": 0.42,
            "acc_norm_stderr": 0.049604496374885836
        },
        "harness|mmlu_high_school_geography|5": {
            "acc": 0.4292929292929293,
            "acc_stderr": 0.03526552724601199,
            "acc_norm": 0.4292929292929293,
            "acc_norm_stderr": 0.03526552724601199
        },
        "harness|mmlu_electrical_engineering|5": {
            "acc": 0.4,
            "acc_stderr": 0.04082482904638628,
            "acc_norm": 0.4,
            "acc_norm_stderr": 0.04082482904638628
        },
        "harness|mmlu_college_physics|5": {
            "acc": 0.30392156862745096,
            "acc_stderr": 0.045766654032077636,
            "acc_norm": 0.30392156862745096,
            "acc_norm_stderr": 0.045766654032077636
        },
        "harness|mmlu_high_school_microeconomics|5": {
            "acc": 0.40336134453781514,
            "acc_stderr": 0.031866081214088314,
            "acc_norm": 0.40336134453781514,
            "acc_norm_stderr": 0.031866081214088314
        },
        "harness|mmlu_high_school_macroeconomics|5": {
            "acc": 0.40512820512820513,
            "acc_stderr": 0.024890471769938145,
            "acc_norm": 0.40512820512820513,
            "acc_norm_stderr": 0.024890471769938145
        },
        "harness|mmlu_computer_security|5": {
            "acc": 0.48,
            "acc_stderr": 0.050211673156867795,
            "acc_norm": 0.48,
            "acc_norm_stderr": 0.050211673156867795
        },
        "harness|mmlu_global_facts|5": {
            "acc": 0.32,
            "acc_stderr": 0.04688261722621505,
            "acc_norm": 0.32,
            "acc_norm_stderr": 0.04688261722621505
        },
        "harness|mmlu_jurisprudence|5": {
            "acc": 0.49074074074074076,
            "acc_stderr": 0.04832853553437055,
            "acc_norm": 0.49074074074074076,
            "acc_norm_stderr": 0.04832853553437055
        },
        "harness|mmlu_high_school_chemistry|5": {
            "acc": 0.37438423645320196,
            "acc_stderr": 0.03405155380561952,
            "acc_norm": 0.37438423645320196,
            "acc_norm_stderr": 0.03405155380561952
        },
        "harness|mmlu_high_school_biology|5": {
            "acc": 0.36774193548387096,
            "acc_stderr": 0.027430866579973474,
            "acc_norm": 0.36774193548387096,
            "acc_norm_stderr": 0.027430866579973474
        },
        "harness|mmlu_marketing|5": {
            "acc": 0.5598290598290598,
            "acc_stderr": 0.0325207417206305,
            "acc_norm": 0.5598290598290598,
            "acc_norm_stderr": 0.0325207417206305
        },
        "harness|mmlu_clinical_knowledge|5": {
            "acc": 0.3886792452830189,
            "acc_stderr": 0.030000485448675986,
            "acc_norm": 0.3886792452830189,
            "acc_norm_stderr": 0.030000485448675986
        },
        "harness|mmlu_public_relations|5": {
            "acc": 0.44545454545454544,
            "acc_stderr": 0.047605488214603246,
            "acc_norm": 0.44545454545454544,
            "acc_norm_stderr": 0.047605488214603246
        },
        "harness|mmlu_high_school_mathematics|5": {
            "acc": 0.34444444444444444,
            "acc_stderr": 0.028972648884844267,
            "acc_norm": 0.34444444444444444,
            "acc_norm_stderr": 0.028972648884844267
        },
        "harness|mmlu_high_school_physics|5": {
            "acc": 0.3443708609271523,
            "acc_stderr": 0.038796870240733264,
            "acc_norm": 0.3443708609271523,
            "acc_norm_stderr": 0.038796870240733264
        },
        "harness|mmlu_sociology|5": {
            "acc": 0.4577114427860697,
            "acc_stderr": 0.035228658640995975,
            "acc_norm": 0.4577114427860697,
            "acc_norm_stderr": 0.035228658640995975
        },
        "harness|mmlu_college_medicine|5": {
            "acc": 0.3815028901734104,
            "acc_stderr": 0.03703851193099521,
            "acc_norm": 0.3815028901734104,
            "acc_norm_stderr": 0.03703851193099521
        },
        "harness|mmlu_elementary_mathematics|5": {
            "acc": 0.35714285714285715,
            "acc_stderr": 0.02467786284133278,
            "acc_norm": 0.35714285714285715,
            "acc_norm_stderr": 0.02467786284133278
        },
        "harness|mmlu_college_biology|5": {
            "acc": 0.3333333333333333,
            "acc_stderr": 0.03942082639927213,
            "acc_norm": 0.3333333333333333,
            "acc_norm_stderr": 0.03942082639927213
        },
        "harness|mmlu_college_chemistry|5": {
            "acc": 0.47,
            "acc_stderr": 0.05016135580465919,
            "acc_norm": 0.47,
            "acc_norm_stderr": 0.05016135580465919
        },
        "harness|mmlu_us_foreign_policy|5": {
            "acc": 0.54,
            "acc_stderr": 0.05009082659620333,
            "acc_norm": 0.54,
            "acc_norm_stderr": 0.05009082659620333
        },
        "harness|mmlu_moral_disputes|5": {
            "acc": 0.44508670520231214,
            "acc_stderr": 0.02675625512966377,
            "acc_norm": 0.44508670520231214,
            "acc_norm_stderr": 0.02675625512966377
        },
        "harness|mmlu_logical_fallacies|5": {
            "acc": 0.34355828220858897,
            "acc_stderr": 0.03731133519673893,
            "acc_norm": 0.34355828220858897,
            "acc_norm_stderr": 0.03731133519673893
        },
        "harness|mmlu_prehistory|5": {
            "acc": 0.37037037037037035,
            "acc_stderr": 0.02686949074481525,
            "acc_norm": 0.37037037037037035,
            "acc_norm_stderr": 0.02686949074481525
        },
        "harness|mmlu_college_mathematics|5": {
            "acc": 0.33,
            "acc_stderr": 0.04725815626252605,
            "acc_norm": 0.33,
            "acc_norm_stderr": 0.04725815626252605
        },
        "harness|mmlu_high_school_government_and_politics|5": {
            "acc": 0.44559585492227977,
            "acc_stderr": 0.0358701498607566,
            "acc_norm": 0.44559585492227977,
            "acc_norm_stderr": 0.0358701498607566
        },
        "harness|mmlu_econometrics|5": {
            "acc": 0.2719298245614035,
            "acc_stderr": 0.041857744240220575,
            "acc_norm": 0.2719298245614035,
            "acc_norm_stderr": 0.041857744240220575
        },
        "harness|mmlu_high_school_psychology|5": {
            "acc": 0.3798165137614679,
            "acc_stderr": 0.020808825617866244,
            "acc_norm": 0.3798165137614679,
            "acc_norm_stderr": 0.020808825617866244
        },
        "harness|mmlu_formal_logic|5": {
            "acc": 0.3492063492063492,
            "acc_stderr": 0.04263906892795132,
            "acc_norm": 0.3492063492063492,
            "acc_norm_stderr": 0.04263906892795132
        },
        "harness|mmlu_nutrition|5": {
            "acc": 0.4117647058823529,
            "acc_stderr": 0.02818059632825929,
            "acc_norm": 0.4117647058823529,
            "acc_norm_stderr": 0.02818059632825929
        },
        "harness|mmlu_business_ethics|5": {
            "acc": 0.42,
            "acc_stderr": 0.049604496374885836,
            "acc_norm": 0.42,
            "acc_norm_stderr": 0.049604496374885836
        },
        "harness|mmlu_international_law|5": {
            "acc": 0.5619834710743802,
            "acc_stderr": 0.045291468044357915,
            "acc_norm": 0.5619834710743802,
            "acc_norm_stderr": 0.045291468044357915
        },
        "harness|mmlu_astronomy|5": {
            "acc": 0.34868421052631576,
            "acc_stderr": 0.038781398887976125,
            "acc_norm": 0.34868421052631576,
            "acc_norm_stderr": 0.038781398887976125
        },
        "harness|mmlu_professional_psychology|5": {
            "acc": 0.3284313725490196,
            "acc_stderr": 0.018999707383162666,
            "acc_norm": 0.3284313725490196,
            "acc_norm_stderr": 0.018999707383162666
        },
        "harness|mmlu_professional_accounting|5": {
            "acc": 0.2730496453900709,
            "acc_stderr": 0.026577860943307857,
            "acc_norm": 0.2730496453900709,
            "acc_norm_stderr": 0.026577860943307857
        },
        "harness|mmlu_machine_learning|5": {
            "acc": 0.2767857142857143,
            "acc_stderr": 0.04246624336697627,
            "acc_norm": 0.2767857142857143,
            "acc_norm_stderr": 0.04246624336697627
        },
        "harness|mmlu_high_school_statistics|5": {
            "acc": 0.4074074074074074,
            "acc_stderr": 0.03350991604696043,
            "acc_norm": 0.4074074074074074,
            "acc_norm_stderr": 0.03350991604696043
        },
        "harness|mmlu_moral_scenarios|5": {
            "acc": 0.23910614525139665,
            "acc_stderr": 0.014265554192331149,
            "acc_norm": 0.23910614525139665,
            "acc_norm_stderr": 0.014265554192331149
        },
        "harness|mmlu_college_computer_science|5": {
            "acc": 0.33,
            "acc_stderr": 0.047258156262526045,
            "acc_norm": 0.33,
            "acc_norm_stderr": 0.047258156262526045
        },
        "harness|mmlu_high_school_computer_science|5": {
            "acc": 0.4,
            "acc_stderr": 0.04923659639173309,
            "acc_norm": 0.4,
            "acc_norm_stderr": 0.04923659639173309
        },
        "harness|mmlu_professional_medicine|5": {
            "acc": 0.4227941176470588,
            "acc_stderr": 0.030008562845003483,
            "acc_norm": 0.4227941176470588,
            "acc_norm_stderr": 0.030008562845003483
        },
        "harness|mmlu_security_studies|5": {
            "acc": 0.3469387755102041,
            "acc_stderr": 0.030472526026726492,
            "acc_norm": 0.3469387755102041,
            "acc_norm_stderr": 0.030472526026726492
        },
        "harness|mmlu_high_school_world_history|5": {
            "acc": 0.4177215189873418,
            "acc_stderr": 0.032103530322412685,
            "acc_norm": 0.4177215189873418,
            "acc_norm_stderr": 0.032103530322412685
        },
        "harness|mmlu_professional_law|5": {
            "acc": 0.3005215123859192,
            "acc_stderr": 0.011709918883039124,
            "acc_norm": 0.3005215123859192,
            "acc_norm_stderr": 0.011709918883039124
        },
        "harness|mmlu_high_school_us_history|5": {
            "acc": 0.3872549019607843,
            "acc_stderr": 0.03418931233833344,
            "acc_norm": 0.3872549019607843,
            "acc_norm_stderr": 0.03418931233833344
        },
        "harness|mmlu_high_school_european_history|5": {
            "acc": 0.43636363636363634,
            "acc_stderr": 0.03872592983524753,
            "acc_norm": 0.43636363636363634,
            "acc_norm_stderr": 0.03872592983524753
        },
        "harness|truthfulqa_mc|0": {
            "mc1": 0.3072215422276622,
            "mc1_stderr": 0.016150201321323002,
            "mc2": 0.4721418472000992,
            "mc2_stderr": 0.01626625866283201
        }
    },
    "versions": {
        "all": 0,
        "harness|arc_challenge|25": 0,
        "harness|hellaswag|10": 0,
        "harness|mmlu_world_religions|5": 1,
        "harness|mmlu_management|5": 1,
        "harness|mmlu_miscellaneous|5": 1,
        "harness|mmlu_anatomy|5": 1,
        "harness|mmlu_abstract_algebra|5": 1,
        "harness|mmlu_conceptual_physics|5": 1,
        "harness|mmlu_virology|5": 1,
        "harness|mmlu_philosophy|5": 1,
        "harness|mmlu_human_aging|5": 1,
        "harness|mmlu_human_sexuality|5": 1,
        "harness|mmlu_medical_genetics|5": 1,
        "harness|mmlu_high_school_geography|5": 1,
        "harness|mmlu_electrical_engineering|5": 1,
        "harness|mmlu_college_physics|5": 1,
        "harness|mmlu_high_school_microeconomics|5": 1,
        "harness|mmlu_high_school_macroeconomics|5": 1,
        "harness|mmlu_computer_security|5": 1,
        "harness|mmlu_global_facts|5": 1,
        "harness|mmlu_jurisprudence|5": 1,
        "harness|mmlu_high_school_chemistry|5": 1,
        "harness|mmlu_high_school_biology|5": 1,
        "harness|mmlu_marketing|5": 1,
        "harness|mmlu_clinical_knowledge|5": 1,
        "harness|mmlu_public_relations|5": 1,
        "harness|mmlu_high_school_mathematics|5": 1,
        "harness|mmlu_high_school_physics|5": 1,
        "harness|mmlu_sociology|5": 1,
        "harness|mmlu_college_medicine|5": 1,
        "harness|mmlu_elementary_mathematics|5": 1,
        "harness|mmlu_college_biology|5": 1,
        "harness|mmlu_college_chemistry|5": 1,
        "harness|mmlu_us_foreign_policy|5": 1,
        "harness|mmlu_moral_disputes|5": 1,
        "harness|mmlu_logical_fallacies|5": 1,
        "harness|mmlu_prehistory|5": 1,
        "harness|mmlu_college_mathematics|5": 1,
        "harness|mmlu_high_school_government_and_politics|5": 1,
        "harness|mmlu_econometrics|5": 1,
        "harness|mmlu_high_school_psychology|5": 1,
        "harness|mmlu_formal_logic|5": 1,
        "harness|mmlu_nutrition|5": 1,
        "harness|mmlu_business_ethics|5": 1,
        "harness|mmlu_international_law|5": 1,
        "harness|mmlu_astronomy|5": 1,
        "harness|mmlu_professional_psychology|5": 1,
        "harness|mmlu_professional_accounting|5": 1,
        "harness|mmlu_machine_learning|5": 1,
        "harness|mmlu_high_school_statistics|5": 1,
        "harness|mmlu_moral_scenarios|5": 1,
        "harness|mmlu_college_computer_science|5": 1,
        "harness|mmlu_high_school_computer_science|5": 1,
        "harness|mmlu_professional_medicine|5": 1,
        "harness|mmlu_security_studies|5": 1,
        "harness|mmlu_high_school_world_history|5": 1,
        "harness|mmlu_professional_law|5": 1,
        "harness|mmlu_high_school_us_history|5": 1,
        "harness|mmlu_high_school_european_history|5": 1,
        "harness|truthfulqa_mc|0": 0
    },
    "config_general": {
        "model_name": "BioMistral/BioMistral-7B",
        "model_sha": "9a11e1ffa817c211cbb52ee1fb312dc6b61b40a5",
        "model_dtype": "torch.float16",
        "lighteval_sha": "",
        "num_few_shot_default": 0,
        "num_fewshot_seeds": 1,
        "override_batch_size": 1,
        "max_samples": null
    }
}
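
Below is a minimal sketch of how a result file with this structure might be consumed, assuming a local copy saved under the name used here; the file path, variable names, and the unweighted-mean aggregation are illustrative and are not part of the leaderboard's own scoring.

# Illustrative only: load the result JSON above and summarize a few metrics.
import json

# Assumed local copy of the file shown above.
RESULT_PATH = "BioMistral_BioMistral-7B_result_2024-05-30 01_33_58.json"

with open(RESULT_PATH, encoding="utf-8") as f:
    data = json.load(f)

results = data["results"]

# Per-subject MMLU entries use keys of the form "harness|mmlu_<subject>|5".
mmlu_accs = [
    metrics["acc"]
    for task, metrics in results.items()
    if task.startswith("harness|mmlu_")
]

print(f"MMLU sub-tasks: {len(mmlu_accs)}")
print(f"Unweighted mean MMLU acc: {sum(mmlu_accs) / len(mmlu_accs):.4f}")

# ARC-Challenge, HellaSwag, and TruthfulQA report different metric names.
print("ARC-Challenge acc_norm:", results["harness|arc_challenge|25"]["acc_norm"])
print("HellaSwag acc_norm:", results["harness|hellaswag|10"]["acc_norm"])
print("TruthfulQA mc2:", results["harness|truthfulqa_mc|0"]["mc2"])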