{ "results": { "daily": { "daily": 6 }, "quarterly": { "quarterly": 6 }, "harness|arc_challenge|25": { "acc": 0.42918088737201365, "acc_stderr": 0.014464085894870651, "acc_norm": 0.46501706484641636, "acc_norm_stderr": 0.014575583922019672 }, "harness|hellaswag|10": { "acc": 0.445628360884286, "acc_stderr": 0.004960191341430244, "acc_norm": 0.589523999203346, "acc_norm_stderr": 0.004909148239488273 }, "harness|mmlu_world_religions|5": { "acc": 0.6432748538011696, "acc_stderr": 0.03674013002860954, "acc_norm": 0.6432748538011696, "acc_norm_stderr": 0.03674013002860954 }, "harness|mmlu_management|5": { "acc": 0.6116504854368932, "acc_stderr": 0.04825729337356389, "acc_norm": 0.6116504854368932, "acc_norm_stderr": 0.04825729337356389 }, "harness|mmlu_miscellaneous|5": { "acc": 0.6155810983397191, "acc_stderr": 0.01739568874281962, "acc_norm": 0.6155810983397191, "acc_norm_stderr": 0.01739568874281962 }, "harness|mmlu_anatomy|5": { "acc": 0.4962962962962963, "acc_stderr": 0.04319223625811331, "acc_norm": 0.4962962962962963, "acc_norm_stderr": 0.04319223625811331 }, "harness|mmlu_abstract_algebra|5": { "acc": 0.26, "acc_stderr": 0.04408440022768077, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768077 }, "harness|mmlu_conceptual_physics|5": { "acc": 0.4553191489361702, "acc_stderr": 0.03255525359340354, "acc_norm": 0.4553191489361702, "acc_norm_stderr": 0.03255525359340354 }, "harness|mmlu_virology|5": { "acc": 0.5180722891566265, "acc_stderr": 0.038899512528272166, "acc_norm": 0.5180722891566265, "acc_norm_stderr": 0.038899512528272166 }, "harness|mmlu_philosophy|5": { "acc": 0.5755627009646302, "acc_stderr": 0.028071928247946205, "acc_norm": 0.5755627009646302, "acc_norm_stderr": 0.028071928247946205 }, "harness|mmlu_human_aging|5": { "acc": 0.5650224215246636, "acc_stderr": 0.033272833702713445, "acc_norm": 0.5650224215246636, "acc_norm_stderr": 0.033272833702713445 }, "harness|mmlu_human_sexuality|5": { "acc": 0.5877862595419847, "acc_stderr": 0.04317171194870255, "acc_norm": 0.5877862595419847, "acc_norm_stderr": 0.04317171194870255 }, "harness|mmlu_medical_genetics|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, "harness|mmlu_high_school_geography|5": { "acc": 0.6515151515151515, "acc_stderr": 0.033948539651564025, "acc_norm": 0.6515151515151515, "acc_norm_stderr": 0.033948539651564025 }, "harness|mmlu_electrical_engineering|5": { "acc": 0.503448275862069, "acc_stderr": 0.04166567577101579, "acc_norm": 0.503448275862069, "acc_norm_stderr": 0.04166567577101579 }, "harness|mmlu_college_physics|5": { "acc": 0.2549019607843137, "acc_stderr": 0.043364327079931785, "acc_norm": 0.2549019607843137, "acc_norm_stderr": 0.043364327079931785 }, "harness|mmlu_high_school_microeconomics|5": { "acc": 0.5756302521008403, "acc_stderr": 0.03210479051015776, "acc_norm": 0.5756302521008403, "acc_norm_stderr": 0.03210479051015776 }, "harness|mmlu_high_school_macroeconomics|5": { "acc": 0.541025641025641, "acc_stderr": 0.025265525491284295, "acc_norm": 0.541025641025641, "acc_norm_stderr": 0.025265525491284295 }, "harness|mmlu_computer_security|5": { "acc": 0.54, "acc_stderr": 0.05009082659620332, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620332 }, "harness|mmlu_global_facts|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|mmlu_jurisprudence|5": { "acc": 0.5555555555555556, "acc_stderr": 0.04803752235190192, "acc_norm": 0.5555555555555556, "acc_norm_stderr": 
0.04803752235190192 }, "harness|mmlu_high_school_chemistry|5": { "acc": 0.3842364532019704, "acc_stderr": 0.0342239856565755, "acc_norm": 0.3842364532019704, "acc_norm_stderr": 0.0342239856565755 }, "harness|mmlu_high_school_biology|5": { "acc": 0.5774193548387097, "acc_stderr": 0.02810096472427264, "acc_norm": 0.5774193548387097, "acc_norm_stderr": 0.02810096472427264 }, "harness|mmlu_marketing|5": { "acc": 0.7777777777777778, "acc_stderr": 0.027236013946196673, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.027236013946196673 }, "harness|mmlu_clinical_knowledge|5": { "acc": 0.4981132075471698, "acc_stderr": 0.030772653642075657, "acc_norm": 0.4981132075471698, "acc_norm_stderr": 0.030772653642075657 }, "harness|mmlu_public_relations|5": { "acc": 0.5272727272727272, "acc_stderr": 0.04782001791380061, "acc_norm": 0.5272727272727272, "acc_norm_stderr": 0.04782001791380061 }, "harness|mmlu_high_school_mathematics|5": { "acc": 0.25555555555555554, "acc_stderr": 0.026593939101844082, "acc_norm": 0.25555555555555554, "acc_norm_stderr": 0.026593939101844082 }, "harness|mmlu_high_school_physics|5": { "acc": 0.33774834437086093, "acc_stderr": 0.038615575462551684, "acc_norm": 0.33774834437086093, "acc_norm_stderr": 0.038615575462551684 }, "harness|mmlu_sociology|5": { "acc": 0.7064676616915423, "acc_stderr": 0.032200241045342054, "acc_norm": 0.7064676616915423, "acc_norm_stderr": 0.032200241045342054 }, "harness|mmlu_college_medicine|5": { "acc": 0.4797687861271676, "acc_stderr": 0.03809342081273958, "acc_norm": 0.4797687861271676, "acc_norm_stderr": 0.03809342081273958 }, "harness|mmlu_elementary_mathematics|5": { "acc": 0.38095238095238093, "acc_stderr": 0.025010749116137602, "acc_norm": 0.38095238095238093, "acc_norm_stderr": 0.025010749116137602 }, "harness|mmlu_college_biology|5": { "acc": 0.4236111111111111, "acc_stderr": 0.041321250197233685, "acc_norm": 0.4236111111111111, "acc_norm_stderr": 0.041321250197233685 }, "harness|mmlu_college_chemistry|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|mmlu_us_foreign_policy|5": { "acc": 0.71, "acc_stderr": 0.04560480215720683, "acc_norm": 0.71, "acc_norm_stderr": 0.04560480215720683 }, "harness|mmlu_moral_disputes|5": { "acc": 0.5751445086705202, "acc_stderr": 0.026613350840261733, "acc_norm": 0.5751445086705202, "acc_norm_stderr": 0.026613350840261733 }, "harness|mmlu_logical_fallacies|5": { "acc": 0.5030674846625767, "acc_stderr": 0.03928297078179662, "acc_norm": 0.5030674846625767, "acc_norm_stderr": 0.03928297078179662 }, "harness|mmlu_prehistory|5": { "acc": 0.5370370370370371, "acc_stderr": 0.027744313443376536, "acc_norm": 0.5370370370370371, "acc_norm_stderr": 0.027744313443376536 }, "harness|mmlu_college_mathematics|5": { "acc": 0.33, "acc_stderr": 0.04725815626252606, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252606 }, "harness|mmlu_high_school_government_and_politics|5": { "acc": 0.6217616580310881, "acc_stderr": 0.034998072761933376, "acc_norm": 0.6217616580310881, "acc_norm_stderr": 0.034998072761933376 }, "harness|mmlu_econometrics|5": { "acc": 0.37719298245614036, "acc_stderr": 0.04559522141958216, "acc_norm": 0.37719298245614036, "acc_norm_stderr": 0.04559522141958216 }, "harness|mmlu_high_school_psychology|5": { "acc": 0.6385321100917432, "acc_stderr": 0.02059808200993736, "acc_norm": 0.6385321100917432, "acc_norm_stderr": 0.02059808200993736 }, "harness|mmlu_formal_logic|5": { "acc": 0.4126984126984127, "acc_stderr": 0.04403438954768177, 
"acc_norm": 0.4126984126984127, "acc_norm_stderr": 0.04403438954768177 }, "harness|mmlu_nutrition|5": { "acc": 0.5261437908496732, "acc_stderr": 0.028590752958852387, "acc_norm": 0.5261437908496732, "acc_norm_stderr": 0.028590752958852387 }, "harness|mmlu_business_ethics|5": { "acc": 0.57, "acc_stderr": 0.049756985195624284, "acc_norm": 0.57, "acc_norm_stderr": 0.049756985195624284 }, "harness|mmlu_international_law|5": { "acc": 0.7520661157024794, "acc_stderr": 0.03941897526516304, "acc_norm": 0.7520661157024794, "acc_norm_stderr": 0.03941897526516304 }, "harness|mmlu_astronomy|5": { "acc": 0.5789473684210527, "acc_stderr": 0.040179012759817494, "acc_norm": 0.5789473684210527, "acc_norm_stderr": 0.040179012759817494 }, "harness|mmlu_professional_psychology|5": { "acc": 0.4738562091503268, "acc_stderr": 0.020200164564804588, "acc_norm": 0.4738562091503268, "acc_norm_stderr": 0.020200164564804588 }, "harness|mmlu_professional_accounting|5": { "acc": 0.3404255319148936, "acc_stderr": 0.02826765748265013, "acc_norm": 0.3404255319148936, "acc_norm_stderr": 0.02826765748265013 }, "harness|mmlu_machine_learning|5": { "acc": 0.38392857142857145, "acc_stderr": 0.046161430750285455, "acc_norm": 0.38392857142857145, "acc_norm_stderr": 0.046161430750285455 }, "harness|mmlu_high_school_statistics|5": { "acc": 0.4675925925925926, "acc_stderr": 0.03402801581358966, "acc_norm": 0.4675925925925926, "acc_norm_stderr": 0.03402801581358966 }, "harness|mmlu_moral_scenarios|5": { "acc": 0.21675977653631284, "acc_stderr": 0.013780598486443363, "acc_norm": 0.21675977653631284, "acc_norm_stderr": 0.013780598486443363 }, "harness|mmlu_college_computer_science|5": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "harness|mmlu_high_school_computer_science|5": { "acc": 0.71, "acc_stderr": 0.04560480215720684, "acc_norm": 0.71, "acc_norm_stderr": 0.04560480215720684 }, "harness|mmlu_professional_medicine|5": { "acc": 0.4411764705882353, "acc_stderr": 0.0301619119307671, "acc_norm": 0.4411764705882353, "acc_norm_stderr": 0.0301619119307671 }, "harness|mmlu_security_studies|5": { "acc": 0.6285714285714286, "acc_stderr": 0.03093285879278986, "acc_norm": 0.6285714285714286, "acc_norm_stderr": 0.03093285879278986 }, "harness|mmlu_high_school_world_history|5": { "acc": 0.70042194092827, "acc_stderr": 0.029818024749753095, "acc_norm": 0.70042194092827, "acc_norm_stderr": 0.029818024749753095 }, "harness|mmlu_professional_law|5": { "acc": 0.378748370273794, "acc_stderr": 0.012389052105003741, "acc_norm": 0.378748370273794, "acc_norm_stderr": 0.012389052105003741 }, "harness|mmlu_high_school_us_history|5": { "acc": 0.6225490196078431, "acc_stderr": 0.03402272044340703, "acc_norm": 0.6225490196078431, "acc_norm_stderr": 0.03402272044340703 }, "harness|mmlu_high_school_european_history|5": { "acc": 0.6666666666666666, "acc_stderr": 0.03681050869161549, "acc_norm": 0.6666666666666666, "acc_norm_stderr": 0.03681050869161549 }, "harness|truthfulqa_mc|0": { "mc1": 0.33659730722154224, "mc1_stderr": 0.016542412809494877, "mc2": 0.49995145184296846, "mc2_stderr": 0.015887726098900913 } }, "versions": { "all": 0, "harness|arc_challenge|25": 0, "harness|hellaswag|10": 0, "harness|mmlu_world_religions|5": 1, "harness|mmlu_management|5": 1, "harness|mmlu_miscellaneous|5": 1, "harness|mmlu_anatomy|5": 1, "harness|mmlu_abstract_algebra|5": 1, "harness|mmlu_conceptual_physics|5": 1, "harness|mmlu_virology|5": 1, "harness|mmlu_philosophy|5": 1, "harness|mmlu_human_aging|5": 1, 
"harness|mmlu_human_sexuality|5": 1, "harness|mmlu_medical_genetics|5": 1, "harness|mmlu_high_school_geography|5": 1, "harness|mmlu_electrical_engineering|5": 1, "harness|mmlu_college_physics|5": 1, "harness|mmlu_high_school_microeconomics|5": 1, "harness|mmlu_high_school_macroeconomics|5": 1, "harness|mmlu_computer_security|5": 1, "harness|mmlu_global_facts|5": 1, "harness|mmlu_jurisprudence|5": 1, "harness|mmlu_high_school_chemistry|5": 1, "harness|mmlu_high_school_biology|5": 1, "harness|mmlu_marketing|5": 1, "harness|mmlu_clinical_knowledge|5": 1, "harness|mmlu_public_relations|5": 1, "harness|mmlu_high_school_mathematics|5": 1, "harness|mmlu_high_school_physics|5": 1, "harness|mmlu_sociology|5": 1, "harness|mmlu_college_medicine|5": 1, "harness|mmlu_elementary_mathematics|5": 1, "harness|mmlu_college_biology|5": 1, "harness|mmlu_college_chemistry|5": 1, "harness|mmlu_us_foreign_policy|5": 1, "harness|mmlu_moral_disputes|5": 1, "harness|mmlu_logical_fallacies|5": 1, "harness|mmlu_prehistory|5": 1, "harness|mmlu_college_mathematics|5": 1, "harness|mmlu_high_school_government_and_politics|5": 1, "harness|mmlu_econometrics|5": 1, "harness|mmlu_high_school_psychology|5": 1, "harness|mmlu_formal_logic|5": 1, "harness|mmlu_nutrition|5": 1, "harness|mmlu_business_ethics|5": 1, "harness|mmlu_international_law|5": 1, "harness|mmlu_astronomy|5": 1, "harness|mmlu_professional_psychology|5": 1, "harness|mmlu_professional_accounting|5": 1, "harness|mmlu_machine_learning|5": 1, "harness|mmlu_high_school_statistics|5": 1, "harness|mmlu_moral_scenarios|5": 1, "harness|mmlu_college_computer_science|5": 1, "harness|mmlu_high_school_computer_science|5": 1, "harness|mmlu_professional_medicine|5": 1, "harness|mmlu_security_studies|5": 1, "harness|mmlu_high_school_world_history|5": 1, "harness|mmlu_professional_law|5": 1, "harness|mmlu_high_school_us_history|5": 1, "harness|mmlu_high_school_european_history|5": 1, "harness|truthfulqa_mc|0": 0 }, "config_general": { "model_name": "nlpai-lab/KULLM3", "model_sha": "5a6bcd0fc7f240460eb6d57016f7b4060bc1f43b", "model_dtype": "torch.float16", "lighteval_sha": "", "num_few_shot_default": 0, "num_fewshot_seeds": 1, "override_batch_size": 1, "max_samples": null } }