{ "config_general": { "model_name": "TinyPixel/lima-test", "model_sha": "4d6a006c6341f29b11c02f19bf9535f51b4da1b5", "model_dtype": "torch.bfloat16", "lighteval_sha": "c8a907ca0dbabbcc3132b1b9d84d5c763d587820", "num_few_shot_default": 0, "num_fewshot_seeds": 1, "override_batch_size": 1, "max_samples": null, "job_id": "" }, "results": { "harness|arc:challenge|25": { "acc": 0.49146757679180886, "acc_stderr": 0.01460926316563219, "acc_norm": 0.5307167235494881, "acc_norm_stderr": 0.014583792546304037 }, "harness|hellaswag|10": { "acc": 0.5900219079864569, "acc_stderr": 0.004908241354310212, "acc_norm": 0.7887870942043418, "acc_norm_stderr": 0.004073349176133355 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.45925925925925926, "acc_stderr": 0.04304979692464242, "acc_norm": 0.45925925925925926, "acc_norm_stderr": 0.04304979692464242 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.3881578947368421, "acc_stderr": 0.03965842097512744, "acc_norm": 0.3881578947368421, "acc_norm_stderr": 0.03965842097512744 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.46037735849056605, "acc_stderr": 0.030676096599389184, "acc_norm": 0.46037735849056605, "acc_norm_stderr": 0.030676096599389184 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.4513888888888889, "acc_stderr": 0.04161402398403279, "acc_norm": 0.4513888888888889, "acc_norm_stderr": 0.04161402398403279 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.36, "acc_stderr": 0.048241815132442176, "acc_norm": 0.36, "acc_norm_stderr": 0.048241815132442176 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.32, "acc_stderr": 0.04688261722621505, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621505 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.42196531791907516, "acc_stderr": 0.037657466938651504, "acc_norm": 0.42196531791907516, "acc_norm_stderr": 0.037657466938651504 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.23529411764705882, "acc_stderr": 0.042207736591714534, "acc_norm": 0.23529411764705882, "acc_norm_stderr": 0.042207736591714534 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.61, "acc_stderr": 0.04902071300001975, "acc_norm": 0.61, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.43829787234042555, "acc_stderr": 0.032436186361081004, "acc_norm": 0.43829787234042555, "acc_norm_stderr": 0.032436186361081004 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2982456140350877, "acc_stderr": 0.04303684033537315, "acc_norm": 0.2982456140350877, "acc_norm_stderr": 0.04303684033537315 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.47586206896551725, "acc_stderr": 0.041618085035015295, "acc_norm": 0.47586206896551725, "acc_norm_stderr": 0.041618085035015295 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.25925925925925924, "acc_stderr": 0.022569897074918407, "acc_norm": 0.25925925925925924, "acc_norm_stderr": 0.022569897074918407 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.30952380952380953, "acc_stderr": 0.04134913018303316, "acc_norm": 0.30952380952380953, "acc_norm_stderr": 0.04134913018303316 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.4967741935483871, "acc_stderr": 0.02844341422643833, "acc_norm": 0.4967741935483871, "acc_norm_stderr": 0.02844341422643833 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.3399014778325123, "acc_stderr": 0.0333276906841079, "acc_norm": 0.3399014778325123, "acc_norm_stderr": 0.0333276906841079 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.39, "acc_stderr": 0.04902071300001974, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.6363636363636364, "acc_stderr": 0.03756335775187898, "acc_norm": 0.6363636363636364, "acc_norm_stderr": 0.03756335775187898 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.47474747474747475, "acc_stderr": 0.03557806245087314, "acc_norm": 0.47474747474747475, "acc_norm_stderr": 0.03557806245087314 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.6787564766839378, "acc_stderr": 0.033699508685490674, "acc_norm": 0.6787564766839378, "acc_norm_stderr": 0.033699508685490674 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.44871794871794873, "acc_stderr": 0.025217315184846482, "acc_norm": 0.44871794871794873, "acc_norm_stderr": 0.025217315184846482 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.28888888888888886, "acc_stderr": 0.02763490726417854, "acc_norm": 0.28888888888888886, "acc_norm_stderr": 0.02763490726417854 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.42436974789915966, "acc_stderr": 0.03210479051015776, "acc_norm": 0.42436974789915966, "acc_norm_stderr": 0.03210479051015776 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.2980132450331126, "acc_stderr": 0.03734535676787198, "acc_norm": 0.2980132450331126, "acc_norm_stderr": 0.03734535676787198 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.6275229357798165, "acc_stderr": 0.020728368457638497, "acc_norm": 0.6275229357798165, "acc_norm_stderr": 0.020728368457638497 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.2777777777777778, "acc_stderr": 0.03054674526495318, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.03054674526495318 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.5294117647058824, "acc_stderr": 0.03503235296367992, "acc_norm": 0.5294117647058824, "acc_norm_stderr": 0.03503235296367992 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.5949367088607594, "acc_stderr": 0.03195514741370671, "acc_norm": 0.5949367088607594, "acc_norm_stderr": 0.03195514741370671 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.5560538116591929, "acc_stderr": 0.03334625674242728, "acc_norm": 0.5560538116591929, "acc_norm_stderr": 0.03334625674242728 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.5725190839694656, "acc_stderr": 0.04338920305792401, "acc_norm": 0.5725190839694656, "acc_norm_stderr": 0.04338920305792401 }, "harness|hendrycksTest-international_law|5": { "acc": 0.628099173553719, "acc_stderr": 0.04412015806624504, "acc_norm": 0.628099173553719, "acc_norm_stderr": 0.04412015806624504 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.5370370370370371, "acc_stderr": 0.04820403072760628, "acc_norm": 0.5370370370370371, "acc_norm_stderr": 0.04820403072760628 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.5153374233128835, "acc_stderr": 0.039265223787088445, "acc_norm": 0.5153374233128835, "acc_norm_stderr": 0.039265223787088445 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.35714285714285715, "acc_stderr": 0.04547960999764376, "acc_norm": 0.35714285714285715, "acc_norm_stderr": 0.04547960999764376 }, "harness|hendrycksTest-management|5": { "acc": 0.5631067961165048, "acc_stderr": 0.049111471073657764, "acc_norm": 0.5631067961165048, "acc_norm_stderr": 0.049111471073657764 }, "harness|hendrycksTest-marketing|5": { "acc": 0.6965811965811965, "acc_stderr": 0.030118210106942638, "acc_norm": 0.6965811965811965, "acc_norm_stderr": 0.030118210106942638 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-miscellaneous|5": { "acc": 0.6462324393358876, "acc_stderr": 0.017098184708161903, "acc_norm": 0.6462324393358876, "acc_norm_stderr": 0.017098184708161903 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.49421965317919075, "acc_stderr": 0.026917296179149116, "acc_norm": 0.49421965317919075, "acc_norm_stderr": 0.026917296179149116 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.23798882681564246, "acc_stderr": 0.014242630070574915, "acc_norm": 0.23798882681564246, "acc_norm_stderr": 0.014242630070574915 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.4869281045751634, "acc_stderr": 0.028620130800700246, "acc_norm": 0.4869281045751634, "acc_norm_stderr": 0.028620130800700246 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.6012861736334405, "acc_stderr": 0.0278093225857745, "acc_norm": 0.6012861736334405, "acc_norm_stderr": 0.0278093225857745 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.5092592592592593, "acc_stderr": 0.027815973433878014, "acc_norm": 0.5092592592592593, "acc_norm_stderr": 0.027815973433878014 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.35815602836879434, "acc_stderr": 0.028602085862759422, "acc_norm": 0.35815602836879434, "acc_norm_stderr": 0.028602085862759422 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.36766623207301175, "acc_stderr": 0.012314845910071695, "acc_norm": 0.36766623207301175, "acc_norm_stderr": 0.012314845910071695 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.5220588235294118, "acc_stderr": 0.030343264224213535, "acc_norm": 0.5220588235294118, "acc_norm_stderr": 0.030343264224213535 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.4542483660130719, "acc_stderr": 0.02014297455379519, "acc_norm": 0.4542483660130719, "acc_norm_stderr": 0.02014297455379519 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.5636363636363636, "acc_stderr": 0.04750185058907296, "acc_norm": 0.5636363636363636, "acc_norm_stderr": 0.04750185058907296 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.45714285714285713, "acc_stderr": 0.031891418324213966, "acc_norm": 0.45714285714285713, "acc_norm_stderr": 0.031891418324213966 }, "harness|hendrycksTest-sociology|5": { "acc": 0.6318407960199005, "acc_stderr": 0.03410410565495301, "acc_norm": 0.6318407960199005, "acc_norm_stderr": 0.03410410565495301 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.64, "acc_stderr": 0.04824181513244218, "acc_norm": 0.64, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-virology|5": { "acc": 0.4036144578313253, "acc_stderr": 0.038194861407583984, "acc_norm": 0.4036144578313253, "acc_norm_stderr": 0.038194861407583984 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.7134502923976608, "acc_stderr": 0.03467826685703826, "acc_norm": 0.7134502923976608, "acc_norm_stderr": 0.03467826685703826 }, "harness|truthfulqa:mc|0": { "mc1": 0.2533659730722154, "mc1_stderr": 0.015225899340826842, "mc2": 0.3939743360593484, "mc2_stderr": 0.013599478672319854 }, "all": { "acc": 0.4667611741672404, "acc_stderr": 0.03523325461503941, "acc_norm": 0.47079531540411435, "acc_norm_stderr": 0.035218672194742714, "mc1": 0.2533659730722154, "mc1_stderr": 0.015225899340826842, "mc2": 0.3939743360593484, "mc2_stderr": 0.013599478672319854 } }, "versions": { "harness|arc:challenge|25": 0, "harness|hellaswag|10": 0, "harness|hendrycksTest-abstract_algebra|5": 1, "harness|hendrycksTest-anatomy|5": 1, "harness|hendrycksTest-astronomy|5": 1, "harness|hendrycksTest-business_ethics|5": 1, "harness|hendrycksTest-clinical_knowledge|5": 1, "harness|hendrycksTest-college_biology|5": 1, "harness|hendrycksTest-college_chemistry|5": 1, "harness|hendrycksTest-college_computer_science|5": 1, "harness|hendrycksTest-college_mathematics|5": 1, "harness|hendrycksTest-college_medicine|5": 1, "harness|hendrycksTest-college_physics|5": 1, "harness|hendrycksTest-computer_security|5": 1, "harness|hendrycksTest-conceptual_physics|5": 1, "harness|hendrycksTest-econometrics|5": 1, "harness|hendrycksTest-electrical_engineering|5": 1, "harness|hendrycksTest-elementary_mathematics|5": 1, "harness|hendrycksTest-formal_logic|5": 1, "harness|hendrycksTest-global_facts|5": 1, "harness|hendrycksTest-high_school_biology|5": 1, "harness|hendrycksTest-high_school_chemistry|5": 1, "harness|hendrycksTest-high_school_computer_science|5": 1, "harness|hendrycksTest-high_school_european_history|5": 1, "harness|hendrycksTest-high_school_geography|5": 1, "harness|hendrycksTest-high_school_government_and_politics|5": 1, "harness|hendrycksTest-high_school_macroeconomics|5": 1, "harness|hendrycksTest-high_school_mathematics|5": 1, "harness|hendrycksTest-high_school_microeconomics|5": 1, "harness|hendrycksTest-high_school_physics|5": 1, "harness|hendrycksTest-high_school_psychology|5": 1, "harness|hendrycksTest-high_school_statistics|5": 1, "harness|hendrycksTest-high_school_us_history|5": 1, "harness|hendrycksTest-high_school_world_history|5": 1, "harness|hendrycksTest-human_aging|5": 1, "harness|hendrycksTest-human_sexuality|5": 1, "harness|hendrycksTest-international_law|5": 1, "harness|hendrycksTest-jurisprudence|5": 1, "harness|hendrycksTest-logical_fallacies|5": 1, "harness|hendrycksTest-machine_learning|5": 1, "harness|hendrycksTest-management|5": 1, "harness|hendrycksTest-marketing|5": 1, "harness|hendrycksTest-medical_genetics|5": 1, "harness|hendrycksTest-miscellaneous|5": 1, "harness|hendrycksTest-moral_disputes|5": 1, "harness|hendrycksTest-moral_scenarios|5": 1, "harness|hendrycksTest-nutrition|5": 1, "harness|hendrycksTest-philosophy|5": 1, "harness|hendrycksTest-prehistory|5": 1, "harness|hendrycksTest-professional_accounting|5": 1, "harness|hendrycksTest-professional_law|5": 1, "harness|hendrycksTest-professional_medicine|5": 1, "harness|hendrycksTest-professional_psychology|5": 1, "harness|hendrycksTest-public_relations|5": 1, "harness|hendrycksTest-security_studies|5": 1, "harness|hendrycksTest-sociology|5": 1, "harness|hendrycksTest-us_foreign_policy|5": 1, "harness|hendrycksTest-virology|5": 1, "harness|hendrycksTest-world_religions|5": 1, "harness|truthfulqa:mc|0": 1, "all": 0 }, "config_tasks": { "harness|arc:challenge": "LM Harness task", "harness|hellaswag": "LM Harness task", "harness|hendrycksTest-abstract_algebra": "LM Harness task", "harness|hendrycksTest-anatomy": "LM Harness task", "harness|hendrycksTest-astronomy": "LM Harness task", "harness|hendrycksTest-business_ethics": "LM Harness task", "harness|hendrycksTest-clinical_knowledge": "LM Harness task", "harness|hendrycksTest-college_biology": "LM Harness task", "harness|hendrycksTest-college_chemistry": "LM Harness task", "harness|hendrycksTest-college_computer_science": "LM Harness task", "harness|hendrycksTest-college_mathematics": "LM Harness task", "harness|hendrycksTest-college_medicine": "LM Harness task", "harness|hendrycksTest-college_physics": "LM Harness task", "harness|hendrycksTest-computer_security": "LM Harness task", "harness|hendrycksTest-conceptual_physics": "LM Harness task", "harness|hendrycksTest-econometrics": "LM Harness task", "harness|hendrycksTest-electrical_engineering": "LM Harness task", "harness|hendrycksTest-elementary_mathematics": "LM Harness task", "harness|hendrycksTest-formal_logic": "LM Harness task", "harness|hendrycksTest-global_facts": "LM Harness task", "harness|hendrycksTest-high_school_biology": "LM Harness task", "harness|hendrycksTest-high_school_chemistry": "LM Harness task", "harness|hendrycksTest-high_school_computer_science": "LM Harness task", "harness|hendrycksTest-high_school_european_history": "LM Harness task", "harness|hendrycksTest-high_school_geography": "LM Harness task", "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", "harness|hendrycksTest-high_school_mathematics": "LM Harness task", "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", "harness|hendrycksTest-high_school_physics": "LM Harness task", "harness|hendrycksTest-high_school_psychology": "LM Harness task", "harness|hendrycksTest-high_school_statistics": "LM Harness task", "harness|hendrycksTest-high_school_us_history": "LM Harness task", "harness|hendrycksTest-high_school_world_history": "LM Harness task", "harness|hendrycksTest-human_aging": "LM Harness task", "harness|hendrycksTest-human_sexuality": "LM Harness task", "harness|hendrycksTest-international_law": "LM Harness task", "harness|hendrycksTest-jurisprudence": "LM Harness task", "harness|hendrycksTest-logical_fallacies": "LM Harness task", "harness|hendrycksTest-machine_learning": "LM Harness task", "harness|hendrycksTest-management": "LM Harness task", "harness|hendrycksTest-marketing": "LM Harness task", "harness|hendrycksTest-medical_genetics": "LM Harness task", "harness|hendrycksTest-miscellaneous": "LM Harness task", "harness|hendrycksTest-moral_disputes": "LM Harness task", "harness|hendrycksTest-moral_scenarios": "LM Harness task", "harness|hendrycksTest-nutrition": "LM Harness task", "harness|hendrycksTest-philosophy": "LM Harness task", "harness|hendrycksTest-prehistory": "LM Harness task", "harness|hendrycksTest-professional_accounting": "LM Harness task", "harness|hendrycksTest-professional_law": "LM Harness task", "harness|hendrycksTest-professional_medicine": "LM Harness task", "harness|hendrycksTest-professional_psychology": "LM Harness task", "harness|hendrycksTest-public_relations": "LM Harness task", "harness|hendrycksTest-security_studies": "LM Harness task", "harness|hendrycksTest-sociology": "LM Harness task", "harness|hendrycksTest-us_foreign_policy": "LM Harness task", "harness|hendrycksTest-virology": "LM Harness task", "harness|hendrycksTest-world_religions": "LM Harness task", "harness|truthfulqa:mc": "LM Harness task" }, "summary_tasks": { "harness|arc:challenge|25": { "hashes": { "hash_examples": "17b0cae357c0259e", "hash_full_prompts": "045cbb916e5145c6", "hash_input_tokens": "61571bf68d6d89aa", "hash_cont_tokens": "ede2b335438f08e9" }, "truncated": 0, "non-truncated": 4687, "padded": 4687, "non-padded": 0, "effective_few_shots": 25.0, "num_truncated_few_shots": 0 }, "harness|hellaswag|10": { "hashes": { "hash_examples": "e1768ecb99d7ecf0", "hash_full_prompts": "0b4c16983130f84f", "hash_input_tokens": "29906669b1c7054a", "hash_cont_tokens": "b41cf1ad182d68d5" }, "truncated": 0, "non-truncated": 40168, "padded": 40113, "non-padded": 55, "effective_few_shots": 10.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-abstract_algebra|5": { "hashes": { "hash_examples": "280f9f325b40559a", "hash_full_prompts": "2f776a367d23aea2", "hash_input_tokens": "c54ff61ad0273dd7", "hash_cont_tokens": "50421e30bef398f9" }, "truncated": 0, "non-truncated": 400, "padded": 400, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-anatomy|5": { "hashes": { "hash_examples": "2f83a4f1cab4ba18", "hash_full_prompts": "516f74bef25df620", "hash_input_tokens": "be31a1e22aef5f90", "hash_cont_tokens": "f11971a765cb609f" }, "truncated": 0, "non-truncated": 540, "padded": 540, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-astronomy|5": { "hashes": { "hash_examples": "7d587b908da4d762", "hash_full_prompts": "faf4e80f65de93ca", "hash_input_tokens": "277a7b1fad566940", "hash_cont_tokens": "238bd86950544b29" }, "truncated": 0, "non-truncated": 608, "padded": 608, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-business_ethics|5": { "hashes": { "hash_examples": "33e51740670de686", "hash_full_prompts": "db01c3ef8e1479d4", "hash_input_tokens": "ba552605bc116de5", "hash_cont_tokens": "f9d6d2a7d7e9a041" }, "truncated": 0, "non-truncated": 400, "padded": 400, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-clinical_knowledge|5": { "hashes": { "hash_examples": "f3366dbe7eefffa4", "hash_full_prompts": "49654f71d94b65c3", "hash_input_tokens": "428c7563d0b98ab9", "hash_cont_tokens": "6af58623d0d5fbcd" }, "truncated": 0, "non-truncated": 1060, "padded": 1060, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-college_biology|5": { "hashes": { "hash_examples": "ca2b6753a0193e7f", "hash_full_prompts": "2b460b75f1fdfefd", "hash_input_tokens": "da036601573942e2", "hash_cont_tokens": "875cde3af7a0ee14" }, "truncated": 0, "non-truncated": 576, "padded": 576, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-college_chemistry|5": { "hashes": { "hash_examples": "22ff85f1d34f42d1", "hash_full_prompts": "242c9be6da583e95", "hash_input_tokens": "94e0196d6aded13d", "hash_cont_tokens": "50421e30bef398f9" }, "truncated": 0, "non-truncated": 400, "padded": 400, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-college_computer_science|5": { "hashes": { "hash_examples": "30318289d717a5cf", "hash_full_prompts": "ed2bdb4e87c4b371", "hash_input_tokens": "6e4d0f4a8d36690b", "hash_cont_tokens": "1ba0c71186b1505e" }, "truncated": 0, "non-truncated": 400, "padded": 400, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-college_mathematics|5": { "hashes": { "hash_examples": "4944d1f0b6b5d911", "hash_full_prompts": "770bc4281c973190", "hash_input_tokens": "614054d17109a25d", "hash_cont_tokens": "50421e30bef398f9" }, "truncated": 0, "non-truncated": 400, "padded": 400, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-college_medicine|5": { "hashes": { "hash_examples": "dd69cc33381275af", "hash_full_prompts": "ad2a53e5250ab46e", "hash_input_tokens": "1d633b3cc0524ba8", "hash_cont_tokens": "702fb6d82ff0d6ac" }, "truncated": 0, "non-truncated": 692, "padded": 692, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-college_physics|5": { "hashes": { "hash_examples": "875dd26d22655b0d", "hash_full_prompts": "833a0d7b55aed500", "hash_input_tokens": "5421d9a1af86cbd4", "hash_cont_tokens": "f7b8097afc16a47c" }, "truncated": 0, "non-truncated": 408, "padded": 408, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-computer_security|5": { "hashes": { "hash_examples": "006451eedc0ededb", "hash_full_prompts": "94034c97e85d8f46", "hash_input_tokens": "5e6b70ecb333cf18", "hash_cont_tokens": "50421e30bef398f9" }, "truncated": 0, "non-truncated": 400, "padded": 400, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-conceptual_physics|5": { "hashes": { "hash_examples": "8874ece872d2ca4c", "hash_full_prompts": "e40d15a34640d6fa", "hash_input_tokens": "c2ef11a87264ceed", "hash_cont_tokens": "aa0e8bc655f2f641" }, "truncated": 0, "non-truncated": 940, "padded": 940, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-econometrics|5": { "hashes": { "hash_examples": "64d3623b0bfaa43f", "hash_full_prompts": "612f340fae41338d", "hash_input_tokens": "ecaccd912a4c3978", "hash_cont_tokens": "a9b1f761089f6acc" }, "truncated": 0, "non-truncated": 456, "padded": 456, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-electrical_engineering|5": { "hashes": { "hash_examples": "e98f51780c674d7e", "hash_full_prompts": "10275b312d812ae6", "hash_input_tokens": "1590c84291399be8", "hash_cont_tokens": "2425a3f084a591ef" }, "truncated": 0, "non-truncated": 580, "padded": 580, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-elementary_mathematics|5": { "hashes": { "hash_examples": "fc48208a5ac1c0ce", "hash_full_prompts": "5ec274c6c82aca23", "hash_input_tokens": "3269597f715b0da1", "hash_cont_tokens": "eb2d5002052b5bc5" }, "truncated": 0, "non-truncated": 1512, "padded": 1512, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-formal_logic|5": { "hashes": { "hash_examples": "5a6525665f63ea72", "hash_full_prompts": "07b92638c4a6b500", "hash_input_tokens": "a2800d20f3ab8d7c", "hash_cont_tokens": "9b30dc19c9b62f60" }, "truncated": 0, "non-truncated": 504, "padded": 504, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-global_facts|5": { "hashes": { "hash_examples": "371d70d743b2b89b", "hash_full_prompts": "332fdee50a1921b4", "hash_input_tokens": "94ed44b3772505ad", "hash_cont_tokens": "50421e30bef398f9" }, "truncated": 0, "non-truncated": 400, "padded": 400, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-high_school_biology|5": { "hashes": { "hash_examples": "a79e1018b1674052", "hash_full_prompts": "e624e26ede922561", "hash_input_tokens": "24423acb928db768", "hash_cont_tokens": "74217a4e2868536f" }, "truncated": 0, "non-truncated": 1240, "padded": 1240, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-high_school_chemistry|5": { "hashes": { "hash_examples": "44bfc25c389f0e03", "hash_full_prompts": "0e3e5f5d9246482a", "hash_input_tokens": "831ff35c474e5cef", "hash_cont_tokens": "bf39544be0ebf000" }, "truncated": 0, "non-truncated": 812, "padded": 812, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-high_school_computer_science|5": { "hashes": { "hash_examples": "8b8cdb1084f24169", "hash_full_prompts": "c00487e67c1813cc", "hash_input_tokens": "8c34e0f2bda77358", "hash_cont_tokens": "43570b3948564b64" }, "truncated": 0, "non-truncated": 400, "padded": 400, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-high_school_european_history|5": { "hashes": { "hash_examples": "11cd32d0ef440171", "hash_full_prompts": "318f4513c537c6bf", "hash_input_tokens": "f1f73dd687da18d7", "hash_cont_tokens": "674fc454bdc5ac93" }, "truncated": 660, "non-truncated": 0, "padded": 0, "non-padded": 660, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-high_school_geography|5": { "hashes": { "hash_examples": "b60019b9e80b642f", "hash_full_prompts": "ee5789fcc1a81b1e", "hash_input_tokens": "7c5547c7da5bc793", "hash_cont_tokens": "03a5012b916274ea" }, "truncated": 0, "non-truncated": 792, "padded": 792, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "hashes": { "hash_examples": "d221ec983d143dc3", "hash_full_prompts": "ac42d888e1ce1155", "hash_input_tokens": "f62991cb6a496b05", "hash_cont_tokens": "50ab225c2f535210" }, "truncated": 0, "non-truncated": 772, "padded": 772, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "hashes": { "hash_examples": "59c2915cacfd3fbb", "hash_full_prompts": "c6bd9d25158abd0e", "hash_input_tokens": "4cef2aff6e3d59ed", "hash_cont_tokens": "c583432ad27fcfe0" }, "truncated": 0, "non-truncated": 1560, "padded": 1560, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-high_school_mathematics|5": { "hashes": { "hash_examples": "1f8ac897608de342", "hash_full_prompts": "5d88f41fc2d643a8", "hash_input_tokens": "6e2577ea4082ed2b", "hash_cont_tokens": "1194078d4e38c984" }, "truncated": 0, "non-truncated": 1080, "padded": 1080, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-high_school_microeconomics|5": { "hashes": { "hash_examples": "ead6a0f2f6c83370", "hash_full_prompts": "bfc393381298609e", "hash_input_tokens": "c5fc9aeb1079c8e4", "hash_cont_tokens": "f47f041de50333b9" }, "truncated": 0, "non-truncated": 952, "padded": 952, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-high_school_physics|5": { "hashes": { "hash_examples": "c3f2025990afec64", "hash_full_prompts": "fc78b4997e436734", "hash_input_tokens": "555fc385cffa84ca", "hash_cont_tokens": "6296151cf7fee15c" }, "truncated": 0, "non-truncated": 604, "padded": 604, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-high_school_psychology|5": { "hashes": { "hash_examples": "21f8aab618f6d636", "hash_full_prompts": "d5c76aa40b9dbc43", "hash_input_tokens": "febd23cbf9973b7f", "hash_cont_tokens": "a490d3db0ea5935a" }, "truncated": 0, "non-truncated": 2180, "padded": 2180, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-high_school_statistics|5": { "hashes": { "hash_examples": "2386a60a11fc5de3", "hash_full_prompts": "4c5c8be5aafac432", "hash_input_tokens": "424b02981230ee83", "hash_cont_tokens": "6830ef7d0325d7ef" }, "truncated": 0, "non-truncated": 864, "padded": 864, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-high_school_us_history|5": { "hashes": { "hash_examples": "74961543be40f04f", "hash_full_prompts": "5d5ca4840131ba21", "hash_input_tokens": "50c9ff438c85a69e", "hash_cont_tokens": "cdd0b3dc06d933e5" }, "truncated": 816, "non-truncated": 0, "padded": 0, "non-padded": 816, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-high_school_world_history|5": { "hashes": { "hash_examples": "2ad2f6b7198b2234", "hash_full_prompts": "11845057459afd72", "hash_input_tokens": "054824cc474caef5", "hash_cont_tokens": "e0203e3fc1bb0500" }, "truncated": 8, "non-truncated": 940, "padded": 940, "non-padded": 8, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-human_aging|5": { "hashes": { "hash_examples": "1a7199dc733e779b", "hash_full_prompts": "756b9096b8eaf892", "hash_input_tokens": "541a75f071dcf579", "hash_cont_tokens": "142a4a8a1138a214" }, "truncated": 0, "non-truncated": 892, "padded": 892, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-human_sexuality|5": { "hashes": { "hash_examples": "7acb8fdad97f88a6", "hash_full_prompts": "731a52ff15b8cfdb", "hash_input_tokens": "04269e5c5a257dd9", "hash_cont_tokens": "bc54813e809b796d" }, "truncated": 0, "non-truncated": 524, "padded": 524, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-international_law|5": { "hashes": { "hash_examples": "1300bfd0dfc59114", "hash_full_prompts": "db2aefbff5eec996", "hash_input_tokens": "d93ba9d9d38e4397", "hash_cont_tokens": "63435df622d5437b" }, "truncated": 0, "non-truncated": 484, "padded": 484, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-jurisprudence|5": { "hashes": { "hash_examples": "083b1e4904c48dc2", "hash_full_prompts": "0f89ee3fe03d6a21", "hash_input_tokens": "9eeaccd2698b4f5a", "hash_cont_tokens": "e3a8cd951b6e3469" }, "truncated": 0, "non-truncated": 432, "padded": 432, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-logical_fallacies|5": { "hashes": { "hash_examples": "709128f9926a634c", "hash_full_prompts": "98a04b1f8f841069", "hash_input_tokens": "b4f08f544f2b7576", "hash_cont_tokens": "5e6ee2ff0404f23c" }, "truncated": 0, "non-truncated": 652, "padded": 648, "non-padded": 4, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-machine_learning|5": { "hashes": { "hash_examples": "88f22a636029ae47", "hash_full_prompts": "2e1c8d4b1e0cc921", "hash_input_tokens": "900c2a51f1174b9f", "hash_cont_tokens": "c81919424db3b267" }, "truncated": 0, "non-truncated": 448, "padded": 448, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-management|5": { "hashes": { "hash_examples": "8c8a1e07a2151dca", "hash_full_prompts": "f51611f514b265b0", "hash_input_tokens": "6b36efb4689c6eca", "hash_cont_tokens": "a01d6d39a83c4597" }, "truncated": 0, "non-truncated": 412, "padded": 412, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-marketing|5": { "hashes": { "hash_examples": "2668953431f91e96", "hash_full_prompts": "77562bef997c7650", "hash_input_tokens": "2aaac78a0cfed47a", "hash_cont_tokens": "6aeaed4d823c98aa" }, "truncated": 0, "non-truncated": 936, "padded": 936, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-medical_genetics|5": { "hashes": { "hash_examples": "9c2dda34a2ea4fd2", "hash_full_prompts": "202139046daa118f", "hash_input_tokens": "886ca823b41c094a", "hash_cont_tokens": "50421e30bef398f9" }, "truncated": 0, "non-truncated": 400, "padded": 400, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-miscellaneous|5": { "hashes": { "hash_examples": "41adb694024809c2", "hash_full_prompts": "bffec9fc237bcf93", "hash_input_tokens": "72fd71de7675e7d0", "hash_cont_tokens": "9b0ab02a64603081" }, "truncated": 0, "non-truncated": 3132, "padded": 3132, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-moral_disputes|5": { "hashes": { "hash_examples": "3171c13ba3c594c4", "hash_full_prompts": "170831fc36f1d59e", "hash_input_tokens": "f3ca0dd8e7a1eb09", "hash_cont_tokens": "3b8bbe9108e55ce9" }, "truncated": 0, "non-truncated": 1384, "padded": 1354, "non-padded": 30, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-moral_scenarios|5": { "hashes": { "hash_examples": "9873e077e83e0546", "hash_full_prompts": "08f4ceba3131a068", "hash_input_tokens": "3e793631e951f23c", "hash_cont_tokens": "2eae753a177d5460" }, "truncated": 0, "non-truncated": 3580, "padded": 3580, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-nutrition|5": { "hashes": { "hash_examples": "7db1d8142ec14323", "hash_full_prompts": "4c0e68e3586cb453", "hash_input_tokens": "59753c2144ea93af", "hash_cont_tokens": "29771089bd3c65c6" }, "truncated": 0, "non-truncated": 1224, "padded": 1224, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-philosophy|5": { "hashes": { "hash_examples": "9b455b7d72811cc8", "hash_full_prompts": "e467f822d8a0d3ff", "hash_input_tokens": "bd8d3dbed15a8c34", "hash_cont_tokens": "9f6ff69d23a48783" }, "truncated": 0, "non-truncated": 1244, "padded": 1244, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-prehistory|5": { "hashes": { "hash_examples": "8be90d0f538f1560", "hash_full_prompts": "152187949bcd0921", "hash_input_tokens": "3573cd87facbb7c5", "hash_cont_tokens": "a789a13af22308bf" }, "truncated": 0, "non-truncated": 1296, "padded": 1296, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-professional_accounting|5": { "hashes": { "hash_examples": "8d377597916cd07e", "hash_full_prompts": "0eb7345d6144ee0d", "hash_input_tokens": "17e721bc1a7cbb47", "hash_cont_tokens": "5129a9cfb30c5239" }, "truncated": 0, "non-truncated": 1128, "padded": 1128, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-professional_law|5": { "hashes": { "hash_examples": "cd9dbc52b3c932d6", "hash_full_prompts": "36ac764272bfb182", "hash_input_tokens": "9178e10bd0763ec4", "hash_cont_tokens": "2e590029ef41fbcd" }, "truncated": 604, "non-truncated": 5532, "padded": 5524, "non-padded": 612, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-professional_medicine|5": { "hashes": { "hash_examples": "b20e4e816c1e383e", "hash_full_prompts": "7b8d69ea2acaf2f7", "hash_input_tokens": "f5a22012a54f70ea", "hash_cont_tokens": "cd82e108370cece8" }, "truncated": 0, "non-truncated": 1088, "padded": 1088, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-professional_psychology|5": { "hashes": { "hash_examples": "d45b73b22f9cc039", "hash_full_prompts": "fe8937e9ffc99771", "hash_input_tokens": "0dfb73a8eb3f692c", "hash_cont_tokens": "61ef0c8a87f9c92d" }, "truncated": 0, "non-truncated": 2448, "padded": 2448, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-public_relations|5": { "hashes": { "hash_examples": "0d25072e1761652a", "hash_full_prompts": "f9adc39cfa9f42ba", "hash_input_tokens": "1710c6ba4c9f3cbd", "hash_cont_tokens": "568f585a259965c1" }, "truncated": 0, "non-truncated": 440, "padded": 440, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-security_studies|5": { "hashes": { "hash_examples": "62bb8197e63d60d4", "hash_full_prompts": "869c9c3ae196b7c3", "hash_input_tokens": "d49711415961ced7", "hash_cont_tokens": "d70cfe096d4fb7bd" }, "truncated": 0, "non-truncated": 980, "padded": 980, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-sociology|5": { "hashes": { "hash_examples": "e7959df87dea8672", "hash_full_prompts": "1a1fc00e17b3a52a", "hash_input_tokens": "828999f7624cbe7e", "hash_cont_tokens": "c3a3bdfd177eed5b" }, "truncated": 0, "non-truncated": 804, "padded": 804, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-us_foreign_policy|5": { "hashes": { "hash_examples": "4a56a01ddca44dca", "hash_full_prompts": "0c7a7081c71c07b6", "hash_input_tokens": "42054621e718dbee", "hash_cont_tokens": "2568d0e8e36fa959" }, "truncated": 0, "non-truncated": 400, "padded": 400, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-virology|5": { "hashes": { "hash_examples": "451cc86a8c4f4fe9", "hash_full_prompts": "01e95325d8b738e4", "hash_input_tokens": "6c4f0aa4dc859c04", "hash_cont_tokens": "c178cccd753d9bc5" }, "truncated": 0, "non-truncated": 664, "padded": 664, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|hendrycksTest-world_religions|5": { "hashes": { "hash_examples": "3b29cfaf1a81c379", "hash_full_prompts": "e0d79a15083dfdff", "hash_input_tokens": "6c75d44e092ff24f", "hash_cont_tokens": "0a3a3ea5ef49d19c" }, "truncated": 0, "non-truncated": 684, "padded": 684, "non-padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "harness|truthfulqa:mc|0": { "hashes": { "hash_examples": "23176c0531c7b867", "hash_full_prompts": "36a6d90e75d92d4a", "hash_input_tokens": "2738d7ed7075faa7", "hash_cont_tokens": "6d1691881e252df0" }, "truncated": 0, "non-truncated": 9996, "padded": 9996, "non-padded": 0, "effective_few_shots": 0.0, "num_truncated_few_shots": 0 } }, "summary_general": { "hashes": { "hash_examples": "d84d18e9a963753d", "hash_full_prompts": "12b540783521a8e6", "hash_input_tokens": "6fecf578c508db6a", "hash_cont_tokens": "f4b7b7f3a2788768" }, "total_evaluation_time_secondes": "4756.200440645218", "truncated": 2088, "non-truncated": 108931, "padded": 108834, "non-padded": 2185, "num_truncated_few_shots": 0 } }