File size: 5,758 Bytes
f43c1aa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
| Tasks |Version|Filter|n-shot|Metric|Value | |Stderr|
|---------------------------------------|-------|------|-----:|------|-----:|---|-----:|
|mmlu |N/A |none | 0|acc |0.4857|± |0.0041|
| - humanities |N/A |none | 0|acc |0.4612|± |0.0069|
| - formal_logic | 0|none | 0|acc |0.2937|± |0.0407|
| - high_school_european_history | 0|none | 0|acc |0.6364|± |0.0376|
| - high_school_us_history | 0|none | 0|acc |0.6765|± |0.0328|
| - high_school_world_history | 0|none | 0|acc |0.6540|± |0.0310|
| - international_law | 0|none | 0|acc |0.6529|± |0.0435|
| - jurisprudence | 0|none | 0|acc |0.5741|± |0.0478|
| - logical_fallacies | 0|none | 0|acc |0.5031|± |0.0393|
| - moral_disputes | 0|none | 0|acc |0.5029|± |0.0269|
| - moral_scenarios | 0|none | 0|acc |0.2503|± |0.0145|
| - philosophy | 0|none | 0|acc |0.6206|± |0.0276|
| - prehistory | 0|none | 0|acc |0.5957|± |0.0273|
| - professional_law | 0|none | 0|acc |0.3950|± |0.0125|
| - world_religions | 0|none | 0|acc |0.7135|± |0.0347|
| - other |N/A |none | 0|acc |0.5423|± |0.0087|
| - business_ethics | 0|none | 0|acc |0.5100|± |0.0502|
| - clinical_knowledge | 0|none | 0|acc |0.4528|± |0.0306|
| - college_medicine | 0|none | 0|acc |0.4509|± |0.0379|
| - global_facts | 0|none | 0|acc |0.3200|± |0.0469|
| - human_aging | 0|none | 0|acc |0.5561|± |0.0333|
| - management | 0|none | 0|acc |0.6408|± |0.0475|
| - marketing | 0|none | 0|acc |0.7094|± |0.0297|
| - medical_genetics | 0|none | 0|acc |0.5100|± |0.0502|
| - miscellaneous | 0|none | 0|acc |0.6692|± |0.0168|
| - nutrition | 0|none | 0|acc |0.5163|± |0.0286|
| - professional_accounting | 0|none | 0|acc |0.3865|± |0.0290|
| - professional_medicine | 0|none | 0|acc |0.4963|± |0.0304|
| - virology | 0|none | 0|acc |0.4277|± |0.0385|
| - social_sciences |N/A |none | 0|acc |0.5626|± |0.0088|
| - econometrics | 0|none | 0|acc |0.2018|± |0.0378|
| - high_school_geography | 0|none | 0|acc |0.5707|± |0.0353|
| - high_school_government_and_politics| 0|none | 0|acc |0.6736|± |0.0338|
| - high_school_macroeconomics | 0|none | 0|acc |0.4923|± |0.0253|
| - high_school_microeconomics | 0|none | 0|acc |0.5000|± |0.0325|
| - high_school_psychology | 0|none | 0|acc |0.6312|± |0.0207|
| - human_sexuality | 0|none | 0|acc |0.6183|± |0.0426|
| - professional_psychology | 0|none | 0|acc |0.5049|± |0.0202|
| - public_relations | 0|none | 0|acc |0.5636|± |0.0475|
| - security_studies | 0|none | 0|acc |0.6041|± |0.0313|
| - sociology | 0|none | 0|acc |0.6915|± |0.0327|
| - us_foreign_policy | 0|none | 0|acc |0.7100|± |0.0456|
| - stem |N/A |none | 0|acc |0.3914|± |0.0085|
| - abstract_algebra | 0|none | 0|acc |0.2900|± |0.0456|
| - anatomy | 0|none | 0|acc |0.4741|± |0.0431|
| - astronomy | 0|none | 0|acc |0.5263|± |0.0406|
| - college_biology | 0|none | 0|acc |0.4653|± |0.0417|
| - college_chemistry | 0|none | 0|acc |0.3800|± |0.0488|
| - college_computer_science | 0|none | 0|acc |0.4300|± |0.0498|
| - college_mathematics | 0|none | 0|acc |0.3300|± |0.0473|
| - college_physics | 0|none | 0|acc |0.2647|± |0.0439|
| - computer_security | 0|none | 0|acc |0.6600|± |0.0476|
| - conceptual_physics | 0|none | 0|acc |0.3319|± |0.0308|
| - electrical_engineering | 0|none | 0|acc |0.4552|± |0.0415|
| - elementary_mathematics | 0|none | 0|acc |0.3069|± |0.0238|
| - high_school_biology | 0|none | 0|acc |0.5645|± |0.0282|
| - high_school_chemistry | 0|none | 0|acc |0.3842|± |0.0342|
| - high_school_computer_science | 0|none | 0|acc |0.4700|± |0.0502|
| - high_school_mathematics | 0|none | 0|acc |0.3000|± |0.0279|
| - high_school_physics | 0|none | 0|acc |0.2252|± |0.0341|
| - high_school_statistics | 0|none | 0|acc |0.3796|± |0.0331|
| - machine_learning | 0|none | 0|acc |0.2679|± |0.0420|
|