Update README.md
README.md CHANGED
@@ -62,16 +62,80 @@ Nous-Hermes 2 on SOLAR 10.7B is a major improvement across the board on the benc
 # Benchmarks Compared
 
 GPT4All:
-
+```
+| Task |Version| Metric |Value | |Stderr|
+|-------------|------:|--------|-----:|---|-----:|
+|arc_challenge| 0|acc |0.5990|± |0.0143|
+| | |acc_norm|0.6425|± |0.0140|
+|arc_easy | 0|acc |0.8657|± |0.0070|
+| | |acc_norm|0.8636|± |0.0070|
+|boolq | 1|acc |0.8783|± |0.0057|
+|hellaswag | 0|acc |0.6661|± |0.0047|
+| | |acc_norm|0.8489|± |0.0036|
+|openbookqa | 0|acc |0.3440|± |0.0213|
+| | |acc_norm|0.4660|± |0.0223|
+|piqa | 0|acc |0.8324|± |0.0087|
+| | |acc_norm|0.8379|± |0.0086|
+|winogrande | 0|acc |0.7616|± |0.0120|
+```
+Average: 75.70
 
 AGIEval:
-
+```
+| Task |Version| Metric |Value | |Stderr|
+|------------------------------|------:|--------|-----:|---|-----:|
+|agieval_aqua_rat | 0|acc |0.2402|± |0.0269|
+| | |acc_norm|0.2520|± |0.0273|
+|agieval_logiqa_en | 0|acc |0.4117|± |0.0193|
+| | |acc_norm|0.4055|± |0.0193|
+|agieval_lsat_ar | 0|acc |0.2348|± |0.0280|
+| | |acc_norm|0.2087|± |0.0269|
+|agieval_lsat_lr | 0|acc |0.5549|± |0.0220|
+| | |acc_norm|0.5294|± |0.0221|
+|agieval_lsat_rc | 0|acc |0.6617|± |0.0289|
+| | |acc_norm|0.6357|± |0.0294|
+|agieval_sat_en | 0|acc |0.8010|± |0.0279|
+| | |acc_norm|0.7913|± |0.0284|
+|agieval_sat_en_without_passage| 0|acc |0.4806|± |0.0349|
+| | |acc_norm|0.4612|± |0.0348|
+|agieval_sat_math | 0|acc |0.4909|± |0.0338|
+| | |acc_norm|0.4000|± |0.0331|
+```
+Average: 46.05
 
 BigBench:
-
+```
+| Task |Version| Metric |Value | |Stderr|
+|------------------------------------------------|------:|---------------------|-----:|---|-----:|
+|bigbench_causal_judgement | 0|multiple_choice_grade|0.6105|± |0.0355|
+|bigbench_date_understanding | 0|multiple_choice_grade|0.7182|± |0.0235|
+|bigbench_disambiguation_qa | 0|multiple_choice_grade|0.5736|± |0.0308|
+|bigbench_geometric_shapes | 0|multiple_choice_grade|0.4596|± |0.0263|
+| | |exact_str_match |0.0000|± |0.0000|
+|bigbench_logical_deduction_five_objects | 0|multiple_choice_grade|0.3500|± |0.0214|
+|bigbench_logical_deduction_seven_objects | 0|multiple_choice_grade|0.2500|± |0.0164|
+|bigbench_logical_deduction_three_objects | 0|multiple_choice_grade|0.5200|± |0.0289|
+|bigbench_movie_recommendation | 0|multiple_choice_grade|0.3540|± |0.0214|
+|bigbench_navigate | 0|multiple_choice_grade|0.5000|± |0.0158|
+|bigbench_reasoning_about_colored_objects | 0|multiple_choice_grade|0.6900|± |0.0103|
+|bigbench_ruin_names | 0|multiple_choice_grade|0.6317|± |0.0228|
+|bigbench_salient_translation_error_detection | 0|multiple_choice_grade|0.2535|± |0.0138|
+|bigbench_snarks | 0|multiple_choice_grade|0.7293|± |0.0331|
+|bigbench_sports_understanding | 0|multiple_choice_grade|0.6744|± |0.0149|
+|bigbench_temporal_sequences | 0|multiple_choice_grade|0.7400|± |0.0139|
+|bigbench_tracking_shuffled_objects_five_objects | 0|multiple_choice_grade|0.2176|± |0.0117|
+|bigbench_tracking_shuffled_objects_seven_objects| 0|multiple_choice_grade|0.1543|± |0.0086|
+|bigbench_tracking_shuffled_objects_three_objects| 0|multiple_choice_grade|0.5200|± |0.0289|
+```
+Average: 49.70
 
 TruthfulQA:
-
+```
+| Task |Version|Metric|Value | |Stderr|
+|-------------|------:|------|-----:|---|-----:|
+|truthfulqa_mc| 1|mc1 |0.4162|± |0.0173|
+| | |mc2 |0.5783|± |0.0151|
+```
 
 ## GPT4All
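Note on the "Average" lines added above: the diff does not state how they are computed, but they appear to be plain unweighted means of the per-task scores, expressed as percentages, using acc_norm where it is reported (acc otherwise, and multiple_choice_grade for BigBench). The sketch below is only a check of that assumption, with the values transcribed from the tables in the diff:

```python
# Assumption: each suite average is the unweighted mean of per-task scores
# (acc_norm if reported, else acc; multiple_choice_grade for BigBench),
# expressed as a percentage. Values are copied from the tables above.

gpt4all = [0.6425, 0.8636, 0.8783, 0.8489, 0.4660, 0.8379, 0.7616]
agieval = [0.2520, 0.4055, 0.2087, 0.5294, 0.6357, 0.7913, 0.4612, 0.4000]
bigbench = [0.6105, 0.7182, 0.5736, 0.4596, 0.3500, 0.2500, 0.5200, 0.3540,
            0.5000, 0.6900, 0.6317, 0.2535, 0.7293, 0.6744, 0.7400, 0.2176,
            0.1543, 0.5200]

def average(scores):
    """Unweighted mean of per-task scores, as a percentage."""
    return round(100 * sum(scores) / len(scores), 2)

print(average(gpt4all))   # 75.7  -> matches "Average: 75.70"
print(average(agieval))   # 46.05 -> matches "Average: 46.05"
print(average(bigbench))  # 49.7  -> matches "Average: 49.70"
```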