Commit
Β·
b851397
1
Parent(s):
a1243c9
Add eval
Browse filesThis view is limited to 50 files because it contains too many changes. Β
See raw diff
- 4b284b12bc4/evaluation/4b284b12bc4_0_lm-eval_global_step80108_2023-01-30-11-23-34_0shots_backup.json +0 -87
- 4b284b12bc4/evaluation/4b284b12bc4_1_lm-eval_global_step80108_2023-01-30-11-26-32_1shots_backup.json +0 -87
- 4b284b12bc4/evaluation/4b284b12bc4_2_lm-eval_global_step80108_2023-01-30-11-26-32_2shots_backup.json +0 -87
- 4b284b12bc4/evaluation/4b284b12bc4_3_lm-eval_global_step80108_2023-01-30-11-26-31_3shots_backup.json +0 -87
- 4b284b12bc4/evaluation/4b284b12bc4_4_lm-eval_global_step80108_2023-01-30-11-26-32_4shots_backup.json +0 -87
- 4b284b12bc4/evaluation/4b284b12bc4_5_lm-eval_global_step80108_2023-01-30-11-26-32_5shots_backup.json +0 -66
- 4b284b12bc4/evaluation/{4b284b12bc4_0.json β rankeval/4b284b12bc4_0.json} +0 -0
- 4b284b12bc4/evaluation/{4b284b12bc4_1.json β rankeval/4b284b12bc4_1.json} +0 -0
- 4b284b12bc4/evaluation/{4b284b12bc4_2.json β rankeval/4b284b12bc4_2.json} +0 -0
- 4b284b12bc4/evaluation/{4b284b12bc4_3.json β rankeval/4b284b12bc4_3.json} +0 -0
- 4b284b12bc4/evaluation/{4b284b12bc4_4.json β rankeval/4b284b12bc4_4.json} +0 -0
- 4b284b12bc4/evaluation/{4b284b12bc4_5.json β rankeval/4b284b12bc4_5.json} +22 -1
- 4b284b17bc4/evaluation/4b284b17bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json +0 -87
- 4b284b17bc4/evaluation/4b284b17bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json +0 -87
- 4b284b17bc4/evaluation/4b284b17bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json +0 -87
- 4b284b17bc4/evaluation/4b284b17bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json +0 -87
- 4b284b17bc4/evaluation/4b284b17bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json +0 -87
- 4b284b17bc4/evaluation/4b284b17bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json +0 -73
- 4b284b17bc4/evaluation/{4b284b17bc4_0.json β rankeval/4b284b17bc4_0.json} +0 -0
- 4b284b17bc4/evaluation/{4b284b17bc4_1.json β rankeval/4b284b17bc4_1.json} +0 -0
- 4b284b17bc4/evaluation/{4b284b17bc4_2.json β rankeval/4b284b17bc4_2.json} +0 -0
- 4b284b17bc4/evaluation/{4b284b17bc4_3.json β rankeval/4b284b17bc4_3.json} +0 -0
- 4b284b17bc4/evaluation/{4b284b17bc4_4.json β rankeval/4b284b17bc4_4.json} +0 -0
- 4b284b17bc4/evaluation/{4b284b17bc4_5.json β rankeval/4b284b17bc4_5.json} +15 -1
- 4b284b21bc4/evaluation/4b284b21bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json +0 -87
- 4b284b21bc4/evaluation/4b284b21bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json +0 -87
- 4b284b21bc4/evaluation/4b284b21bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json +0 -87
- 4b284b21bc4/evaluation/4b284b21bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json +0 -87
- 4b284b21bc4/evaluation/4b284b21bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json +0 -87
- 4b284b21bc4/evaluation/4b284b21bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json +0 -66
- 4b284b21bc4/evaluation/{4b284b21bc4_0.json β rankeval/4b284b21bc4_0.json} +0 -0
- 4b284b21bc4/evaluation/{4b284b21bc4_1.json β rankeval/4b284b21bc4_1.json} +0 -0
- 4b284b21bc4/evaluation/{4b284b21bc4_2.json β rankeval/4b284b21bc4_2.json} +0 -0
- 4b284b21bc4/evaluation/{4b284b21bc4_3.json β rankeval/4b284b21bc4_3.json} +0 -0
- 4b284b21bc4/evaluation/{4b284b21bc4_4.json β rankeval/4b284b21bc4_4.json} +0 -0
- 4b284b21bc4/evaluation/{4b284b21bc4_5.json β rankeval/4b284b21bc4_5.json} +22 -1
- 4b284b28bc4/evaluation/4b284b28bc4_0_lm-eval_global_step80108_2023-01-30-11-26-39_0shots_backup.json +0 -87
- 4b284b28bc4/evaluation/4b284b28bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json +0 -87
- 4b284b28bc4/evaluation/4b284b28bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json +0 -87
- 4b284b28bc4/evaluation/4b284b28bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json +0 -87
- 4b284b28bc4/evaluation/4b284b28bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json +0 -87
- 4b284b28bc4/evaluation/4b284b28bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json +0 -59
- 4b284b28bc4/evaluation/{4b284b28bc4_0.json β rankeval/4b284b28bc4_0.json} +0 -0
- 4b284b28bc4/evaluation/{4b284b28bc4_1.json β rankeval/4b284b28bc4_1.json} +0 -0
- 4b284b28bc4/evaluation/{4b284b28bc4_2.json β rankeval/4b284b28bc4_2.json} +0 -0
- 4b284b28bc4/evaluation/{4b284b28bc4_3.json β rankeval/4b284b28bc4_3.json} +0 -0
- 4b284b28bc4/evaluation/{4b284b28bc4_4.json β rankeval/4b284b28bc4_4.json} +0 -0
- 4b284b28bc4/evaluation/{4b284b28bc4_5.json β rankeval/4b284b28bc4_5.json} +29 -1
- 4b284b42bc4/evaluation/4b284b42bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json +0 -87
- 4b284b42bc4/evaluation/4b284b42bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json +0 -87
4b284b12bc4/evaluation/4b284b12bc4_0_lm-eval_global_step80108_2023-01-30-11-23-34_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.335,
|
5 |
-
"acc_stderr": 0.014933117490932575
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.334,
|
9 |
-
"acc_stderr": 0.014922019523732961
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3491666666666667,
|
13 |
-
"acc_stderr": 0.013767075395077249
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.39285714285714285,
|
17 |
-
"acc_stderr": 0.0658538889806635,
|
18 |
-
"f1": 0.23306878306878312
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.77,
|
22 |
-
"acc_stderr": 0.04229525846816506
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4695279824736108,
|
26 |
-
"acc_stderr": 0.0049805063294075845,
|
27 |
-
"acc_norm": 0.6132244572794264,
|
28 |
-
"acc_norm_stderr": 0.004860162076330956
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5812274368231047,
|
32 |
-
"acc_stderr": 0.02969666108123484
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5753749013417522,
|
36 |
-
"acc_stderr": 0.013891893150264218
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.711918760021379,
|
40 |
-
"acc_stderr": 0.010472537019822578
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5464831804281346,
|
44 |
-
"acc_stderr": 0.008707182331111644
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5538720538720538,
|
48 |
-
"acc_stderr": 0.01020005782876501,
|
49 |
-
"acc_norm": 0.4936868686868687,
|
50 |
-
"acc_norm_stderr": 0.01025896566804443
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2636518771331058,
|
54 |
-
"acc_stderr": 0.012875929151297049,
|
55 |
-
"acc_norm": 0.2883959044368601,
|
56 |
-
"acc_norm_stderr": 0.013238394422428175
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.82,
|
60 |
-
"acc_stderr": 0.012155153135511965,
|
61 |
-
"acc_norm": 0.749,
|
62 |
-
"acc_norm_stderr": 0.013718133516888921
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.73449401523395,
|
66 |
-
"acc_stderr": 0.010303308653024429,
|
67 |
-
"acc_norm": 0.7475516866158868,
|
68 |
-
"acc_norm_stderr": 0.010135665547362354
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b12bc4/evaluation/4b284b12bc4_1_lm-eval_global_step80108_2023-01-30-11-26-32_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.333,
|
5 |
-
"acc_stderr": 0.014910846164229868
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.326,
|
9 |
-
"acc_stderr": 0.01483050720454104
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3475,
|
13 |
-
"acc_stderr": 0.013751753243291852
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5357142857142857,
|
17 |
-
"acc_stderr": 0.06724777654937658,
|
18 |
-
"f1": 0.37227304714989445
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.79,
|
22 |
-
"acc_stderr": 0.040936018074033256
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.47191794463254333,
|
26 |
-
"acc_stderr": 0.004981905293878145,
|
27 |
-
"acc_norm": 0.6139215295757817,
|
28 |
-
"acc_norm_stderr": 0.004858539527872466
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5703971119133574,
|
32 |
-
"acc_stderr": 0.029796668829124674
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5706393054459353,
|
36 |
-
"acc_stderr": 0.013911537499969163
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7151256012827365,
|
40 |
-
"acc_stderr": 0.01043751398661172
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5669724770642202,
|
44 |
-
"acc_stderr": 0.00866625130551806
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5913299663299664,
|
48 |
-
"acc_stderr": 0.010087174498762883,
|
49 |
-
"acc_norm": 0.5496632996632996,
|
50 |
-
"acc_norm_stderr": 0.010209047724374145
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2627986348122867,
|
54 |
-
"acc_stderr": 0.012862523175351333,
|
55 |
-
"acc_norm": 0.30716723549488056,
|
56 |
-
"acc_norm_stderr": 0.013481034054980943
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.836,
|
60 |
-
"acc_stderr": 0.011715000693181331,
|
61 |
-
"acc_norm": 0.781,
|
62 |
-
"acc_norm_stderr": 0.013084731950262012
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7448313384113167,
|
66 |
-
"acc_stderr": 0.010171571592521822,
|
67 |
-
"acc_norm": 0.7535364526659413,
|
68 |
-
"acc_norm_stderr": 0.01005481078967181
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b12bc4/evaluation/4b284b12bc4_2_lm-eval_global_step80108_2023-01-30-11-26-32_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.329,
|
5 |
-
"acc_stderr": 0.014865395385928354
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.336,
|
9 |
-
"acc_stderr": 0.014944140233795027
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3383333333333333,
|
13 |
-
"acc_stderr": 0.013664144006618266
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.48214285714285715,
|
17 |
-
"acc_stderr": 0.06737697508644648,
|
18 |
-
"f1": 0.3338011695906433
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.79,
|
22 |
-
"acc_stderr": 0.040936018074033256
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4697271459868552,
|
26 |
-
"acc_stderr": 0.004980627287147585,
|
27 |
-
"acc_norm": 0.6141206930890261,
|
28 |
-
"acc_norm_stderr": 0.004858074013443988
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5523465703971119,
|
32 |
-
"acc_stderr": 0.02993107036293953
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.574585635359116,
|
36 |
-
"acc_stderr": 0.013895257666646378
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7156600748262961,
|
40 |
-
"acc_stderr": 0.010431614128665253
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5660550458715596,
|
44 |
-
"acc_stderr": 0.008668405003744129
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5993265993265994,
|
48 |
-
"acc_stderr": 0.01005530447425557,
|
49 |
-
"acc_norm": 0.5576599326599326,
|
50 |
-
"acc_norm_stderr": 0.01019133444422085
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2781569965870307,
|
54 |
-
"acc_stderr": 0.013094469919538805,
|
55 |
-
"acc_norm": 0.30887372013651876,
|
56 |
-
"acc_norm_stderr": 0.013501770929344003
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.835,
|
60 |
-
"acc_stderr": 0.011743632866916145,
|
61 |
-
"acc_norm": 0.79,
|
62 |
-
"acc_norm_stderr": 0.01288666233227453
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7470076169749728,
|
66 |
-
"acc_stderr": 0.01014288869886246,
|
67 |
-
"acc_norm": 0.7519042437431991,
|
68 |
-
"acc_norm_stderr": 0.010077118315574706
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b12bc4/evaluation/4b284b12bc4_3_lm-eval_global_step80108_2023-01-30-11-26-31_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.332,
|
5 |
-
"acc_stderr": 0.014899597242811485
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.334,
|
9 |
-
"acc_stderr": 0.014922019523732963
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.35,
|
13 |
-
"acc_stderr": 0.013774667009018554
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.6071428571428571,
|
17 |
-
"acc_stderr": 0.0658538889806635,
|
18 |
-
"f1": 0.42400932400932395
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.81,
|
22 |
-
"acc_stderr": 0.03942772444036622
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.47241585341565423,
|
26 |
-
"acc_stderr": 0.004982182323923561,
|
27 |
-
"acc_norm": 0.6199960167297351,
|
28 |
-
"acc_norm_stderr": 0.004843954338451449
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5379061371841155,
|
32 |
-
"acc_stderr": 0.030009848912529113
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5737963693764798,
|
36 |
-
"acc_stderr": 0.013898585965412338
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7124532335649385,
|
40 |
-
"acc_stderr": 0.010466744473098363
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5587155963302752,
|
44 |
-
"acc_stderr": 0.008684548127832637
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5955387205387206,
|
48 |
-
"acc_stderr": 0.010070746648278783,
|
49 |
-
"acc_norm": 0.5740740740740741,
|
50 |
-
"acc_norm_stderr": 0.010146568651002255
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2815699658703072,
|
54 |
-
"acc_stderr": 0.013143376735009022,
|
55 |
-
"acc_norm": 0.3122866894197952,
|
56 |
-
"acc_norm_stderr": 0.013542598541688067
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.841,
|
60 |
-
"acc_stderr": 0.01156947936827129,
|
61 |
-
"acc_norm": 0.796,
|
62 |
-
"acc_norm_stderr": 0.012749374359024384
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7513601741022851,
|
66 |
-
"acc_stderr": 0.01008451123429685,
|
67 |
-
"acc_norm": 0.7578890097932536,
|
68 |
-
"acc_norm_stderr": 0.009994371269104397
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b12bc4/evaluation/4b284b12bc4_4_lm-eval_global_step80108_2023-01-30-11-26-32_4shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.337,
|
5 |
-
"acc_stderr": 0.014955087918653603
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.349,
|
9 |
-
"acc_stderr": 0.015080663991563102
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.36666666666666664,
|
13 |
-
"acc_stderr": 0.013916893275819938
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.44642857142857145,
|
17 |
-
"acc_stderr": 0.067031892279424,
|
18 |
-
"f1": 0.3176100628930817
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.8,
|
22 |
-
"acc_stderr": 0.040201512610368445
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4722166899024099,
|
26 |
-
"acc_stderr": 0.004982072108448081,
|
27 |
-
"acc_norm": 0.6184027086237801,
|
28 |
-
"acc_norm_stderr": 0.004847857546957481
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5379061371841155,
|
32 |
-
"acc_stderr": 0.03000984891252911
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.56353591160221,
|
36 |
-
"acc_stderr": 0.013938569465677023
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7194013896312133,
|
40 |
-
"acc_stderr": 0.010389809647288821
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5636085626911315,
|
44 |
-
"acc_stderr": 0.008674000467432068
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6039562289562289,
|
48 |
-
"acc_stderr": 0.010035580962097942,
|
49 |
-
"acc_norm": 0.5702861952861953,
|
50 |
-
"acc_norm_stderr": 0.010157908005763674
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2790102389078498,
|
54 |
-
"acc_stderr": 0.013106784883601346,
|
55 |
-
"acc_norm": 0.3165529010238908,
|
56 |
-
"acc_norm_stderr": 0.013592431519068077
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.842,
|
60 |
-
"acc_stderr": 0.011539894677559568,
|
61 |
-
"acc_norm": 0.789,
|
62 |
-
"acc_norm_stderr": 0.012909130321042092
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7431991294885746,
|
66 |
-
"acc_stderr": 0.010192864802278045,
|
67 |
-
"acc_norm": 0.7568008705114254,
|
68 |
-
"acc_norm_stderr": 0.010009611953858915
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b12bc4/evaluation/4b284b12bc4_5_lm-eval_global_step80108_2023-01-30-11-26-32_5shots_backup.json
DELETED
@@ -1,66 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.332,
|
5 |
-
"acc_stderr": 0.014899597242811487
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.329,
|
9 |
-
"acc_stderr": 0.014865395385928357
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3541666666666667,
|
13 |
-
"acc_stderr": 0.013811933499570954
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5535714285714286,
|
17 |
-
"acc_stderr": 0.06703189227942395,
|
18 |
-
"f1": 0.38376730002345766
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.81,
|
22 |
-
"acc_stderr": 0.03942772444036623
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.47400916152160927,
|
26 |
-
"acc_stderr": 0.004983035420235716,
|
27 |
-
"acc_norm": 0.619896434973113,
|
28 |
-
"acc_norm_stderr": 0.004844199910173026
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.516245487364621,
|
32 |
-
"acc_stderr": 0.030080573208738064
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5722178374112076,
|
36 |
-
"acc_stderr": 0.013905134013839944
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7177979690005345,
|
40 |
-
"acc_stderr": 0.010407834479647675
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5648318042813456,
|
44 |
-
"acc_stderr": 0.008671229580582118
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5997474747474747,
|
48 |
-
"acc_stderr": 0.010053550119896127,
|
49 |
-
"acc_norm": 0.569023569023569,
|
50 |
-
"acc_norm_stderr": 0.010161552863493746
|
51 |
-
}
|
52 |
-
},
|
53 |
-
"versions": {
|
54 |
-
"anli_r1": 0,
|
55 |
-
"anli_r2": 0,
|
56 |
-
"anli_r3": 0,
|
57 |
-
"cb": 1,
|
58 |
-
"copa": 0,
|
59 |
-
"hellaswag": 0,
|
60 |
-
"rte": 0,
|
61 |
-
"winogrande": 0,
|
62 |
-
"storycloze_2016": 0,
|
63 |
-
"boolq": 1,
|
64 |
-
"arc_easy": 0
|
65 |
-
}
|
66 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b12bc4/evaluation/{4b284b12bc4_0.json β rankeval/4b284b12bc4_0.json}
RENAMED
File without changes
|
4b284b12bc4/evaluation/{4b284b12bc4_1.json β rankeval/4b284b12bc4_1.json}
RENAMED
File without changes
|
4b284b12bc4/evaluation/{4b284b12bc4_2.json β rankeval/4b284b12bc4_2.json}
RENAMED
File without changes
|
4b284b12bc4/evaluation/{4b284b12bc4_3.json β rankeval/4b284b12bc4_3.json}
RENAMED
File without changes
|
4b284b12bc4/evaluation/{4b284b12bc4_4.json β rankeval/4b284b12bc4_4.json}
RENAMED
File without changes
|
4b284b12bc4/evaluation/{4b284b12bc4_5.json β rankeval/4b284b12bc4_5.json}
RENAMED
@@ -48,6 +48,24 @@
|
|
48 |
"acc_stderr": 0.010053550119896127,
|
49 |
"acc_norm": 0.569023569023569,
|
50 |
"acc_norm_stderr": 0.010161552863493746
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
}
|
52 |
},
|
53 |
"versions": {
|
@@ -61,6 +79,9 @@
|
|
61 |
"winogrande": 0,
|
62 |
"storycloze_2016": 0,
|
63 |
"boolq": 1,
|
64 |
-
"arc_easy": 0
|
|
|
|
|
|
|
65 |
}
|
66 |
}
|
|
|
48 |
"acc_stderr": 0.010053550119896127,
|
49 |
"acc_norm": 0.569023569023569,
|
50 |
"acc_norm_stderr": 0.010161552863493746
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.27559726962457337,
|
54 |
+
"acc_stderr": 0.01305716965576184,
|
55 |
+
"acc_norm": 0.31569965870307165,
|
56 |
+
"acc_norm_stderr": 0.013582571095815291
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.844,
|
60 |
+
"acc_stderr": 0.01148023500612236,
|
61 |
+
"acc_norm": 0.794,
|
62 |
+
"acc_norm_stderr": 0.012795613612786551
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7399347116430903,
|
66 |
+
"acc_stderr": 0.0102348932490613,
|
67 |
+
"acc_norm": 0.7595212187159956,
|
68 |
+
"acc_norm_stderr": 0.009971345364651064
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
79 |
"winogrande": 0,
|
80 |
"storycloze_2016": 0,
|
81 |
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
4b284b17bc4/evaluation/4b284b17bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.332,
|
5 |
-
"acc_stderr": 0.014899597242811478
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.329,
|
9 |
-
"acc_stderr": 0.014865395385928362
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.34833333333333333,
|
13 |
-
"acc_stderr": 0.013759437498874075
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5714285714285714,
|
17 |
-
"acc_stderr": 0.06672848092813058,
|
18 |
-
"f1": 0.3888888888888889
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.76,
|
22 |
-
"acc_stderr": 0.04292346959909283
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.469627564230233,
|
26 |
-
"acc_stderr": 0.004980566907790459,
|
27 |
-
"acc_norm": 0.6134236207926708,
|
28 |
-
"acc_norm_stderr": 0.004859699562451462
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5415162454873647,
|
32 |
-
"acc_stderr": 0.029992535385373314
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5737963693764798,
|
36 |
-
"acc_stderr": 0.013898585965412338
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7108498129342598,
|
40 |
-
"acc_stderr": 0.010484068799942072
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5623853211009174,
|
44 |
-
"acc_stderr": 0.008676717715731632
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6052188552188552,
|
48 |
-
"acc_stderr": 0.010030038935883584,
|
49 |
-
"acc_norm": 0.5429292929292929,
|
50 |
-
"acc_norm_stderr": 0.01022189756425604
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.26791808873720135,
|
54 |
-
"acc_stderr": 0.012942030195136437,
|
55 |
-
"acc_norm": 0.2883959044368601,
|
56 |
-
"acc_norm_stderr": 0.013238394422428171
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.852,
|
60 |
-
"acc_stderr": 0.011234866364235235,
|
61 |
-
"acc_norm": 0.764,
|
62 |
-
"acc_norm_stderr": 0.013434451402438678
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7578890097932536,
|
66 |
-
"acc_stderr": 0.00999437126910438,
|
67 |
-
"acc_norm": 0.7622415669205659,
|
68 |
-
"acc_norm_stderr": 0.009932525779525492
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b17bc4/evaluation/4b284b17bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.32,
|
5 |
-
"acc_stderr": 0.014758652303574886
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.324,
|
9 |
-
"acc_stderr": 0.014806864733738854
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3491666666666667,
|
13 |
-
"acc_stderr": 0.01376707539507725
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5535714285714286,
|
17 |
-
"acc_stderr": 0.06703189227942397,
|
18 |
-
"f1": 0.3890671420083185
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.75,
|
22 |
-
"acc_stderr": 0.04351941398892446
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4640509858593906,
|
26 |
-
"acc_stderr": 0.0049768677965835555,
|
27 |
-
"acc_norm": 0.6082453694483171,
|
28 |
-
"acc_norm_stderr": 0.004871447106554927
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5451263537906137,
|
32 |
-
"acc_stderr": 0.029973636495415252
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.574585635359116,
|
36 |
-
"acc_stderr": 0.013895257666646378
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.711918760021379,
|
40 |
-
"acc_stderr": 0.010472537019822582
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5409785932721712,
|
44 |
-
"acc_stderr": 0.008715635308774412
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6342592592592593,
|
48 |
-
"acc_stderr": 0.009882988069418829,
|
49 |
-
"acc_norm": 0.5837542087542088,
|
50 |
-
"acc_norm_stderr": 0.01011481940450087
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2901023890784983,
|
54 |
-
"acc_stderr": 0.013261573677520764,
|
55 |
-
"acc_norm": 0.30119453924914674,
|
56 |
-
"acc_norm_stderr": 0.013406741767847638
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.896,
|
60 |
-
"acc_stderr": 0.009658016218524301,
|
61 |
-
"acc_norm": 0.88,
|
62 |
-
"acc_norm_stderr": 0.010281328012747386
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7551686615886833,
|
66 |
-
"acc_stderr": 0.010032309105568793,
|
67 |
-
"acc_norm": 0.766050054406964,
|
68 |
-
"acc_norm_stderr": 0.009877236895137436
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b17bc4/evaluation/4b284b17bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.343,
|
5 |
-
"acc_stderr": 0.015019206922356953
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.318,
|
9 |
-
"acc_stderr": 0.014734079309311901
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.325,
|
13 |
-
"acc_stderr": 0.013526454480351028
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.42857142857142855,
|
17 |
-
"acc_stderr": 0.06672848092813058,
|
18 |
-
"f1": 0.3058470764617691
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.78,
|
22 |
-
"acc_stderr": 0.04163331998932263
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.45727942640908187,
|
26 |
-
"acc_stderr": 0.004971534874389935,
|
27 |
-
"acc_norm": 0.602867954590719,
|
28 |
-
"acc_norm_stderr": 0.004883037758919964
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.48736462093862815,
|
32 |
-
"acc_stderr": 0.030086851767188564
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5808997632202052,
|
36 |
-
"acc_stderr": 0.013867325192210116
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7215392838054516,
|
40 |
-
"acc_stderr": 0.010365521460604415
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5489296636085627,
|
44 |
-
"acc_stderr": 0.008703080962379622
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6325757575757576,
|
48 |
-
"acc_stderr": 0.009892552616211558,
|
49 |
-
"acc_norm": 0.617003367003367,
|
50 |
-
"acc_norm_stderr": 0.009974920384536479
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2901023890784983,
|
54 |
-
"acc_stderr": 0.013261573677520759,
|
55 |
-
"acc_norm": 0.31313993174061433,
|
56 |
-
"acc_norm_stderr": 0.013552671543623496
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.906,
|
60 |
-
"acc_stderr": 0.009233052000787738,
|
61 |
-
"acc_norm": 0.891,
|
62 |
-
"acc_norm_stderr": 0.009859828407037186
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7540805223068553,
|
66 |
-
"acc_stderr": 0.010047331865625194,
|
67 |
-
"acc_norm": 0.7698585418933623,
|
68 |
-
"acc_norm_stderr": 0.009820832826839796
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b17bc4/evaluation/4b284b17bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.348,
|
5 |
-
"acc_stderr": 0.015070604603768408
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.36,
|
9 |
-
"acc_stderr": 0.01518652793204012
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.35083333333333333,
|
13 |
-
"acc_stderr": 0.013782212417178195
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.48214285714285715,
|
17 |
-
"acc_stderr": 0.0673769750864465,
|
18 |
-
"f1": 0.40387403446226977
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.79,
|
22 |
-
"acc_stderr": 0.040936018074033256
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4567815176259709,
|
26 |
-
"acc_stderr": 0.004971106265046551,
|
27 |
-
"acc_norm": 0.5992830113523202,
|
28 |
-
"acc_norm_stderr": 0.004890422457747258
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.48375451263537905,
|
32 |
-
"acc_stderr": 0.030080573208738064
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.569060773480663,
|
36 |
-
"acc_stderr": 0.013917796623335966
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7247461250668092,
|
40 |
-
"acc_stderr": 0.010328538400500567
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5498470948012233,
|
44 |
-
"acc_stderr": 0.008701488203356937
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6266835016835017,
|
48 |
-
"acc_stderr": 0.009925009142802903,
|
49 |
-
"acc_norm": 0.6203703703703703,
|
50 |
-
"acc_norm_stderr": 0.009958037725468558
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2901023890784983,
|
54 |
-
"acc_stderr": 0.013261573677520769,
|
55 |
-
"acc_norm": 0.31143344709897613,
|
56 |
-
"acc_norm_stderr": 0.013532472099850949
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.923,
|
60 |
-
"acc_stderr": 0.008434580140240632,
|
61 |
-
"acc_norm": 0.903,
|
62 |
-
"acc_norm_stderr": 0.00936368937324812
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7578890097932536,
|
66 |
-
"acc_stderr": 0.009994371269104387,
|
67 |
-
"acc_norm": 0.7682263329706203,
|
68 |
-
"acc_norm_stderr": 0.00984514377279405
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b17bc4/evaluation/4b284b17bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.36,
|
5 |
-
"acc_stderr": 0.015186527932040117
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.347,
|
9 |
-
"acc_stderr": 0.015060472031706625
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3625,
|
13 |
-
"acc_stderr": 0.01388303787422552
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5535714285714286,
|
17 |
-
"acc_stderr": 0.06703189227942395,
|
18 |
-
"f1": 0.4538378958668814
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.79,
|
22 |
-
"acc_stderr": 0.040936018074033256
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.45180242979486157,
|
26 |
-
"acc_stderr": 0.004966544724452227,
|
27 |
-
"acc_norm": 0.5955984863572994,
|
28 |
-
"acc_norm_stderr": 0.004897728370737246
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.48375451263537905,
|
32 |
-
"acc_stderr": 0.030080573208738064
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5706393054459353,
|
36 |
-
"acc_stderr": 0.013911537499969163
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7177979690005345,
|
40 |
-
"acc_stderr": 0.010407834479647672
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.545565749235474,
|
44 |
-
"acc_stderr": 0.008708665643758015
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.640993265993266,
|
48 |
-
"acc_stderr": 0.009843424713072174,
|
49 |
-
"acc_norm": 0.6186868686868687,
|
50 |
-
"acc_norm_stderr": 0.009966542497171025
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.302901023890785,
|
54 |
-
"acc_stderr": 0.013428241573185349,
|
55 |
-
"acc_norm": 0.32337883959044367,
|
56 |
-
"acc_norm_stderr": 0.013669421630012129
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.915,
|
60 |
-
"acc_stderr": 0.008823426366942331,
|
61 |
-
"acc_norm": 0.912,
|
62 |
-
"acc_norm_stderr": 0.008963053962592085
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7578890097932536,
|
66 |
-
"acc_stderr": 0.009994371269104385,
|
67 |
-
"acc_norm": 0.7752992383025027,
|
68 |
-
"acc_norm_stderr": 0.009738282586548389
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b17bc4/evaluation/4b284b17bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json
DELETED
@@ -1,73 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.363,
|
5 |
-
"acc_stderr": 0.015213890444671281
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.347,
|
9 |
-
"acc_stderr": 0.015060472031706624
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.34,
|
13 |
-
"acc_stderr": 0.013680495725767794
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5535714285714286,
|
17 |
-
"acc_stderr": 0.06703189227942397,
|
18 |
-
"f1": 0.3974410235905637
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.81,
|
22 |
-
"acc_stderr": 0.03942772444036623
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.44981079466241786,
|
26 |
-
"acc_stderr": 0.004964579685712439,
|
27 |
-
"acc_norm": 0.6002788289185421,
|
28 |
-
"acc_norm_stderr": 0.004888398535520516
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.49097472924187724,
|
32 |
-
"acc_stderr": 0.030091559826331334
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5785319652722968,
|
36 |
-
"acc_stderr": 0.013878072377497603
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7113842864778194,
|
40 |
-
"acc_stderr": 0.01047831178564294
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5376146788990825,
|
44 |
-
"acc_stderr": 0.008720273736433679
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6447811447811448,
|
48 |
-
"acc_stderr": 0.009820245899287117,
|
49 |
-
"acc_norm": 0.625,
|
50 |
-
"acc_norm_stderr": 0.009933992677987828
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2986348122866894,
|
54 |
-
"acc_stderr": 0.013374078615068756,
|
55 |
-
"acc_norm": 0.310580204778157,
|
56 |
-
"acc_norm_stderr": 0.013522292098053052
|
57 |
-
}
|
58 |
-
},
|
59 |
-
"versions": {
|
60 |
-
"anli_r1": 0,
|
61 |
-
"anli_r2": 0,
|
62 |
-
"anli_r3": 0,
|
63 |
-
"cb": 1,
|
64 |
-
"copa": 0,
|
65 |
-
"hellaswag": 0,
|
66 |
-
"rte": 0,
|
67 |
-
"winogrande": 0,
|
68 |
-
"storycloze_2016": 0,
|
69 |
-
"boolq": 1,
|
70 |
-
"arc_easy": 0,
|
71 |
-
"arc_challenge": 0
|
72 |
-
}
|
73 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b17bc4/evaluation/{4b284b17bc4_0.json β rankeval/4b284b17bc4_0.json}
RENAMED
File without changes
|
4b284b17bc4/evaluation/{4b284b17bc4_1.json β rankeval/4b284b17bc4_1.json}
RENAMED
File without changes
|
4b284b17bc4/evaluation/{4b284b17bc4_2.json β rankeval/4b284b17bc4_2.json}
RENAMED
File without changes
|
4b284b17bc4/evaluation/{4b284b17bc4_3.json β rankeval/4b284b17bc4_3.json}
RENAMED
File without changes
|
4b284b17bc4/evaluation/{4b284b17bc4_4.json β rankeval/4b284b17bc4_4.json}
RENAMED
File without changes
|
4b284b17bc4/evaluation/{4b284b17bc4_5.json β rankeval/4b284b17bc4_5.json}
RENAMED
@@ -54,6 +54,18 @@
|
|
54 |
"acc_stderr": 0.013374078615068756,
|
55 |
"acc_norm": 0.310580204778157,
|
56 |
"acc_norm_stderr": 0.013522292098053052
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
}
|
58 |
},
|
59 |
"versions": {
|
@@ -68,6 +80,8 @@
|
|
68 |
"storycloze_2016": 0,
|
69 |
"boolq": 1,
|
70 |
"arc_easy": 0,
|
71 |
-
"arc_challenge": 0
|
|
|
|
|
72 |
}
|
73 |
}
|
|
|
54 |
"acc_stderr": 0.013374078615068756,
|
55 |
"acc_norm": 0.310580204778157,
|
56 |
"acc_norm_stderr": 0.013522292098053052
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.918,
|
60 |
+
"acc_stderr": 0.00868051561552374,
|
61 |
+
"acc_norm": 0.908,
|
62 |
+
"acc_norm_stderr": 0.009144376393151117
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7584330794341676,
|
66 |
+
"acc_stderr": 0.00998671800180446,
|
67 |
+
"acc_norm": 0.7671381936887922,
|
68 |
+
"acc_norm_stderr": 0.009861236071080757
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
80 |
"storycloze_2016": 0,
|
81 |
"boolq": 1,
|
82 |
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
4b284b21bc4/evaluation/4b284b21bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.332,
|
5 |
-
"acc_stderr": 0.014899597242811485
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.337,
|
9 |
-
"acc_stderr": 0.0149550879186536
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.355,
|
13 |
-
"acc_stderr": 0.013819249004047296
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.48214285714285715,
|
17 |
-
"acc_stderr": 0.0673769750864465,
|
18 |
-
"f1": 0.4347442680776014
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.76,
|
22 |
-
"acc_stderr": 0.04292346959909283
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4841665006970723,
|
26 |
-
"acc_stderr": 0.004987278910505115,
|
27 |
-
"acc_norm": 0.6352320254929297,
|
28 |
-
"acc_norm_stderr": 0.004803812631994966
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5306859205776173,
|
32 |
-
"acc_stderr": 0.03003973059219781
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5990528808208366,
|
36 |
-
"acc_stderr": 0.013773974554948033
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7151256012827365,
|
40 |
-
"acc_stderr": 0.010437513986611718
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5669724770642202,
|
44 |
-
"acc_stderr": 0.008666251305518059
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.617003367003367,
|
48 |
-
"acc_stderr": 0.009974920384536469,
|
49 |
-
"acc_norm": 0.5462962962962963,
|
50 |
-
"acc_norm_stderr": 0.010215708295494117
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.28668941979522183,
|
54 |
-
"acc_stderr": 0.013214986329274757,
|
55 |
-
"acc_norm": 0.30631399317406144,
|
56 |
-
"acc_norm_stderr": 0.013470584417276513
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.845,
|
60 |
-
"acc_stderr": 0.011450157470799475,
|
61 |
-
"acc_norm": 0.757,
|
62 |
-
"acc_norm_stderr": 0.013569640199177458
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7578890097932536,
|
66 |
-
"acc_stderr": 0.00999437126910438,
|
67 |
-
"acc_norm": 0.7676822633297062,
|
68 |
-
"acc_norm_stderr": 0.009853201384168243
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b21bc4/evaluation/4b284b21bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.336,
|
5 |
-
"acc_stderr": 0.01494414023379502
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.315,
|
9 |
-
"acc_stderr": 0.014696631960792506
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.34,
|
13 |
-
"acc_stderr": 0.0136804957257678
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5357142857142857,
|
17 |
-
"acc_stderr": 0.06724777654937658,
|
18 |
-
"f1": 0.38181818181818183
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.74,
|
22 |
-
"acc_stderr": 0.04408440022768077
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.48137821151165106,
|
26 |
-
"acc_stderr": 0.004986319587524962,
|
27 |
-
"acc_norm": 0.6344353714399522,
|
28 |
-
"acc_norm_stderr": 0.004806039039008954
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5451263537906137,
|
32 |
-
"acc_stderr": 0.029973636495415252
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5974743488555643,
|
36 |
-
"acc_stderr": 0.013782866831703048
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7044361304115446,
|
40 |
-
"acc_stderr": 0.01055177883937378
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5669724770642202,
|
44 |
-
"acc_stderr": 0.008666251305518059
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6220538720538721,
|
48 |
-
"acc_stderr": 0.009949405744045452,
|
49 |
-
"acc_norm": 0.5787037037037037,
|
50 |
-
"acc_norm_stderr": 0.010131882498193127
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.29266211604095566,
|
54 |
-
"acc_stderr": 0.01329591610361942,
|
55 |
-
"acc_norm": 0.32849829351535836,
|
56 |
-
"acc_norm_stderr": 0.013724978465537357
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.891,
|
60 |
-
"acc_stderr": 0.00985982840703719,
|
61 |
-
"acc_norm": 0.871,
|
62 |
-
"acc_norm_stderr": 0.010605256784796579
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7551686615886833,
|
66 |
-
"acc_stderr": 0.010032309105568788,
|
67 |
-
"acc_norm": 0.764961915125136,
|
68 |
-
"acc_norm_stderr": 0.009893146688805308
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b21bc4/evaluation/4b284b21bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.327,
|
5 |
-
"acc_stderr": 0.014842213153411247
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.333,
|
9 |
-
"acc_stderr": 0.01491084616422986
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3408333333333333,
|
13 |
-
"acc_stderr": 0.01368860079329693
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5357142857142857,
|
17 |
-
"acc_stderr": 0.06724777654937658,
|
18 |
-
"f1": 0.3829365079365079
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.78,
|
22 |
-
"acc_stderr": 0.04163331998932262
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.48048197570205137,
|
26 |
-
"acc_stderr": 0.00498597821493792,
|
27 |
-
"acc_norm": 0.6397132045409281,
|
28 |
-
"acc_norm_stderr": 0.004791024004587989
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5090252707581228,
|
32 |
-
"acc_stderr": 0.030091559826331334
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6053670086819258,
|
36 |
-
"acc_stderr": 0.013736915172371883
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7161945483698557,
|
40 |
-
"acc_stderr": 0.01042569627973092
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5920489296636086,
|
44 |
-
"acc_stderr": 0.008595583792654892
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.622895622895623,
|
48 |
-
"acc_stderr": 0.009945041946366499,
|
49 |
-
"acc_norm": 0.6018518518518519,
|
50 |
-
"acc_norm_stderr": 0.010044662374653398
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.295221843003413,
|
54 |
-
"acc_stderr": 0.013329750293382318,
|
55 |
-
"acc_norm": 0.32337883959044367,
|
56 |
-
"acc_norm_stderr": 0.013669421630012129
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.903,
|
60 |
-
"acc_stderr": 0.009363689373248092,
|
61 |
-
"acc_norm": 0.882,
|
62 |
-
"acc_norm_stderr": 0.010206869264381791
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7578890097932536,
|
66 |
-
"acc_stderr": 0.009994371269104376,
|
67 |
-
"acc_norm": 0.7682263329706203,
|
68 |
-
"acc_norm_stderr": 0.009845143772794043
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b21bc4/evaluation/4b284b21bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.338,
|
5 |
-
"acc_stderr": 0.014965960710224496
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.345,
|
9 |
-
"acc_stderr": 0.015039986742055238
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3566666666666667,
|
13 |
-
"acc_stderr": 0.013833742805050717
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.6071428571428571,
|
17 |
-
"acc_stderr": 0.0658538889806635,
|
18 |
-
"f1": 0.5367003367003368
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.8,
|
22 |
-
"acc_stderr": 0.040201512610368445
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4826727743477395,
|
26 |
-
"acc_stderr": 0.004986784319771787,
|
27 |
-
"acc_norm": 0.6368253335988847,
|
28 |
-
"acc_norm_stderr": 0.004799317209902001
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5631768953068592,
|
32 |
-
"acc_stderr": 0.029855247390314945
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6037884767166535,
|
36 |
-
"acc_stderr": 0.013746404157154949
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7204703367183325,
|
40 |
-
"acc_stderr": 0.01037770209970486
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5923547400611621,
|
44 |
-
"acc_stderr": 0.008594580270731619
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.627104377104377,
|
48 |
-
"acc_stderr": 0.009922743197129257,
|
49 |
-
"acc_norm": 0.609006734006734,
|
50 |
-
"acc_norm_stderr": 0.010012992232540631
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.29436860068259385,
|
54 |
-
"acc_stderr": 0.013318528460539429,
|
55 |
-
"acc_norm": 0.3319112627986348,
|
56 |
-
"acc_norm_stderr": 0.01376098820088054
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.913,
|
60 |
-
"acc_stderr": 0.0089168666307459,
|
61 |
-
"acc_norm": 0.897,
|
62 |
-
"acc_norm_stderr": 0.009616833339695798
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7589771490750816,
|
66 |
-
"acc_stderr": 0.009979042717267314,
|
67 |
-
"acc_norm": 0.7742110990206746,
|
68 |
-
"acc_norm_stderr": 0.009754980670917311
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b21bc4/evaluation/4b284b21bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.343,
|
5 |
-
"acc_stderr": 0.015019206922356951
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.346,
|
9 |
-
"acc_stderr": 0.01505026612756445
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.36083333333333334,
|
13 |
-
"acc_stderr": 0.01386918025244486
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5535714285714286,
|
17 |
-
"acc_stderr": 0.06703189227942395,
|
18 |
-
"f1": 0.4583333333333333
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.8,
|
22 |
-
"acc_stderr": 0.040201512610368445
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.48157737502489545,
|
26 |
-
"acc_stderr": 0.0049863932662691625,
|
27 |
-
"acc_norm": 0.6417048396733719,
|
28 |
-
"acc_norm_stderr": 0.00478519504988916
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5379061371841155,
|
32 |
-
"acc_stderr": 0.030009848912529113
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.6085240726124704,
|
36 |
-
"acc_stderr": 0.01371748707129085
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7338321753073223,
|
40 |
-
"acc_stderr": 0.010220104800551206
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6119266055045871,
|
44 |
-
"acc_stderr": 0.00852313058476084
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6283670033670034,
|
48 |
-
"acc_stderr": 0.00991589712365879,
|
49 |
-
"acc_norm": 0.6153198653198653,
|
50 |
-
"acc_norm_stderr": 0.009983171707008997
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2960750853242321,
|
54 |
-
"acc_stderr": 0.013340916085246271,
|
55 |
-
"acc_norm": 0.3242320819112628,
|
56 |
-
"acc_norm_stderr": 0.013678810399518819
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.923,
|
60 |
-
"acc_stderr": 0.008434580140240648,
|
61 |
-
"acc_norm": 0.912,
|
62 |
-
"acc_norm_stderr": 0.008963053962592074
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7595212187159956,
|
66 |
-
"acc_stderr": 0.009971345364651078,
|
67 |
-
"acc_norm": 0.7676822633297062,
|
68 |
-
"acc_norm_stderr": 0.009853201384168243
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b21bc4/evaluation/4b284b21bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json
DELETED
@@ -1,66 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.351,
|
5 |
-
"acc_stderr": 0.015100563798316405
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.345,
|
9 |
-
"acc_stderr": 0.015039986742055237
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.345,
|
13 |
-
"acc_stderr": 0.013728421539454878
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5714285714285714,
|
17 |
-
"acc_stderr": 0.06672848092813058,
|
18 |
-
"f1": 0.37671957671957673
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.78,
|
22 |
-
"acc_stderr": 0.04163331998932261
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4827723561043617,
|
26 |
-
"acc_stderr": 0.004986818680313444,
|
27 |
-
"acc_norm": 0.6446922923720374,
|
28 |
-
"acc_norm_stderr": 0.004776283203468094
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5776173285198556,
|
32 |
-
"acc_stderr": 0.02973162264649588
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.595895816890292,
|
36 |
-
"acc_stderr": 0.013791610664670845
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7252805986103688,
|
40 |
-
"acc_stderr": 0.010322309878339507
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6146788990825688,
|
44 |
-
"acc_stderr": 0.008511930879680652
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6300505050505051,
|
48 |
-
"acc_stderr": 0.009906656266021155,
|
49 |
-
"acc_norm": 0.6111111111111112,
|
50 |
-
"acc_norm_stderr": 0.01000324833531377
|
51 |
-
}
|
52 |
-
},
|
53 |
-
"versions": {
|
54 |
-
"anli_r1": 0,
|
55 |
-
"anli_r2": 0,
|
56 |
-
"anli_r3": 0,
|
57 |
-
"cb": 1,
|
58 |
-
"copa": 0,
|
59 |
-
"hellaswag": 0,
|
60 |
-
"rte": 0,
|
61 |
-
"winogrande": 0,
|
62 |
-
"storycloze_2016": 0,
|
63 |
-
"boolq": 1,
|
64 |
-
"arc_easy": 0
|
65 |
-
}
|
66 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b21bc4/evaluation/{4b284b21bc4_0.json β rankeval/4b284b21bc4_0.json}
RENAMED
File without changes
|
4b284b21bc4/evaluation/{4b284b21bc4_1.json β rankeval/4b284b21bc4_1.json}
RENAMED
File without changes
|
4b284b21bc4/evaluation/{4b284b21bc4_2.json β rankeval/4b284b21bc4_2.json}
RENAMED
File without changes
|
4b284b21bc4/evaluation/{4b284b21bc4_3.json β rankeval/4b284b21bc4_3.json}
RENAMED
File without changes
|
4b284b21bc4/evaluation/{4b284b21bc4_4.json β rankeval/4b284b21bc4_4.json}
RENAMED
File without changes
|
4b284b21bc4/evaluation/{4b284b21bc4_5.json β rankeval/4b284b21bc4_5.json}
RENAMED
@@ -48,6 +48,24 @@
|
|
48 |
"acc_stderr": 0.009906656266021155,
|
49 |
"acc_norm": 0.6111111111111112,
|
50 |
"acc_norm_stderr": 0.01000324833531377
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
}
|
52 |
},
|
53 |
"versions": {
|
@@ -61,6 +79,9 @@
|
|
61 |
"winogrande": 0,
|
62 |
"storycloze_2016": 0,
|
63 |
"boolq": 1,
|
64 |
-
"arc_easy": 0
|
|
|
|
|
|
|
65 |
}
|
66 |
}
|
|
|
48 |
"acc_stderr": 0.009906656266021155,
|
49 |
"acc_norm": 0.6111111111111112,
|
50 |
"acc_norm_stderr": 0.01000324833531377
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.30716723549488056,
|
54 |
+
"acc_stderr": 0.013481034054980945,
|
55 |
+
"acc_norm": 0.32337883959044367,
|
56 |
+
"acc_norm_stderr": 0.013669421630012122
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.919,
|
60 |
+
"acc_stderr": 0.008632121032139978,
|
61 |
+
"acc_norm": 0.907,
|
62 |
+
"acc_norm_stderr": 0.009188875634996669
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.7529923830250272,
|
66 |
+
"acc_stderr": 0.010062268140772625,
|
67 |
+
"acc_norm": 0.7671381936887922,
|
68 |
+
"acc_norm_stderr": 0.009861236071080753
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
79 |
"winogrande": 0,
|
80 |
"storycloze_2016": 0,
|
81 |
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
4b284b28bc4/evaluation/4b284b28bc4_0_lm-eval_global_step80108_2023-01-30-11-26-39_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.344,
|
5 |
-
"acc_stderr": 0.015029633724408947
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.321,
|
9 |
-
"acc_stderr": 0.01477082181793464
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.34833333333333333,
|
13 |
-
"acc_stderr": 0.01375943749887408
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.35714285714285715,
|
17 |
-
"acc_stderr": 0.06460957383809221,
|
18 |
-
"f1": 0.1754385964912281
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.8,
|
22 |
-
"acc_stderr": 0.040201512610368445
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4792869946225851,
|
26 |
-
"acc_stderr": 0.004985498055190357,
|
27 |
-
"acc_norm": 0.6265684126667994,
|
28 |
-
"acc_norm_stderr": 0.004827266662144035
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5342960288808665,
|
32 |
-
"acc_stderr": 0.030025579819366422
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5753749013417522,
|
36 |
-
"acc_stderr": 0.013891893150264213
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7231427044361304,
|
40 |
-
"acc_stderr": 0.01034711289027692
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5700305810397553,
|
44 |
-
"acc_stderr": 0.008658853690729254
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.5984848484848485,
|
48 |
-
"acc_stderr": 0.010058790020755567,
|
49 |
-
"acc_norm": 0.5395622895622896,
|
50 |
-
"acc_norm_stderr": 0.01022761638628902
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.27986348122866894,
|
54 |
-
"acc_stderr": 0.013119040897725922,
|
55 |
-
"acc_norm": 0.31143344709897613,
|
56 |
-
"acc_norm_stderr": 0.013532472099850942
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.848,
|
60 |
-
"acc_stderr": 0.011358918303475274,
|
61 |
-
"acc_norm": 0.769,
|
62 |
-
"acc_norm_stderr": 0.013334797216936438
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7584330794341676,
|
66 |
-
"acc_stderr": 0.009986718001804467,
|
67 |
-
"acc_norm": 0.7633297062023939,
|
68 |
-
"acc_norm_stderr": 0.009916841655042809
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b28bc4/evaluation/4b284b28bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.34,
|
5 |
-
"acc_stderr": 0.014987482264363937
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.321,
|
9 |
-
"acc_stderr": 0.014770821817934644
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.34,
|
13 |
-
"acc_stderr": 0.013680495725767803
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.375,
|
17 |
-
"acc_stderr": 0.06527912098338669,
|
18 |
-
"f1": 0.32099491681373216
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.77,
|
22 |
-
"acc_stderr": 0.04229525846816506
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.48078072097191793,
|
26 |
-
"acc_stderr": 0.004986093791041653,
|
27 |
-
"acc_norm": 0.6337382991435969,
|
28 |
-
"acc_norm_stderr": 0.004807975515446487
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5740072202166066,
|
32 |
-
"acc_stderr": 0.029764956741777645
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.590370955011839,
|
36 |
-
"acc_stderr": 0.013821049109655453
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7204703367183325,
|
40 |
-
"acc_stderr": 0.01037770209970486
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5948012232415902,
|
44 |
-
"acc_stderr": 0.008586427929715515
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6262626262626263,
|
48 |
-
"acc_stderr": 0.009927267058259628,
|
49 |
-
"acc_norm": 0.5917508417508418,
|
50 |
-
"acc_norm_stderr": 0.01008556619579125
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.29266211604095566,
|
54 |
-
"acc_stderr": 0.013295916103619417,
|
55 |
-
"acc_norm": 0.32337883959044367,
|
56 |
-
"acc_norm_stderr": 0.013669421630012132
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.904,
|
60 |
-
"acc_stderr": 0.009320454434783227,
|
61 |
-
"acc_norm": 0.885,
|
62 |
-
"acc_norm_stderr": 0.01009340759490462
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7622415669205659,
|
66 |
-
"acc_stderr": 0.009932525779525489,
|
67 |
-
"acc_norm": 0.763873775843308,
|
68 |
-
"acc_norm_stderr": 0.009908965890558218
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b28bc4/evaluation/4b284b28bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.325,
|
5 |
-
"acc_stderr": 0.014818724459095526
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.325,
|
9 |
-
"acc_stderr": 0.014818724459095526
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3233333333333333,
|
13 |
-
"acc_stderr": 0.013508372867300217
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.25,
|
17 |
-
"acc_stderr": 0.058387420812114225,
|
18 |
-
"f1": 0.22987012987012986
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.79,
|
22 |
-
"acc_stderr": 0.040936018074033256
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.47988448516231824,
|
26 |
-
"acc_stderr": 0.004985741706385727,
|
27 |
-
"acc_norm": 0.6363274248157738,
|
28 |
-
"acc_norm_stderr": 0.004800728138792371
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5631768953068592,
|
32 |
-
"acc_stderr": 0.02985524739031495
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5824782951854776,
|
36 |
-
"acc_stderr": 0.013859978264440248
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7177979690005345,
|
40 |
-
"acc_stderr": 0.010407834479647673
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.627217125382263,
|
44 |
-
"acc_stderr": 0.008457255867914694
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6308922558922558,
|
48 |
-
"acc_stderr": 0.009901987410242742,
|
49 |
-
"acc_norm": 0.6123737373737373,
|
50 |
-
"acc_norm_stderr": 0.009997307914447612
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.30204778156996587,
|
54 |
-
"acc_stderr": 0.01341751914471642,
|
55 |
-
"acc_norm": 0.3216723549488055,
|
56 |
-
"acc_norm_stderr": 0.013650488084494162
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.914,
|
60 |
-
"acc_stderr": 0.008870325962594766,
|
61 |
-
"acc_norm": 0.883,
|
62 |
-
"acc_norm_stderr": 0.010169287802713329
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7606093579978237,
|
66 |
-
"acc_stderr": 0.009955884250291681,
|
67 |
-
"acc_norm": 0.76550598476605,
|
68 |
-
"acc_norm_stderr": 0.009885203143240543
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b28bc4/evaluation/4b284b28bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.325,
|
5 |
-
"acc_stderr": 0.014818724459095524
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.336,
|
9 |
-
"acc_stderr": 0.014944140233795021
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3233333333333333,
|
13 |
-
"acc_stderr": 0.013508372867300212
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.39285714285714285,
|
17 |
-
"acc_stderr": 0.0658538889806635,
|
18 |
-
"f1": 0.3565868967138097
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.81,
|
22 |
-
"acc_stderr": 0.03942772444036623
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4790878311093408,
|
26 |
-
"acc_stderr": 0.004985415250690914,
|
27 |
-
"acc_norm": 0.634833698466441,
|
28 |
-
"acc_norm_stderr": 0.004804927608773137
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.6064981949458483,
|
32 |
-
"acc_stderr": 0.029405839314203194
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.585635359116022,
|
36 |
-
"acc_stderr": 0.013844846232268563
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7295563869588455,
|
40 |
-
"acc_stderr": 0.010271810373331027
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6241590214067279,
|
44 |
-
"acc_stderr": 0.008471147248160107
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6372053872053872,
|
48 |
-
"acc_stderr": 0.009865936757013942,
|
49 |
-
"acc_norm": 0.6186868686868687,
|
50 |
-
"acc_norm_stderr": 0.009966542497171021
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.30119453924914674,
|
54 |
-
"acc_stderr": 0.013406741767847624,
|
55 |
-
"acc_norm": 0.32337883959044367,
|
56 |
-
"acc_norm_stderr": 0.01366942163001213
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.91,
|
60 |
-
"acc_stderr": 0.00905439020486644,
|
61 |
-
"acc_norm": 0.897,
|
62 |
-
"acc_norm_stderr": 0.009616833339695796
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7540805223068553,
|
66 |
-
"acc_stderr": 0.01004733186562519,
|
67 |
-
"acc_norm": 0.7687704026115343,
|
68 |
-
"acc_norm_stderr": 0.009837063180625334
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b28bc4/evaluation/4b284b28bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.345,
|
5 |
-
"acc_stderr": 0.015039986742055235
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.325,
|
9 |
-
"acc_stderr": 0.014818724459095526
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.31416666666666665,
|
13 |
-
"acc_stderr": 0.013405399314984096
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.39285714285714285,
|
17 |
-
"acc_stderr": 0.0658538889806635,
|
18 |
-
"f1": 0.3647495361781076
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.82,
|
22 |
-
"acc_stderr": 0.038612291966536955
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4819757020513842,
|
26 |
-
"acc_stderr": 0.004986538243846636,
|
27 |
-
"acc_norm": 0.6387173869747063,
|
28 |
-
"acc_norm_stderr": 0.004793904922401888
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.48736462093862815,
|
32 |
-
"acc_stderr": 0.030086851767188564
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5832675611681136,
|
36 |
-
"acc_stderr": 0.013856250072796322
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7386424371993586,
|
40 |
-
"acc_stderr": 0.010160471460690485
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6275229357798165,
|
44 |
-
"acc_stderr": 0.008455846866956085
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6405723905723906,
|
48 |
-
"acc_stderr": 0.009845958893373766,
|
49 |
-
"acc_norm": 0.6212121212121212,
|
50 |
-
"acc_norm_stderr": 0.00995373765654204
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.30204778156996587,
|
54 |
-
"acc_stderr": 0.01341751914471642,
|
55 |
-
"acc_norm": 0.32764505119453924,
|
56 |
-
"acc_norm_stderr": 0.013715847940719344
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.92,
|
60 |
-
"acc_stderr": 0.008583336977753653,
|
61 |
-
"acc_norm": 0.907,
|
62 |
-
"acc_norm_stderr": 0.009188875634996702
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7551686615886833,
|
66 |
-
"acc_stderr": 0.01003230910556879,
|
67 |
-
"acc_norm": 0.76550598476605,
|
68 |
-
"acc_norm_stderr": 0.00988520314324054
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b28bc4/evaluation/4b284b28bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json
DELETED
@@ -1,59 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.332,
|
5 |
-
"acc_stderr": 0.014899597242811475
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.316,
|
9 |
-
"acc_stderr": 0.014709193056057106
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.31666666666666665,
|
13 |
-
"acc_stderr": 0.013434078660827384
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.30357142857142855,
|
17 |
-
"acc_stderr": 0.06199938655510754,
|
18 |
-
"f1": 0.2503507986266607
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.8,
|
22 |
-
"acc_stderr": 0.040201512610368445
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4788886675960964,
|
26 |
-
"acc_stderr": 0.004985331652408345,
|
27 |
-
"acc_norm": 0.6412069308902609,
|
28 |
-
"acc_norm_stderr": 0.004786660691181937
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5740072202166066,
|
32 |
-
"acc_stderr": 0.02976495674177765
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5911602209944752,
|
36 |
-
"acc_stderr": 0.013816954295135684
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7279529663281668,
|
40 |
-
"acc_stderr": 0.010290888060871242
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.6275229357798165,
|
44 |
-
"acc_stderr": 0.008455846866956086
|
45 |
-
}
|
46 |
-
},
|
47 |
-
"versions": {
|
48 |
-
"anli_r1": 0,
|
49 |
-
"anli_r2": 0,
|
50 |
-
"anli_r3": 0,
|
51 |
-
"cb": 1,
|
52 |
-
"copa": 0,
|
53 |
-
"hellaswag": 0,
|
54 |
-
"rte": 0,
|
55 |
-
"winogrande": 0,
|
56 |
-
"storycloze_2016": 0,
|
57 |
-
"boolq": 1
|
58 |
-
}
|
59 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b28bc4/evaluation/{4b284b28bc4_0.json β rankeval/4b284b28bc4_0.json}
RENAMED
File without changes
|
4b284b28bc4/evaluation/{4b284b28bc4_1.json β rankeval/4b284b28bc4_1.json}
RENAMED
File without changes
|
4b284b28bc4/evaluation/{4b284b28bc4_2.json β rankeval/4b284b28bc4_2.json}
RENAMED
File without changes
|
4b284b28bc4/evaluation/{4b284b28bc4_3.json β rankeval/4b284b28bc4_3.json}
RENAMED
File without changes
|
4b284b28bc4/evaluation/{4b284b28bc4_4.json β rankeval/4b284b28bc4_4.json}
RENAMED
File without changes
|
4b284b28bc4/evaluation/{4b284b28bc4_5.json β rankeval/4b284b28bc4_5.json}
RENAMED
@@ -42,6 +42,30 @@
|
|
42 |
"boolq": {
|
43 |
"acc": 0.6275229357798165,
|
44 |
"acc_stderr": 0.008455846866956086
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
}
|
46 |
},
|
47 |
"versions": {
|
@@ -54,6 +78,10 @@
|
|
54 |
"rte": 0,
|
55 |
"winogrande": 0,
|
56 |
"storycloze_2016": 0,
|
57 |
-
"boolq": 1
|
|
|
|
|
|
|
|
|
58 |
}
|
59 |
}
|
|
|
42 |
"boolq": {
|
43 |
"acc": 0.6275229357798165,
|
44 |
"acc_stderr": 0.008455846866956086
|
45 |
+
},
|
46 |
+
"arc_easy": {
|
47 |
+
"acc": 0.6401515151515151,
|
48 |
+
"acc_stderr": 0.009848484848484846,
|
49 |
+
"acc_norm": 0.6296296296296297,
|
50 |
+
"acc_norm_stderr": 0.009908978578665755
|
51 |
+
},
|
52 |
+
"arc_challenge": {
|
53 |
+
"acc": 0.30887372013651876,
|
54 |
+
"acc_stderr": 0.013501770929344003,
|
55 |
+
"acc_norm": 0.32849829351535836,
|
56 |
+
"acc_norm_stderr": 0.013724978465537377
|
57 |
+
},
|
58 |
+
"sciq": {
|
59 |
+
"acc": 0.921,
|
60 |
+
"acc_stderr": 0.008534156773333445,
|
61 |
+
"acc_norm": 0.908,
|
62 |
+
"acc_norm_stderr": 0.00914437639315112
|
63 |
+
},
|
64 |
+
"piqa": {
|
65 |
+
"acc": 0.750272034820457,
|
66 |
+
"acc_stderr": 0.010099232969867486,
|
67 |
+
"acc_norm": 0.764961915125136,
|
68 |
+
"acc_norm_stderr": 0.009893146688805312
|
69 |
}
|
70 |
},
|
71 |
"versions": {
|
|
|
78 |
"rte": 0,
|
79 |
"winogrande": 0,
|
80 |
"storycloze_2016": 0,
|
81 |
+
"boolq": 1,
|
82 |
+
"arc_easy": 0,
|
83 |
+
"arc_challenge": 0,
|
84 |
+
"sciq": 0,
|
85 |
+
"piqa": 0
|
86 |
}
|
87 |
}
|
4b284b42bc4/evaluation/4b284b42bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.331,
|
5 |
-
"acc_stderr": 0.014888272588203931
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.342,
|
9 |
-
"acc_stderr": 0.01500870618212173
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.34,
|
13 |
-
"acc_stderr": 0.013680495725767784
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.5357142857142857,
|
17 |
-
"acc_stderr": 0.06724777654937658,
|
18 |
-
"f1": 0.45393112410656267
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.75,
|
22 |
-
"acc_stderr": 0.04351941398892446
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4833698466440948,
|
26 |
-
"acc_stderr": 0.004987020679861267,
|
27 |
-
"acc_norm": 0.63433578968333,
|
28 |
-
"acc_norm_stderr": 0.004806316342709393
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.5776173285198556,
|
32 |
-
"acc_stderr": 0.029731622646495887
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5864246250986582,
|
36 |
-
"acc_stderr": 0.013840971763195303
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7204703367183325,
|
40 |
-
"acc_stderr": 0.01037770209970486
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5253822629969419,
|
44 |
-
"acc_stderr": 0.0087337795418535
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6224747474747475,
|
48 |
-
"acc_stderr": 0.00994722783346943,
|
49 |
-
"acc_norm": 0.5462962962962963,
|
50 |
-
"acc_norm_stderr": 0.010215708295494117
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.27986348122866894,
|
54 |
-
"acc_stderr": 0.013119040897725922,
|
55 |
-
"acc_norm": 0.29266211604095566,
|
56 |
-
"acc_norm_stderr": 0.01329591610361942
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.837,
|
60 |
-
"acc_stderr": 0.011686212712746849,
|
61 |
-
"acc_norm": 0.757,
|
62 |
-
"acc_norm_stderr": 0.013569640199177458
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7448313384113167,
|
66 |
-
"acc_stderr": 0.010171571592521822,
|
67 |
-
"acc_norm": 0.76550598476605,
|
68 |
-
"acc_norm_stderr": 0.00988520314324054
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4b284b42bc4/evaluation/4b284b42bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"anli_r1": {
|
4 |
-
"acc": 0.31,
|
5 |
-
"acc_stderr": 0.014632638658632902
|
6 |
-
},
|
7 |
-
"anli_r2": {
|
8 |
-
"acc": 0.31,
|
9 |
-
"acc_stderr": 0.014632638658632905
|
10 |
-
},
|
11 |
-
"anli_r3": {
|
12 |
-
"acc": 0.3283333333333333,
|
13 |
-
"acc_stderr": 0.013562032919529017
|
14 |
-
},
|
15 |
-
"cb": {
|
16 |
-
"acc": 0.3392857142857143,
|
17 |
-
"acc_stderr": 0.06384226561930825,
|
18 |
-
"f1": 0.29749748849204566
|
19 |
-
},
|
20 |
-
"copa": {
|
21 |
-
"acc": 0.79,
|
22 |
-
"acc_stderr": 0.040936018074033256
|
23 |
-
},
|
24 |
-
"hellaswag": {
|
25 |
-
"acc": 0.4803823939454292,
|
26 |
-
"acc_stderr": 0.004985939292819582,
|
27 |
-
"acc_norm": 0.6294562836088429,
|
28 |
-
"acc_norm_stderr": 0.004819633668832538
|
29 |
-
},
|
30 |
-
"rte": {
|
31 |
-
"acc": 0.44765342960288806,
|
32 |
-
"acc_stderr": 0.02993107036293953
|
33 |
-
},
|
34 |
-
"winogrande": {
|
35 |
-
"acc": 0.5887924230465666,
|
36 |
-
"acc_stderr": 0.013829128358676874
|
37 |
-
},
|
38 |
-
"storycloze_2016": {
|
39 |
-
"acc": 0.7049706039551042,
|
40 |
-
"acc_stderr": 0.010546232606962289
|
41 |
-
},
|
42 |
-
"boolq": {
|
43 |
-
"acc": 0.5522935779816514,
|
44 |
-
"acc_stderr": 0.008697094687974059
|
45 |
-
},
|
46 |
-
"arc_easy": {
|
47 |
-
"acc": 0.6262626262626263,
|
48 |
-
"acc_stderr": 0.009927267058259621,
|
49 |
-
"acc_norm": 0.5934343434343434,
|
50 |
-
"acc_norm_stderr": 0.010079056419223527
|
51 |
-
},
|
52 |
-
"arc_challenge": {
|
53 |
-
"acc": 0.2883959044368601,
|
54 |
-
"acc_stderr": 0.013238394422428173,
|
55 |
-
"acc_norm": 0.3148464163822526,
|
56 |
-
"acc_norm_stderr": 0.01357265770308495
|
57 |
-
},
|
58 |
-
"sciq": {
|
59 |
-
"acc": 0.892,
|
60 |
-
"acc_stderr": 0.0098200016513457,
|
61 |
-
"acc_norm": 0.869,
|
62 |
-
"acc_norm_stderr": 0.010674874844837954
|
63 |
-
},
|
64 |
-
"piqa": {
|
65 |
-
"acc": 0.7486398258977149,
|
66 |
-
"acc_stderr": 0.010121156016819259,
|
67 |
-
"acc_norm": 0.7633297062023939,
|
68 |
-
"acc_norm_stderr": 0.009916841655042809
|
69 |
-
}
|
70 |
-
},
|
71 |
-
"versions": {
|
72 |
-
"anli_r1": 0,
|
73 |
-
"anli_r2": 0,
|
74 |
-
"anli_r3": 0,
|
75 |
-
"cb": 1,
|
76 |
-
"copa": 0,
|
77 |
-
"hellaswag": 0,
|
78 |
-
"rte": 0,
|
79 |
-
"winogrande": 0,
|
80 |
-
"storycloze_2016": 0,
|
81 |
-
"boolq": 1,
|
82 |
-
"arc_easy": 0,
|
83 |
-
"arc_challenge": 0,
|
84 |
-
"sciq": 0,
|
85 |
-
"piqa": 0
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|