Muennighoff committed on
Commit
b851397
·
1 Parent(s): a1243c9
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. 4b284b12bc4/evaluation/4b284b12bc4_0_lm-eval_global_step80108_2023-01-30-11-23-34_0shots_backup.json +0 -87
  2. 4b284b12bc4/evaluation/4b284b12bc4_1_lm-eval_global_step80108_2023-01-30-11-26-32_1shots_backup.json +0 -87
  3. 4b284b12bc4/evaluation/4b284b12bc4_2_lm-eval_global_step80108_2023-01-30-11-26-32_2shots_backup.json +0 -87
  4. 4b284b12bc4/evaluation/4b284b12bc4_3_lm-eval_global_step80108_2023-01-30-11-26-31_3shots_backup.json +0 -87
  5. 4b284b12bc4/evaluation/4b284b12bc4_4_lm-eval_global_step80108_2023-01-30-11-26-32_4shots_backup.json +0 -87
  6. 4b284b12bc4/evaluation/4b284b12bc4_5_lm-eval_global_step80108_2023-01-30-11-26-32_5shots_backup.json +0 -66
  7. 4b284b12bc4/evaluation/{4b284b12bc4_0.json → rankeval/4b284b12bc4_0.json} +0 -0
  8. 4b284b12bc4/evaluation/{4b284b12bc4_1.json → rankeval/4b284b12bc4_1.json} +0 -0
  9. 4b284b12bc4/evaluation/{4b284b12bc4_2.json → rankeval/4b284b12bc4_2.json} +0 -0
  10. 4b284b12bc4/evaluation/{4b284b12bc4_3.json → rankeval/4b284b12bc4_3.json} +0 -0
  11. 4b284b12bc4/evaluation/{4b284b12bc4_4.json → rankeval/4b284b12bc4_4.json} +0 -0
  12. 4b284b12bc4/evaluation/{4b284b12bc4_5.json → rankeval/4b284b12bc4_5.json} +22 -1
  13. 4b284b17bc4/evaluation/4b284b17bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json +0 -87
  14. 4b284b17bc4/evaluation/4b284b17bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json +0 -87
  15. 4b284b17bc4/evaluation/4b284b17bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json +0 -87
  16. 4b284b17bc4/evaluation/4b284b17bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json +0 -87
  17. 4b284b17bc4/evaluation/4b284b17bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json +0 -87
  18. 4b284b17bc4/evaluation/4b284b17bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json +0 -73
  19. 4b284b17bc4/evaluation/{4b284b17bc4_0.json → rankeval/4b284b17bc4_0.json} +0 -0
  20. 4b284b17bc4/evaluation/{4b284b17bc4_1.json → rankeval/4b284b17bc4_1.json} +0 -0
  21. 4b284b17bc4/evaluation/{4b284b17bc4_2.json → rankeval/4b284b17bc4_2.json} +0 -0
  22. 4b284b17bc4/evaluation/{4b284b17bc4_3.json → rankeval/4b284b17bc4_3.json} +0 -0
  23. 4b284b17bc4/evaluation/{4b284b17bc4_4.json → rankeval/4b284b17bc4_4.json} +0 -0
  24. 4b284b17bc4/evaluation/{4b284b17bc4_5.json → rankeval/4b284b17bc4_5.json} +15 -1
  25. 4b284b21bc4/evaluation/4b284b21bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json +0 -87
  26. 4b284b21bc4/evaluation/4b284b21bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json +0 -87
  27. 4b284b21bc4/evaluation/4b284b21bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json +0 -87
  28. 4b284b21bc4/evaluation/4b284b21bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json +0 -87
  29. 4b284b21bc4/evaluation/4b284b21bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json +0 -87
  30. 4b284b21bc4/evaluation/4b284b21bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json +0 -66
  31. 4b284b21bc4/evaluation/{4b284b21bc4_0.json → rankeval/4b284b21bc4_0.json} +0 -0
  32. 4b284b21bc4/evaluation/{4b284b21bc4_1.json → rankeval/4b284b21bc4_1.json} +0 -0
  33. 4b284b21bc4/evaluation/{4b284b21bc4_2.json → rankeval/4b284b21bc4_2.json} +0 -0
  34. 4b284b21bc4/evaluation/{4b284b21bc4_3.json → rankeval/4b284b21bc4_3.json} +0 -0
  35. 4b284b21bc4/evaluation/{4b284b21bc4_4.json → rankeval/4b284b21bc4_4.json} +0 -0
  36. 4b284b21bc4/evaluation/{4b284b21bc4_5.json → rankeval/4b284b21bc4_5.json} +22 -1
  37. 4b284b28bc4/evaluation/4b284b28bc4_0_lm-eval_global_step80108_2023-01-30-11-26-39_0shots_backup.json +0 -87
  38. 4b284b28bc4/evaluation/4b284b28bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json +0 -87
  39. 4b284b28bc4/evaluation/4b284b28bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json +0 -87
  40. 4b284b28bc4/evaluation/4b284b28bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json +0 -87
  41. 4b284b28bc4/evaluation/4b284b28bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json +0 -87
  42. 4b284b28bc4/evaluation/4b284b28bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json +0 -59
  43. 4b284b28bc4/evaluation/{4b284b28bc4_0.json → rankeval/4b284b28bc4_0.json} +0 -0
  44. 4b284b28bc4/evaluation/{4b284b28bc4_1.json → rankeval/4b284b28bc4_1.json} +0 -0
  45. 4b284b28bc4/evaluation/{4b284b28bc4_2.json → rankeval/4b284b28bc4_2.json} +0 -0
  46. 4b284b28bc4/evaluation/{4b284b28bc4_3.json → rankeval/4b284b28bc4_3.json} +0 -0
  47. 4b284b28bc4/evaluation/{4b284b28bc4_4.json → rankeval/4b284b28bc4_4.json} +0 -0
  48. 4b284b28bc4/evaluation/{4b284b28bc4_5.json → rankeval/4b284b28bc4_5.json} +29 -1
  49. 4b284b42bc4/evaluation/4b284b42bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json +0 -87
  50. 4b284b42bc4/evaluation/4b284b42bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json +0 -87
4b284b12bc4/evaluation/4b284b12bc4_0_lm-eval_global_step80108_2023-01-30-11-23-34_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.335,
5
- "acc_stderr": 0.014933117490932575
6
- },
7
- "anli_r2": {
8
- "acc": 0.334,
9
- "acc_stderr": 0.014922019523732961
10
- },
11
- "anli_r3": {
12
- "acc": 0.3491666666666667,
13
- "acc_stderr": 0.013767075395077249
14
- },
15
- "cb": {
16
- "acc": 0.39285714285714285,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.23306878306878312
19
- },
20
- "copa": {
21
- "acc": 0.77,
22
- "acc_stderr": 0.04229525846816506
23
- },
24
- "hellaswag": {
25
- "acc": 0.4695279824736108,
26
- "acc_stderr": 0.0049805063294075845,
27
- "acc_norm": 0.6132244572794264,
28
- "acc_norm_stderr": 0.004860162076330956
29
- },
30
- "rte": {
31
- "acc": 0.5812274368231047,
32
- "acc_stderr": 0.02969666108123484
33
- },
34
- "winogrande": {
35
- "acc": 0.5753749013417522,
36
- "acc_stderr": 0.013891893150264218
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.711918760021379,
40
- "acc_stderr": 0.010472537019822578
41
- },
42
- "boolq": {
43
- "acc": 0.5464831804281346,
44
- "acc_stderr": 0.008707182331111644
45
- },
46
- "arc_easy": {
47
- "acc": 0.5538720538720538,
48
- "acc_stderr": 0.01020005782876501,
49
- "acc_norm": 0.4936868686868687,
50
- "acc_norm_stderr": 0.01025896566804443
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2636518771331058,
54
- "acc_stderr": 0.012875929151297049,
55
- "acc_norm": 0.2883959044368601,
56
- "acc_norm_stderr": 0.013238394422428175
57
- },
58
- "sciq": {
59
- "acc": 0.82,
60
- "acc_stderr": 0.012155153135511965,
61
- "acc_norm": 0.749,
62
- "acc_norm_stderr": 0.013718133516888921
63
- },
64
- "piqa": {
65
- "acc": 0.73449401523395,
66
- "acc_stderr": 0.010303308653024429,
67
- "acc_norm": 0.7475516866158868,
68
- "acc_norm_stderr": 0.010135665547362354
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b12bc4/evaluation/4b284b12bc4_1_lm-eval_global_step80108_2023-01-30-11-26-32_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.333,
5
- "acc_stderr": 0.014910846164229868
6
- },
7
- "anli_r2": {
8
- "acc": 0.326,
9
- "acc_stderr": 0.01483050720454104
10
- },
11
- "anli_r3": {
12
- "acc": 0.3475,
13
- "acc_stderr": 0.013751753243291852
14
- },
15
- "cb": {
16
- "acc": 0.5357142857142857,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.37227304714989445
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.47191794463254333,
26
- "acc_stderr": 0.004981905293878145,
27
- "acc_norm": 0.6139215295757817,
28
- "acc_norm_stderr": 0.004858539527872466
29
- },
30
- "rte": {
31
- "acc": 0.5703971119133574,
32
- "acc_stderr": 0.029796668829124674
33
- },
34
- "winogrande": {
35
- "acc": 0.5706393054459353,
36
- "acc_stderr": 0.013911537499969163
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7151256012827365,
40
- "acc_stderr": 0.01043751398661172
41
- },
42
- "boolq": {
43
- "acc": 0.5669724770642202,
44
- "acc_stderr": 0.00866625130551806
45
- },
46
- "arc_easy": {
47
- "acc": 0.5913299663299664,
48
- "acc_stderr": 0.010087174498762883,
49
- "acc_norm": 0.5496632996632996,
50
- "acc_norm_stderr": 0.010209047724374145
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2627986348122867,
54
- "acc_stderr": 0.012862523175351333,
55
- "acc_norm": 0.30716723549488056,
56
- "acc_norm_stderr": 0.013481034054980943
57
- },
58
- "sciq": {
59
- "acc": 0.836,
60
- "acc_stderr": 0.011715000693181331,
61
- "acc_norm": 0.781,
62
- "acc_norm_stderr": 0.013084731950262012
63
- },
64
- "piqa": {
65
- "acc": 0.7448313384113167,
66
- "acc_stderr": 0.010171571592521822,
67
- "acc_norm": 0.7535364526659413,
68
- "acc_norm_stderr": 0.01005481078967181
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b12bc4/evaluation/4b284b12bc4_2_lm-eval_global_step80108_2023-01-30-11-26-32_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.329,
5
- "acc_stderr": 0.014865395385928354
6
- },
7
- "anli_r2": {
8
- "acc": 0.336,
9
- "acc_stderr": 0.014944140233795027
10
- },
11
- "anli_r3": {
12
- "acc": 0.3383333333333333,
13
- "acc_stderr": 0.013664144006618266
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.06737697508644648,
18
- "f1": 0.3338011695906433
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.4697271459868552,
26
- "acc_stderr": 0.004980627287147585,
27
- "acc_norm": 0.6141206930890261,
28
- "acc_norm_stderr": 0.004858074013443988
29
- },
30
- "rte": {
31
- "acc": 0.5523465703971119,
32
- "acc_stderr": 0.02993107036293953
33
- },
34
- "winogrande": {
35
- "acc": 0.574585635359116,
36
- "acc_stderr": 0.013895257666646378
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7156600748262961,
40
- "acc_stderr": 0.010431614128665253
41
- },
42
- "boolq": {
43
- "acc": 0.5660550458715596,
44
- "acc_stderr": 0.008668405003744129
45
- },
46
- "arc_easy": {
47
- "acc": 0.5993265993265994,
48
- "acc_stderr": 0.01005530447425557,
49
- "acc_norm": 0.5576599326599326,
50
- "acc_norm_stderr": 0.01019133444422085
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2781569965870307,
54
- "acc_stderr": 0.013094469919538805,
55
- "acc_norm": 0.30887372013651876,
56
- "acc_norm_stderr": 0.013501770929344003
57
- },
58
- "sciq": {
59
- "acc": 0.835,
60
- "acc_stderr": 0.011743632866916145,
61
- "acc_norm": 0.79,
62
- "acc_norm_stderr": 0.01288666233227453
63
- },
64
- "piqa": {
65
- "acc": 0.7470076169749728,
66
- "acc_stderr": 0.01014288869886246,
67
- "acc_norm": 0.7519042437431991,
68
- "acc_norm_stderr": 0.010077118315574706
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b12bc4/evaluation/4b284b12bc4_3_lm-eval_global_step80108_2023-01-30-11-26-31_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.332,
5
- "acc_stderr": 0.014899597242811485
6
- },
7
- "anli_r2": {
8
- "acc": 0.334,
9
- "acc_stderr": 0.014922019523732963
10
- },
11
- "anli_r3": {
12
- "acc": 0.35,
13
- "acc_stderr": 0.013774667009018554
14
- },
15
- "cb": {
16
- "acc": 0.6071428571428571,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.42400932400932395
19
- },
20
- "copa": {
21
- "acc": 0.81,
22
- "acc_stderr": 0.03942772444036622
23
- },
24
- "hellaswag": {
25
- "acc": 0.47241585341565423,
26
- "acc_stderr": 0.004982182323923561,
27
- "acc_norm": 0.6199960167297351,
28
- "acc_norm_stderr": 0.004843954338451449
29
- },
30
- "rte": {
31
- "acc": 0.5379061371841155,
32
- "acc_stderr": 0.030009848912529113
33
- },
34
- "winogrande": {
35
- "acc": 0.5737963693764798,
36
- "acc_stderr": 0.013898585965412338
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7124532335649385,
40
- "acc_stderr": 0.010466744473098363
41
- },
42
- "boolq": {
43
- "acc": 0.5587155963302752,
44
- "acc_stderr": 0.008684548127832637
45
- },
46
- "arc_easy": {
47
- "acc": 0.5955387205387206,
48
- "acc_stderr": 0.010070746648278783,
49
- "acc_norm": 0.5740740740740741,
50
- "acc_norm_stderr": 0.010146568651002255
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2815699658703072,
54
- "acc_stderr": 0.013143376735009022,
55
- "acc_norm": 0.3122866894197952,
56
- "acc_norm_stderr": 0.013542598541688067
57
- },
58
- "sciq": {
59
- "acc": 0.841,
60
- "acc_stderr": 0.01156947936827129,
61
- "acc_norm": 0.796,
62
- "acc_norm_stderr": 0.012749374359024384
63
- },
64
- "piqa": {
65
- "acc": 0.7513601741022851,
66
- "acc_stderr": 0.01008451123429685,
67
- "acc_norm": 0.7578890097932536,
68
- "acc_norm_stderr": 0.009994371269104397
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b12bc4/evaluation/4b284b12bc4_4_lm-eval_global_step80108_2023-01-30-11-26-32_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.337,
5
- "acc_stderr": 0.014955087918653603
6
- },
7
- "anli_r2": {
8
- "acc": 0.349,
9
- "acc_stderr": 0.015080663991563102
10
- },
11
- "anli_r3": {
12
- "acc": 0.36666666666666664,
13
- "acc_stderr": 0.013916893275819938
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.067031892279424,
18
- "f1": 0.3176100628930817
19
- },
20
- "copa": {
21
- "acc": 0.8,
22
- "acc_stderr": 0.040201512610368445
23
- },
24
- "hellaswag": {
25
- "acc": 0.4722166899024099,
26
- "acc_stderr": 0.004982072108448081,
27
- "acc_norm": 0.6184027086237801,
28
- "acc_norm_stderr": 0.004847857546957481
29
- },
30
- "rte": {
31
- "acc": 0.5379061371841155,
32
- "acc_stderr": 0.03000984891252911
33
- },
34
- "winogrande": {
35
- "acc": 0.56353591160221,
36
- "acc_stderr": 0.013938569465677023
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7194013896312133,
40
- "acc_stderr": 0.010389809647288821
41
- },
42
- "boolq": {
43
- "acc": 0.5636085626911315,
44
- "acc_stderr": 0.008674000467432068
45
- },
46
- "arc_easy": {
47
- "acc": 0.6039562289562289,
48
- "acc_stderr": 0.010035580962097942,
49
- "acc_norm": 0.5702861952861953,
50
- "acc_norm_stderr": 0.010157908005763674
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2790102389078498,
54
- "acc_stderr": 0.013106784883601346,
55
- "acc_norm": 0.3165529010238908,
56
- "acc_norm_stderr": 0.013592431519068077
57
- },
58
- "sciq": {
59
- "acc": 0.842,
60
- "acc_stderr": 0.011539894677559568,
61
- "acc_norm": 0.789,
62
- "acc_norm_stderr": 0.012909130321042092
63
- },
64
- "piqa": {
65
- "acc": 0.7431991294885746,
66
- "acc_stderr": 0.010192864802278045,
67
- "acc_norm": 0.7568008705114254,
68
- "acc_norm_stderr": 0.010009611953858915
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b12bc4/evaluation/4b284b12bc4_5_lm-eval_global_step80108_2023-01-30-11-26-32_5shots_backup.json DELETED
@@ -1,66 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.332,
5
- "acc_stderr": 0.014899597242811487
6
- },
7
- "anli_r2": {
8
- "acc": 0.329,
9
- "acc_stderr": 0.014865395385928357
10
- },
11
- "anli_r3": {
12
- "acc": 0.3541666666666667,
13
- "acc_stderr": 0.013811933499570954
14
- },
15
- "cb": {
16
- "acc": 0.5535714285714286,
17
- "acc_stderr": 0.06703189227942395,
18
- "f1": 0.38376730002345766
19
- },
20
- "copa": {
21
- "acc": 0.81,
22
- "acc_stderr": 0.03942772444036623
23
- },
24
- "hellaswag": {
25
- "acc": 0.47400916152160927,
26
- "acc_stderr": 0.004983035420235716,
27
- "acc_norm": 0.619896434973113,
28
- "acc_norm_stderr": 0.004844199910173026
29
- },
30
- "rte": {
31
- "acc": 0.516245487364621,
32
- "acc_stderr": 0.030080573208738064
33
- },
34
- "winogrande": {
35
- "acc": 0.5722178374112076,
36
- "acc_stderr": 0.013905134013839944
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7177979690005345,
40
- "acc_stderr": 0.010407834479647675
41
- },
42
- "boolq": {
43
- "acc": 0.5648318042813456,
44
- "acc_stderr": 0.008671229580582118
45
- },
46
- "arc_easy": {
47
- "acc": 0.5997474747474747,
48
- "acc_stderr": 0.010053550119896127,
49
- "acc_norm": 0.569023569023569,
50
- "acc_norm_stderr": 0.010161552863493746
51
- }
52
- },
53
- "versions": {
54
- "anli_r1": 0,
55
- "anli_r2": 0,
56
- "anli_r3": 0,
57
- "cb": 1,
58
- "copa": 0,
59
- "hellaswag": 0,
60
- "rte": 0,
61
- "winogrande": 0,
62
- "storycloze_2016": 0,
63
- "boolq": 1,
64
- "arc_easy": 0
65
- }
66
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b12bc4/evaluation/{4b284b12bc4_0.json → rankeval/4b284b12bc4_0.json} RENAMED
File without changes
4b284b12bc4/evaluation/{4b284b12bc4_1.json → rankeval/4b284b12bc4_1.json} RENAMED
File without changes
4b284b12bc4/evaluation/{4b284b12bc4_2.json → rankeval/4b284b12bc4_2.json} RENAMED
File without changes
4b284b12bc4/evaluation/{4b284b12bc4_3.json → rankeval/4b284b12bc4_3.json} RENAMED
File without changes
4b284b12bc4/evaluation/{4b284b12bc4_4.json → rankeval/4b284b12bc4_4.json} RENAMED
File without changes
4b284b12bc4/evaluation/{4b284b12bc4_5.json → rankeval/4b284b12bc4_5.json} RENAMED
@@ -48,6 +48,24 @@
48
  "acc_stderr": 0.010053550119896127,
49
  "acc_norm": 0.569023569023569,
50
  "acc_norm_stderr": 0.010161552863493746
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  }
52
  },
53
  "versions": {
@@ -61,6 +79,9 @@
61
  "winogrande": 0,
62
  "storycloze_2016": 0,
63
  "boolq": 1,
64
- "arc_easy": 0
 
 
 
65
  }
66
  }
 
48
  "acc_stderr": 0.010053550119896127,
49
  "acc_norm": 0.569023569023569,
50
  "acc_norm_stderr": 0.010161552863493746
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.27559726962457337,
54
+ "acc_stderr": 0.01305716965576184,
55
+ "acc_norm": 0.31569965870307165,
56
+ "acc_norm_stderr": 0.013582571095815291
57
+ },
58
+ "sciq": {
59
+ "acc": 0.844,
60
+ "acc_stderr": 0.01148023500612236,
61
+ "acc_norm": 0.794,
62
+ "acc_norm_stderr": 0.012795613612786551
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7399347116430903,
66
+ "acc_stderr": 0.0102348932490613,
67
+ "acc_norm": 0.7595212187159956,
68
+ "acc_norm_stderr": 0.009971345364651064
69
  }
70
  },
71
  "versions": {
 
79
  "winogrande": 0,
80
  "storycloze_2016": 0,
81
  "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
4b284b17bc4/evaluation/4b284b17bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.332,
5
- "acc_stderr": 0.014899597242811478
6
- },
7
- "anli_r2": {
8
- "acc": 0.329,
9
- "acc_stderr": 0.014865395385928362
10
- },
11
- "anli_r3": {
12
- "acc": 0.34833333333333333,
13
- "acc_stderr": 0.013759437498874075
14
- },
15
- "cb": {
16
- "acc": 0.5714285714285714,
17
- "acc_stderr": 0.06672848092813058,
18
- "f1": 0.3888888888888889
19
- },
20
- "copa": {
21
- "acc": 0.76,
22
- "acc_stderr": 0.04292346959909283
23
- },
24
- "hellaswag": {
25
- "acc": 0.469627564230233,
26
- "acc_stderr": 0.004980566907790459,
27
- "acc_norm": 0.6134236207926708,
28
- "acc_norm_stderr": 0.004859699562451462
29
- },
30
- "rte": {
31
- "acc": 0.5415162454873647,
32
- "acc_stderr": 0.029992535385373314
33
- },
34
- "winogrande": {
35
- "acc": 0.5737963693764798,
36
- "acc_stderr": 0.013898585965412338
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7108498129342598,
40
- "acc_stderr": 0.010484068799942072
41
- },
42
- "boolq": {
43
- "acc": 0.5623853211009174,
44
- "acc_stderr": 0.008676717715731632
45
- },
46
- "arc_easy": {
47
- "acc": 0.6052188552188552,
48
- "acc_stderr": 0.010030038935883584,
49
- "acc_norm": 0.5429292929292929,
50
- "acc_norm_stderr": 0.01022189756425604
51
- },
52
- "arc_challenge": {
53
- "acc": 0.26791808873720135,
54
- "acc_stderr": 0.012942030195136437,
55
- "acc_norm": 0.2883959044368601,
56
- "acc_norm_stderr": 0.013238394422428171
57
- },
58
- "sciq": {
59
- "acc": 0.852,
60
- "acc_stderr": 0.011234866364235235,
61
- "acc_norm": 0.764,
62
- "acc_norm_stderr": 0.013434451402438678
63
- },
64
- "piqa": {
65
- "acc": 0.7578890097932536,
66
- "acc_stderr": 0.00999437126910438,
67
- "acc_norm": 0.7622415669205659,
68
- "acc_norm_stderr": 0.009932525779525492
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b17bc4/evaluation/4b284b17bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.32,
5
- "acc_stderr": 0.014758652303574886
6
- },
7
- "anli_r2": {
8
- "acc": 0.324,
9
- "acc_stderr": 0.014806864733738854
10
- },
11
- "anli_r3": {
12
- "acc": 0.3491666666666667,
13
- "acc_stderr": 0.01376707539507725
14
- },
15
- "cb": {
16
- "acc": 0.5535714285714286,
17
- "acc_stderr": 0.06703189227942397,
18
- "f1": 0.3890671420083185
19
- },
20
- "copa": {
21
- "acc": 0.75,
22
- "acc_stderr": 0.04351941398892446
23
- },
24
- "hellaswag": {
25
- "acc": 0.4640509858593906,
26
- "acc_stderr": 0.0049768677965835555,
27
- "acc_norm": 0.6082453694483171,
28
- "acc_norm_stderr": 0.004871447106554927
29
- },
30
- "rte": {
31
- "acc": 0.5451263537906137,
32
- "acc_stderr": 0.029973636495415252
33
- },
34
- "winogrande": {
35
- "acc": 0.574585635359116,
36
- "acc_stderr": 0.013895257666646378
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.711918760021379,
40
- "acc_stderr": 0.010472537019822582
41
- },
42
- "boolq": {
43
- "acc": 0.5409785932721712,
44
- "acc_stderr": 0.008715635308774412
45
- },
46
- "arc_easy": {
47
- "acc": 0.6342592592592593,
48
- "acc_stderr": 0.009882988069418829,
49
- "acc_norm": 0.5837542087542088,
50
- "acc_norm_stderr": 0.01011481940450087
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2901023890784983,
54
- "acc_stderr": 0.013261573677520764,
55
- "acc_norm": 0.30119453924914674,
56
- "acc_norm_stderr": 0.013406741767847638
57
- },
58
- "sciq": {
59
- "acc": 0.896,
60
- "acc_stderr": 0.009658016218524301,
61
- "acc_norm": 0.88,
62
- "acc_norm_stderr": 0.010281328012747386
63
- },
64
- "piqa": {
65
- "acc": 0.7551686615886833,
66
- "acc_stderr": 0.010032309105568793,
67
- "acc_norm": 0.766050054406964,
68
- "acc_norm_stderr": 0.009877236895137436
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b17bc4/evaluation/4b284b17bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.343,
5
- "acc_stderr": 0.015019206922356953
6
- },
7
- "anli_r2": {
8
- "acc": 0.318,
9
- "acc_stderr": 0.014734079309311901
10
- },
11
- "anli_r3": {
12
- "acc": 0.325,
13
- "acc_stderr": 0.013526454480351028
14
- },
15
- "cb": {
16
- "acc": 0.42857142857142855,
17
- "acc_stderr": 0.06672848092813058,
18
- "f1": 0.3058470764617691
19
- },
20
- "copa": {
21
- "acc": 0.78,
22
- "acc_stderr": 0.04163331998932263
23
- },
24
- "hellaswag": {
25
- "acc": 0.45727942640908187,
26
- "acc_stderr": 0.004971534874389935,
27
- "acc_norm": 0.602867954590719,
28
- "acc_norm_stderr": 0.004883037758919964
29
- },
30
- "rte": {
31
- "acc": 0.48736462093862815,
32
- "acc_stderr": 0.030086851767188564
33
- },
34
- "winogrande": {
35
- "acc": 0.5808997632202052,
36
- "acc_stderr": 0.013867325192210116
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7215392838054516,
40
- "acc_stderr": 0.010365521460604415
41
- },
42
- "boolq": {
43
- "acc": 0.5489296636085627,
44
- "acc_stderr": 0.008703080962379622
45
- },
46
- "arc_easy": {
47
- "acc": 0.6325757575757576,
48
- "acc_stderr": 0.009892552616211558,
49
- "acc_norm": 0.617003367003367,
50
- "acc_norm_stderr": 0.009974920384536479
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2901023890784983,
54
- "acc_stderr": 0.013261573677520759,
55
- "acc_norm": 0.31313993174061433,
56
- "acc_norm_stderr": 0.013552671543623496
57
- },
58
- "sciq": {
59
- "acc": 0.906,
60
- "acc_stderr": 0.009233052000787738,
61
- "acc_norm": 0.891,
62
- "acc_norm_stderr": 0.009859828407037186
63
- },
64
- "piqa": {
65
- "acc": 0.7540805223068553,
66
- "acc_stderr": 0.010047331865625194,
67
- "acc_norm": 0.7698585418933623,
68
- "acc_norm_stderr": 0.009820832826839796
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b17bc4/evaluation/4b284b17bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.348,
5
- "acc_stderr": 0.015070604603768408
6
- },
7
- "anli_r2": {
8
- "acc": 0.36,
9
- "acc_stderr": 0.01518652793204012
10
- },
11
- "anli_r3": {
12
- "acc": 0.35083333333333333,
13
- "acc_stderr": 0.013782212417178195
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.0673769750864465,
18
- "f1": 0.40387403446226977
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.4567815176259709,
26
- "acc_stderr": 0.004971106265046551,
27
- "acc_norm": 0.5992830113523202,
28
- "acc_norm_stderr": 0.004890422457747258
29
- },
30
- "rte": {
31
- "acc": 0.48375451263537905,
32
- "acc_stderr": 0.030080573208738064
33
- },
34
- "winogrande": {
35
- "acc": 0.569060773480663,
36
- "acc_stderr": 0.013917796623335966
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7247461250668092,
40
- "acc_stderr": 0.010328538400500567
41
- },
42
- "boolq": {
43
- "acc": 0.5498470948012233,
44
- "acc_stderr": 0.008701488203356937
45
- },
46
- "arc_easy": {
47
- "acc": 0.6266835016835017,
48
- "acc_stderr": 0.009925009142802903,
49
- "acc_norm": 0.6203703703703703,
50
- "acc_norm_stderr": 0.009958037725468558
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2901023890784983,
54
- "acc_stderr": 0.013261573677520769,
55
- "acc_norm": 0.31143344709897613,
56
- "acc_norm_stderr": 0.013532472099850949
57
- },
58
- "sciq": {
59
- "acc": 0.923,
60
- "acc_stderr": 0.008434580140240632,
61
- "acc_norm": 0.903,
62
- "acc_norm_stderr": 0.00936368937324812
63
- },
64
- "piqa": {
65
- "acc": 0.7578890097932536,
66
- "acc_stderr": 0.009994371269104387,
67
- "acc_norm": 0.7682263329706203,
68
- "acc_norm_stderr": 0.00984514377279405
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b17bc4/evaluation/4b284b17bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.36,
5
- "acc_stderr": 0.015186527932040117
6
- },
7
- "anli_r2": {
8
- "acc": 0.347,
9
- "acc_stderr": 0.015060472031706625
10
- },
11
- "anli_r3": {
12
- "acc": 0.3625,
13
- "acc_stderr": 0.01388303787422552
14
- },
15
- "cb": {
16
- "acc": 0.5535714285714286,
17
- "acc_stderr": 0.06703189227942395,
18
- "f1": 0.4538378958668814
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.45180242979486157,
26
- "acc_stderr": 0.004966544724452227,
27
- "acc_norm": 0.5955984863572994,
28
- "acc_norm_stderr": 0.004897728370737246
29
- },
30
- "rte": {
31
- "acc": 0.48375451263537905,
32
- "acc_stderr": 0.030080573208738064
33
- },
34
- "winogrande": {
35
- "acc": 0.5706393054459353,
36
- "acc_stderr": 0.013911537499969163
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7177979690005345,
40
- "acc_stderr": 0.010407834479647672
41
- },
42
- "boolq": {
43
- "acc": 0.545565749235474,
44
- "acc_stderr": 0.008708665643758015
45
- },
46
- "arc_easy": {
47
- "acc": 0.640993265993266,
48
- "acc_stderr": 0.009843424713072174,
49
- "acc_norm": 0.6186868686868687,
50
- "acc_norm_stderr": 0.009966542497171025
51
- },
52
- "arc_challenge": {
53
- "acc": 0.302901023890785,
54
- "acc_stderr": 0.013428241573185349,
55
- "acc_norm": 0.32337883959044367,
56
- "acc_norm_stderr": 0.013669421630012129
57
- },
58
- "sciq": {
59
- "acc": 0.915,
60
- "acc_stderr": 0.008823426366942331,
61
- "acc_norm": 0.912,
62
- "acc_norm_stderr": 0.008963053962592085
63
- },
64
- "piqa": {
65
- "acc": 0.7578890097932536,
66
- "acc_stderr": 0.009994371269104385,
67
- "acc_norm": 0.7752992383025027,
68
- "acc_norm_stderr": 0.009738282586548389
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b17bc4/evaluation/4b284b17bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json DELETED
@@ -1,73 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.363,
5
- "acc_stderr": 0.015213890444671281
6
- },
7
- "anli_r2": {
8
- "acc": 0.347,
9
- "acc_stderr": 0.015060472031706624
10
- },
11
- "anli_r3": {
12
- "acc": 0.34,
13
- "acc_stderr": 0.013680495725767794
14
- },
15
- "cb": {
16
- "acc": 0.5535714285714286,
17
- "acc_stderr": 0.06703189227942397,
18
- "f1": 0.3974410235905637
19
- },
20
- "copa": {
21
- "acc": 0.81,
22
- "acc_stderr": 0.03942772444036623
23
- },
24
- "hellaswag": {
25
- "acc": 0.44981079466241786,
26
- "acc_stderr": 0.004964579685712439,
27
- "acc_norm": 0.6002788289185421,
28
- "acc_norm_stderr": 0.004888398535520516
29
- },
30
- "rte": {
31
- "acc": 0.49097472924187724,
32
- "acc_stderr": 0.030091559826331334
33
- },
34
- "winogrande": {
35
- "acc": 0.5785319652722968,
36
- "acc_stderr": 0.013878072377497603
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7113842864778194,
40
- "acc_stderr": 0.01047831178564294
41
- },
42
- "boolq": {
43
- "acc": 0.5376146788990825,
44
- "acc_stderr": 0.008720273736433679
45
- },
46
- "arc_easy": {
47
- "acc": 0.6447811447811448,
48
- "acc_stderr": 0.009820245899287117,
49
- "acc_norm": 0.625,
50
- "acc_norm_stderr": 0.009933992677987828
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2986348122866894,
54
- "acc_stderr": 0.013374078615068756,
55
- "acc_norm": 0.310580204778157,
56
- "acc_norm_stderr": 0.013522292098053052
57
- }
58
- },
59
- "versions": {
60
- "anli_r1": 0,
61
- "anli_r2": 0,
62
- "anli_r3": 0,
63
- "cb": 1,
64
- "copa": 0,
65
- "hellaswag": 0,
66
- "rte": 0,
67
- "winogrande": 0,
68
- "storycloze_2016": 0,
69
- "boolq": 1,
70
- "arc_easy": 0,
71
- "arc_challenge": 0
72
- }
73
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b17bc4/evaluation/{4b284b17bc4_0.json β†’ rankeval/4b284b17bc4_0.json} RENAMED
File without changes
4b284b17bc4/evaluation/{4b284b17bc4_1.json β†’ rankeval/4b284b17bc4_1.json} RENAMED
File without changes
4b284b17bc4/evaluation/{4b284b17bc4_2.json β†’ rankeval/4b284b17bc4_2.json} RENAMED
File without changes
4b284b17bc4/evaluation/{4b284b17bc4_3.json β†’ rankeval/4b284b17bc4_3.json} RENAMED
File without changes
4b284b17bc4/evaluation/{4b284b17bc4_4.json β†’ rankeval/4b284b17bc4_4.json} RENAMED
File without changes
4b284b17bc4/evaluation/{4b284b17bc4_5.json β†’ rankeval/4b284b17bc4_5.json} RENAMED
@@ -54,6 +54,18 @@
54
  "acc_stderr": 0.013374078615068756,
55
  "acc_norm": 0.310580204778157,
56
  "acc_norm_stderr": 0.013522292098053052
 
 
 
 
 
 
 
 
 
 
 
 
57
  }
58
  },
59
  "versions": {
@@ -68,6 +80,8 @@
68
  "storycloze_2016": 0,
69
  "boolq": 1,
70
  "arc_easy": 0,
71
- "arc_challenge": 0
 
 
72
  }
73
  }
 
54
  "acc_stderr": 0.013374078615068756,
55
  "acc_norm": 0.310580204778157,
56
  "acc_norm_stderr": 0.013522292098053052
57
+ },
58
+ "sciq": {
59
+ "acc": 0.918,
60
+ "acc_stderr": 0.00868051561552374,
61
+ "acc_norm": 0.908,
62
+ "acc_norm_stderr": 0.009144376393151117
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7584330794341676,
66
+ "acc_stderr": 0.00998671800180446,
67
+ "acc_norm": 0.7671381936887922,
68
+ "acc_norm_stderr": 0.009861236071080757
69
  }
70
  },
71
  "versions": {
 
80
  "storycloze_2016": 0,
81
  "boolq": 1,
82
  "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
4b284b21bc4/evaluation/4b284b21bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.332,
5
- "acc_stderr": 0.014899597242811485
6
- },
7
- "anli_r2": {
8
- "acc": 0.337,
9
- "acc_stderr": 0.0149550879186536
10
- },
11
- "anli_r3": {
12
- "acc": 0.355,
13
- "acc_stderr": 0.013819249004047296
14
- },
15
- "cb": {
16
- "acc": 0.48214285714285715,
17
- "acc_stderr": 0.0673769750864465,
18
- "f1": 0.4347442680776014
19
- },
20
- "copa": {
21
- "acc": 0.76,
22
- "acc_stderr": 0.04292346959909283
23
- },
24
- "hellaswag": {
25
- "acc": 0.4841665006970723,
26
- "acc_stderr": 0.004987278910505115,
27
- "acc_norm": 0.6352320254929297,
28
- "acc_norm_stderr": 0.004803812631994966
29
- },
30
- "rte": {
31
- "acc": 0.5306859205776173,
32
- "acc_stderr": 0.03003973059219781
33
- },
34
- "winogrande": {
35
- "acc": 0.5990528808208366,
36
- "acc_stderr": 0.013773974554948033
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7151256012827365,
40
- "acc_stderr": 0.010437513986611718
41
- },
42
- "boolq": {
43
- "acc": 0.5669724770642202,
44
- "acc_stderr": 0.008666251305518059
45
- },
46
- "arc_easy": {
47
- "acc": 0.617003367003367,
48
- "acc_stderr": 0.009974920384536469,
49
- "acc_norm": 0.5462962962962963,
50
- "acc_norm_stderr": 0.010215708295494117
51
- },
52
- "arc_challenge": {
53
- "acc": 0.28668941979522183,
54
- "acc_stderr": 0.013214986329274757,
55
- "acc_norm": 0.30631399317406144,
56
- "acc_norm_stderr": 0.013470584417276513
57
- },
58
- "sciq": {
59
- "acc": 0.845,
60
- "acc_stderr": 0.011450157470799475,
61
- "acc_norm": 0.757,
62
- "acc_norm_stderr": 0.013569640199177458
63
- },
64
- "piqa": {
65
- "acc": 0.7578890097932536,
66
- "acc_stderr": 0.00999437126910438,
67
- "acc_norm": 0.7676822633297062,
68
- "acc_norm_stderr": 0.009853201384168243
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b21bc4/evaluation/4b284b21bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.336,
5
- "acc_stderr": 0.01494414023379502
6
- },
7
- "anli_r2": {
8
- "acc": 0.315,
9
- "acc_stderr": 0.014696631960792506
10
- },
11
- "anli_r3": {
12
- "acc": 0.34,
13
- "acc_stderr": 0.0136804957257678
14
- },
15
- "cb": {
16
- "acc": 0.5357142857142857,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.38181818181818183
19
- },
20
- "copa": {
21
- "acc": 0.74,
22
- "acc_stderr": 0.04408440022768077
23
- },
24
- "hellaswag": {
25
- "acc": 0.48137821151165106,
26
- "acc_stderr": 0.004986319587524962,
27
- "acc_norm": 0.6344353714399522,
28
- "acc_norm_stderr": 0.004806039039008954
29
- },
30
- "rte": {
31
- "acc": 0.5451263537906137,
32
- "acc_stderr": 0.029973636495415252
33
- },
34
- "winogrande": {
35
- "acc": 0.5974743488555643,
36
- "acc_stderr": 0.013782866831703048
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7044361304115446,
40
- "acc_stderr": 0.01055177883937378
41
- },
42
- "boolq": {
43
- "acc": 0.5669724770642202,
44
- "acc_stderr": 0.008666251305518059
45
- },
46
- "arc_easy": {
47
- "acc": 0.6220538720538721,
48
- "acc_stderr": 0.009949405744045452,
49
- "acc_norm": 0.5787037037037037,
50
- "acc_norm_stderr": 0.010131882498193127
51
- },
52
- "arc_challenge": {
53
- "acc": 0.29266211604095566,
54
- "acc_stderr": 0.01329591610361942,
55
- "acc_norm": 0.32849829351535836,
56
- "acc_norm_stderr": 0.013724978465537357
57
- },
58
- "sciq": {
59
- "acc": 0.891,
60
- "acc_stderr": 0.00985982840703719,
61
- "acc_norm": 0.871,
62
- "acc_norm_stderr": 0.010605256784796579
63
- },
64
- "piqa": {
65
- "acc": 0.7551686615886833,
66
- "acc_stderr": 0.010032309105568788,
67
- "acc_norm": 0.764961915125136,
68
- "acc_norm_stderr": 0.009893146688805308
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b21bc4/evaluation/4b284b21bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.327,
5
- "acc_stderr": 0.014842213153411247
6
- },
7
- "anli_r2": {
8
- "acc": 0.333,
9
- "acc_stderr": 0.01491084616422986
10
- },
11
- "anli_r3": {
12
- "acc": 0.3408333333333333,
13
- "acc_stderr": 0.01368860079329693
14
- },
15
- "cb": {
16
- "acc": 0.5357142857142857,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.3829365079365079
19
- },
20
- "copa": {
21
- "acc": 0.78,
22
- "acc_stderr": 0.04163331998932262
23
- },
24
- "hellaswag": {
25
- "acc": 0.48048197570205137,
26
- "acc_stderr": 0.00498597821493792,
27
- "acc_norm": 0.6397132045409281,
28
- "acc_norm_stderr": 0.004791024004587989
29
- },
30
- "rte": {
31
- "acc": 0.5090252707581228,
32
- "acc_stderr": 0.030091559826331334
33
- },
34
- "winogrande": {
35
- "acc": 0.6053670086819258,
36
- "acc_stderr": 0.013736915172371883
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7161945483698557,
40
- "acc_stderr": 0.01042569627973092
41
- },
42
- "boolq": {
43
- "acc": 0.5920489296636086,
44
- "acc_stderr": 0.008595583792654892
45
- },
46
- "arc_easy": {
47
- "acc": 0.622895622895623,
48
- "acc_stderr": 0.009945041946366499,
49
- "acc_norm": 0.6018518518518519,
50
- "acc_norm_stderr": 0.010044662374653398
51
- },
52
- "arc_challenge": {
53
- "acc": 0.295221843003413,
54
- "acc_stderr": 0.013329750293382318,
55
- "acc_norm": 0.32337883959044367,
56
- "acc_norm_stderr": 0.013669421630012129
57
- },
58
- "sciq": {
59
- "acc": 0.903,
60
- "acc_stderr": 0.009363689373248092,
61
- "acc_norm": 0.882,
62
- "acc_norm_stderr": 0.010206869264381791
63
- },
64
- "piqa": {
65
- "acc": 0.7578890097932536,
66
- "acc_stderr": 0.009994371269104376,
67
- "acc_norm": 0.7682263329706203,
68
- "acc_norm_stderr": 0.009845143772794043
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b21bc4/evaluation/4b284b21bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.338,
5
- "acc_stderr": 0.014965960710224496
6
- },
7
- "anli_r2": {
8
- "acc": 0.345,
9
- "acc_stderr": 0.015039986742055238
10
- },
11
- "anli_r3": {
12
- "acc": 0.3566666666666667,
13
- "acc_stderr": 0.013833742805050717
14
- },
15
- "cb": {
16
- "acc": 0.6071428571428571,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.5367003367003368
19
- },
20
- "copa": {
21
- "acc": 0.8,
22
- "acc_stderr": 0.040201512610368445
23
- },
24
- "hellaswag": {
25
- "acc": 0.4826727743477395,
26
- "acc_stderr": 0.004986784319771787,
27
- "acc_norm": 0.6368253335988847,
28
- "acc_norm_stderr": 0.004799317209902001
29
- },
30
- "rte": {
31
- "acc": 0.5631768953068592,
32
- "acc_stderr": 0.029855247390314945
33
- },
34
- "winogrande": {
35
- "acc": 0.6037884767166535,
36
- "acc_stderr": 0.013746404157154949
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7204703367183325,
40
- "acc_stderr": 0.01037770209970486
41
- },
42
- "boolq": {
43
- "acc": 0.5923547400611621,
44
- "acc_stderr": 0.008594580270731619
45
- },
46
- "arc_easy": {
47
- "acc": 0.627104377104377,
48
- "acc_stderr": 0.009922743197129257,
49
- "acc_norm": 0.609006734006734,
50
- "acc_norm_stderr": 0.010012992232540631
51
- },
52
- "arc_challenge": {
53
- "acc": 0.29436860068259385,
54
- "acc_stderr": 0.013318528460539429,
55
- "acc_norm": 0.3319112627986348,
56
- "acc_norm_stderr": 0.01376098820088054
57
- },
58
- "sciq": {
59
- "acc": 0.913,
60
- "acc_stderr": 0.0089168666307459,
61
- "acc_norm": 0.897,
62
- "acc_norm_stderr": 0.009616833339695798
63
- },
64
- "piqa": {
65
- "acc": 0.7589771490750816,
66
- "acc_stderr": 0.009979042717267314,
67
- "acc_norm": 0.7742110990206746,
68
- "acc_norm_stderr": 0.009754980670917311
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b21bc4/evaluation/4b284b21bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.343,
5
- "acc_stderr": 0.015019206922356951
6
- },
7
- "anli_r2": {
8
- "acc": 0.346,
9
- "acc_stderr": 0.01505026612756445
10
- },
11
- "anli_r3": {
12
- "acc": 0.36083333333333334,
13
- "acc_stderr": 0.01386918025244486
14
- },
15
- "cb": {
16
- "acc": 0.5535714285714286,
17
- "acc_stderr": 0.06703189227942395,
18
- "f1": 0.4583333333333333
19
- },
20
- "copa": {
21
- "acc": 0.8,
22
- "acc_stderr": 0.040201512610368445
23
- },
24
- "hellaswag": {
25
- "acc": 0.48157737502489545,
26
- "acc_stderr": 0.0049863932662691625,
27
- "acc_norm": 0.6417048396733719,
28
- "acc_norm_stderr": 0.00478519504988916
29
- },
30
- "rte": {
31
- "acc": 0.5379061371841155,
32
- "acc_stderr": 0.030009848912529113
33
- },
34
- "winogrande": {
35
- "acc": 0.6085240726124704,
36
- "acc_stderr": 0.01371748707129085
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7338321753073223,
40
- "acc_stderr": 0.010220104800551206
41
- },
42
- "boolq": {
43
- "acc": 0.6119266055045871,
44
- "acc_stderr": 0.00852313058476084
45
- },
46
- "arc_easy": {
47
- "acc": 0.6283670033670034,
48
- "acc_stderr": 0.00991589712365879,
49
- "acc_norm": 0.6153198653198653,
50
- "acc_norm_stderr": 0.009983171707008997
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2960750853242321,
54
- "acc_stderr": 0.013340916085246271,
55
- "acc_norm": 0.3242320819112628,
56
- "acc_norm_stderr": 0.013678810399518819
57
- },
58
- "sciq": {
59
- "acc": 0.923,
60
- "acc_stderr": 0.008434580140240648,
61
- "acc_norm": 0.912,
62
- "acc_norm_stderr": 0.008963053962592074
63
- },
64
- "piqa": {
65
- "acc": 0.7595212187159956,
66
- "acc_stderr": 0.009971345364651078,
67
- "acc_norm": 0.7676822633297062,
68
- "acc_norm_stderr": 0.009853201384168243
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b21bc4/evaluation/4b284b21bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json DELETED
@@ -1,66 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.351,
5
- "acc_stderr": 0.015100563798316405
6
- },
7
- "anli_r2": {
8
- "acc": 0.345,
9
- "acc_stderr": 0.015039986742055237
10
- },
11
- "anli_r3": {
12
- "acc": 0.345,
13
- "acc_stderr": 0.013728421539454878
14
- },
15
- "cb": {
16
- "acc": 0.5714285714285714,
17
- "acc_stderr": 0.06672848092813058,
18
- "f1": 0.37671957671957673
19
- },
20
- "copa": {
21
- "acc": 0.78,
22
- "acc_stderr": 0.04163331998932261
23
- },
24
- "hellaswag": {
25
- "acc": 0.4827723561043617,
26
- "acc_stderr": 0.004986818680313444,
27
- "acc_norm": 0.6446922923720374,
28
- "acc_norm_stderr": 0.004776283203468094
29
- },
30
- "rte": {
31
- "acc": 0.5776173285198556,
32
- "acc_stderr": 0.02973162264649588
33
- },
34
- "winogrande": {
35
- "acc": 0.595895816890292,
36
- "acc_stderr": 0.013791610664670845
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7252805986103688,
40
- "acc_stderr": 0.010322309878339507
41
- },
42
- "boolq": {
43
- "acc": 0.6146788990825688,
44
- "acc_stderr": 0.008511930879680652
45
- },
46
- "arc_easy": {
47
- "acc": 0.6300505050505051,
48
- "acc_stderr": 0.009906656266021155,
49
- "acc_norm": 0.6111111111111112,
50
- "acc_norm_stderr": 0.01000324833531377
51
- }
52
- },
53
- "versions": {
54
- "anli_r1": 0,
55
- "anli_r2": 0,
56
- "anli_r3": 0,
57
- "cb": 1,
58
- "copa": 0,
59
- "hellaswag": 0,
60
- "rte": 0,
61
- "winogrande": 0,
62
- "storycloze_2016": 0,
63
- "boolq": 1,
64
- "arc_easy": 0
65
- }
66
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b21bc4/evaluation/{4b284b21bc4_0.json β†’ rankeval/4b284b21bc4_0.json} RENAMED
File without changes
4b284b21bc4/evaluation/{4b284b21bc4_1.json β†’ rankeval/4b284b21bc4_1.json} RENAMED
File without changes
4b284b21bc4/evaluation/{4b284b21bc4_2.json β†’ rankeval/4b284b21bc4_2.json} RENAMED
File without changes
4b284b21bc4/evaluation/{4b284b21bc4_3.json β†’ rankeval/4b284b21bc4_3.json} RENAMED
File without changes
4b284b21bc4/evaluation/{4b284b21bc4_4.json β†’ rankeval/4b284b21bc4_4.json} RENAMED
File without changes
4b284b21bc4/evaluation/{4b284b21bc4_5.json β†’ rankeval/4b284b21bc4_5.json} RENAMED
@@ -48,6 +48,24 @@
48
  "acc_stderr": 0.009906656266021155,
49
  "acc_norm": 0.6111111111111112,
50
  "acc_norm_stderr": 0.01000324833531377
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  }
52
  },
53
  "versions": {
@@ -61,6 +79,9 @@
61
  "winogrande": 0,
62
  "storycloze_2016": 0,
63
  "boolq": 1,
64
- "arc_easy": 0
 
 
 
65
  }
66
  }
 
48
  "acc_stderr": 0.009906656266021155,
49
  "acc_norm": 0.6111111111111112,
50
  "acc_norm_stderr": 0.01000324833531377
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.30716723549488056,
54
+ "acc_stderr": 0.013481034054980945,
55
+ "acc_norm": 0.32337883959044367,
56
+ "acc_norm_stderr": 0.013669421630012122
57
+ },
58
+ "sciq": {
59
+ "acc": 0.919,
60
+ "acc_stderr": 0.008632121032139978,
61
+ "acc_norm": 0.907,
62
+ "acc_norm_stderr": 0.009188875634996669
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7529923830250272,
66
+ "acc_stderr": 0.010062268140772625,
67
+ "acc_norm": 0.7671381936887922,
68
+ "acc_norm_stderr": 0.009861236071080753
69
  }
70
  },
71
  "versions": {
 
79
  "winogrande": 0,
80
  "storycloze_2016": 0,
81
  "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
4b284b28bc4/evaluation/4b284b28bc4_0_lm-eval_global_step80108_2023-01-30-11-26-39_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.344,
5
- "acc_stderr": 0.015029633724408947
6
- },
7
- "anli_r2": {
8
- "acc": 0.321,
9
- "acc_stderr": 0.01477082181793464
10
- },
11
- "anli_r3": {
12
- "acc": 0.34833333333333333,
13
- "acc_stderr": 0.01375943749887408
14
- },
15
- "cb": {
16
- "acc": 0.35714285714285715,
17
- "acc_stderr": 0.06460957383809221,
18
- "f1": 0.1754385964912281
19
- },
20
- "copa": {
21
- "acc": 0.8,
22
- "acc_stderr": 0.040201512610368445
23
- },
24
- "hellaswag": {
25
- "acc": 0.4792869946225851,
26
- "acc_stderr": 0.004985498055190357,
27
- "acc_norm": 0.6265684126667994,
28
- "acc_norm_stderr": 0.004827266662144035
29
- },
30
- "rte": {
31
- "acc": 0.5342960288808665,
32
- "acc_stderr": 0.030025579819366422
33
- },
34
- "winogrande": {
35
- "acc": 0.5753749013417522,
36
- "acc_stderr": 0.013891893150264213
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7231427044361304,
40
- "acc_stderr": 0.01034711289027692
41
- },
42
- "boolq": {
43
- "acc": 0.5700305810397553,
44
- "acc_stderr": 0.008658853690729254
45
- },
46
- "arc_easy": {
47
- "acc": 0.5984848484848485,
48
- "acc_stderr": 0.010058790020755567,
49
- "acc_norm": 0.5395622895622896,
50
- "acc_norm_stderr": 0.01022761638628902
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27986348122866894,
54
- "acc_stderr": 0.013119040897725922,
55
- "acc_norm": 0.31143344709897613,
56
- "acc_norm_stderr": 0.013532472099850942
57
- },
58
- "sciq": {
59
- "acc": 0.848,
60
- "acc_stderr": 0.011358918303475274,
61
- "acc_norm": 0.769,
62
- "acc_norm_stderr": 0.013334797216936438
63
- },
64
- "piqa": {
65
- "acc": 0.7584330794341676,
66
- "acc_stderr": 0.009986718001804467,
67
- "acc_norm": 0.7633297062023939,
68
- "acc_norm_stderr": 0.009916841655042809
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b28bc4/evaluation/4b284b28bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.34,
5
- "acc_stderr": 0.014987482264363937
6
- },
7
- "anli_r2": {
8
- "acc": 0.321,
9
- "acc_stderr": 0.014770821817934644
10
- },
11
- "anli_r3": {
12
- "acc": 0.34,
13
- "acc_stderr": 0.013680495725767803
14
- },
15
- "cb": {
16
- "acc": 0.375,
17
- "acc_stderr": 0.06527912098338669,
18
- "f1": 0.32099491681373216
19
- },
20
- "copa": {
21
- "acc": 0.77,
22
- "acc_stderr": 0.04229525846816506
23
- },
24
- "hellaswag": {
25
- "acc": 0.48078072097191793,
26
- "acc_stderr": 0.004986093791041653,
27
- "acc_norm": 0.6337382991435969,
28
- "acc_norm_stderr": 0.004807975515446487
29
- },
30
- "rte": {
31
- "acc": 0.5740072202166066,
32
- "acc_stderr": 0.029764956741777645
33
- },
34
- "winogrande": {
35
- "acc": 0.590370955011839,
36
- "acc_stderr": 0.013821049109655453
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7204703367183325,
40
- "acc_stderr": 0.01037770209970486
41
- },
42
- "boolq": {
43
- "acc": 0.5948012232415902,
44
- "acc_stderr": 0.008586427929715515
45
- },
46
- "arc_easy": {
47
- "acc": 0.6262626262626263,
48
- "acc_stderr": 0.009927267058259628,
49
- "acc_norm": 0.5917508417508418,
50
- "acc_norm_stderr": 0.01008556619579125
51
- },
52
- "arc_challenge": {
53
- "acc": 0.29266211604095566,
54
- "acc_stderr": 0.013295916103619417,
55
- "acc_norm": 0.32337883959044367,
56
- "acc_norm_stderr": 0.013669421630012132
57
- },
58
- "sciq": {
59
- "acc": 0.904,
60
- "acc_stderr": 0.009320454434783227,
61
- "acc_norm": 0.885,
62
- "acc_norm_stderr": 0.01009340759490462
63
- },
64
- "piqa": {
65
- "acc": 0.7622415669205659,
66
- "acc_stderr": 0.009932525779525489,
67
- "acc_norm": 0.763873775843308,
68
- "acc_norm_stderr": 0.009908965890558218
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b28bc4/evaluation/4b284b28bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.325,
5
- "acc_stderr": 0.014818724459095526
6
- },
7
- "anli_r2": {
8
- "acc": 0.325,
9
- "acc_stderr": 0.014818724459095526
10
- },
11
- "anli_r3": {
12
- "acc": 0.3233333333333333,
13
- "acc_stderr": 0.013508372867300217
14
- },
15
- "cb": {
16
- "acc": 0.25,
17
- "acc_stderr": 0.058387420812114225,
18
- "f1": 0.22987012987012986
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.47988448516231824,
26
- "acc_stderr": 0.004985741706385727,
27
- "acc_norm": 0.6363274248157738,
28
- "acc_norm_stderr": 0.004800728138792371
29
- },
30
- "rte": {
31
- "acc": 0.5631768953068592,
32
- "acc_stderr": 0.02985524739031495
33
- },
34
- "winogrande": {
35
- "acc": 0.5824782951854776,
36
- "acc_stderr": 0.013859978264440248
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7177979690005345,
40
- "acc_stderr": 0.010407834479647673
41
- },
42
- "boolq": {
43
- "acc": 0.627217125382263,
44
- "acc_stderr": 0.008457255867914694
45
- },
46
- "arc_easy": {
47
- "acc": 0.6308922558922558,
48
- "acc_stderr": 0.009901987410242742,
49
- "acc_norm": 0.6123737373737373,
50
- "acc_norm_stderr": 0.009997307914447612
51
- },
52
- "arc_challenge": {
53
- "acc": 0.30204778156996587,
54
- "acc_stderr": 0.01341751914471642,
55
- "acc_norm": 0.3216723549488055,
56
- "acc_norm_stderr": 0.013650488084494162
57
- },
58
- "sciq": {
59
- "acc": 0.914,
60
- "acc_stderr": 0.008870325962594766,
61
- "acc_norm": 0.883,
62
- "acc_norm_stderr": 0.010169287802713329
63
- },
64
- "piqa": {
65
- "acc": 0.7606093579978237,
66
- "acc_stderr": 0.009955884250291681,
67
- "acc_norm": 0.76550598476605,
68
- "acc_norm_stderr": 0.009885203143240543
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b28bc4/evaluation/4b284b28bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.325,
5
- "acc_stderr": 0.014818724459095524
6
- },
7
- "anli_r2": {
8
- "acc": 0.336,
9
- "acc_stderr": 0.014944140233795021
10
- },
11
- "anli_r3": {
12
- "acc": 0.3233333333333333,
13
- "acc_stderr": 0.013508372867300212
14
- },
15
- "cb": {
16
- "acc": 0.39285714285714285,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.3565868967138097
19
- },
20
- "copa": {
21
- "acc": 0.81,
22
- "acc_stderr": 0.03942772444036623
23
- },
24
- "hellaswag": {
25
- "acc": 0.4790878311093408,
26
- "acc_stderr": 0.004985415250690914,
27
- "acc_norm": 0.634833698466441,
28
- "acc_norm_stderr": 0.004804927608773137
29
- },
30
- "rte": {
31
- "acc": 0.6064981949458483,
32
- "acc_stderr": 0.029405839314203194
33
- },
34
- "winogrande": {
35
- "acc": 0.585635359116022,
36
- "acc_stderr": 0.013844846232268563
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7295563869588455,
40
- "acc_stderr": 0.010271810373331027
41
- },
42
- "boolq": {
43
- "acc": 0.6241590214067279,
44
- "acc_stderr": 0.008471147248160107
45
- },
46
- "arc_easy": {
47
- "acc": 0.6372053872053872,
48
- "acc_stderr": 0.009865936757013942,
49
- "acc_norm": 0.6186868686868687,
50
- "acc_norm_stderr": 0.009966542497171021
51
- },
52
- "arc_challenge": {
53
- "acc": 0.30119453924914674,
54
- "acc_stderr": 0.013406741767847624,
55
- "acc_norm": 0.32337883959044367,
56
- "acc_norm_stderr": 0.01366942163001213
57
- },
58
- "sciq": {
59
- "acc": 0.91,
60
- "acc_stderr": 0.00905439020486644,
61
- "acc_norm": 0.897,
62
- "acc_norm_stderr": 0.009616833339695796
63
- },
64
- "piqa": {
65
- "acc": 0.7540805223068553,
66
- "acc_stderr": 0.01004733186562519,
67
- "acc_norm": 0.7687704026115343,
68
- "acc_norm_stderr": 0.009837063180625334
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b28bc4/evaluation/4b284b28bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.345,
5
- "acc_stderr": 0.015039986742055235
6
- },
7
- "anli_r2": {
8
- "acc": 0.325,
9
- "acc_stderr": 0.014818724459095526
10
- },
11
- "anli_r3": {
12
- "acc": 0.31416666666666665,
13
- "acc_stderr": 0.013405399314984096
14
- },
15
- "cb": {
16
- "acc": 0.39285714285714285,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.3647495361781076
19
- },
20
- "copa": {
21
- "acc": 0.82,
22
- "acc_stderr": 0.038612291966536955
23
- },
24
- "hellaswag": {
25
- "acc": 0.4819757020513842,
26
- "acc_stderr": 0.004986538243846636,
27
- "acc_norm": 0.6387173869747063,
28
- "acc_norm_stderr": 0.004793904922401888
29
- },
30
- "rte": {
31
- "acc": 0.48736462093862815,
32
- "acc_stderr": 0.030086851767188564
33
- },
34
- "winogrande": {
35
- "acc": 0.5832675611681136,
36
- "acc_stderr": 0.013856250072796322
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7386424371993586,
40
- "acc_stderr": 0.010160471460690485
41
- },
42
- "boolq": {
43
- "acc": 0.6275229357798165,
44
- "acc_stderr": 0.008455846866956085
45
- },
46
- "arc_easy": {
47
- "acc": 0.6405723905723906,
48
- "acc_stderr": 0.009845958893373766,
49
- "acc_norm": 0.6212121212121212,
50
- "acc_norm_stderr": 0.00995373765654204
51
- },
52
- "arc_challenge": {
53
- "acc": 0.30204778156996587,
54
- "acc_stderr": 0.01341751914471642,
55
- "acc_norm": 0.32764505119453924,
56
- "acc_norm_stderr": 0.013715847940719344
57
- },
58
- "sciq": {
59
- "acc": 0.92,
60
- "acc_stderr": 0.008583336977753653,
61
- "acc_norm": 0.907,
62
- "acc_norm_stderr": 0.009188875634996702
63
- },
64
- "piqa": {
65
- "acc": 0.7551686615886833,
66
- "acc_stderr": 0.01003230910556879,
67
- "acc_norm": 0.76550598476605,
68
- "acc_norm_stderr": 0.00988520314324054
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b28bc4/evaluation/4b284b28bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json DELETED
@@ -1,59 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.332,
5
- "acc_stderr": 0.014899597242811475
6
- },
7
- "anli_r2": {
8
- "acc": 0.316,
9
- "acc_stderr": 0.014709193056057106
10
- },
11
- "anli_r3": {
12
- "acc": 0.31666666666666665,
13
- "acc_stderr": 0.013434078660827384
14
- },
15
- "cb": {
16
- "acc": 0.30357142857142855,
17
- "acc_stderr": 0.06199938655510754,
18
- "f1": 0.2503507986266607
19
- },
20
- "copa": {
21
- "acc": 0.8,
22
- "acc_stderr": 0.040201512610368445
23
- },
24
- "hellaswag": {
25
- "acc": 0.4788886675960964,
26
- "acc_stderr": 0.004985331652408345,
27
- "acc_norm": 0.6412069308902609,
28
- "acc_norm_stderr": 0.004786660691181937
29
- },
30
- "rte": {
31
- "acc": 0.5740072202166066,
32
- "acc_stderr": 0.02976495674177765
33
- },
34
- "winogrande": {
35
- "acc": 0.5911602209944752,
36
- "acc_stderr": 0.013816954295135684
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7279529663281668,
40
- "acc_stderr": 0.010290888060871242
41
- },
42
- "boolq": {
43
- "acc": 0.6275229357798165,
44
- "acc_stderr": 0.008455846866956086
45
- }
46
- },
47
- "versions": {
48
- "anli_r1": 0,
49
- "anli_r2": 0,
50
- "anli_r3": 0,
51
- "cb": 1,
52
- "copa": 0,
53
- "hellaswag": 0,
54
- "rte": 0,
55
- "winogrande": 0,
56
- "storycloze_2016": 0,
57
- "boolq": 1
58
- }
59
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b28bc4/evaluation/{4b284b28bc4_0.json β†’ rankeval/4b284b28bc4_0.json} RENAMED
File without changes
4b284b28bc4/evaluation/{4b284b28bc4_1.json β†’ rankeval/4b284b28bc4_1.json} RENAMED
File without changes
4b284b28bc4/evaluation/{4b284b28bc4_2.json β†’ rankeval/4b284b28bc4_2.json} RENAMED
File without changes
4b284b28bc4/evaluation/{4b284b28bc4_3.json β†’ rankeval/4b284b28bc4_3.json} RENAMED
File without changes
4b284b28bc4/evaluation/{4b284b28bc4_4.json β†’ rankeval/4b284b28bc4_4.json} RENAMED
File without changes
4b284b28bc4/evaluation/{4b284b28bc4_5.json β†’ rankeval/4b284b28bc4_5.json} RENAMED
@@ -42,6 +42,30 @@
42
  "boolq": {
43
  "acc": 0.6275229357798165,
44
  "acc_stderr": 0.008455846866956086
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  }
46
  },
47
  "versions": {
@@ -54,6 +78,10 @@
54
  "rte": 0,
55
  "winogrande": 0,
56
  "storycloze_2016": 0,
57
- "boolq": 1
 
 
 
 
58
  }
59
  }
 
42
  "boolq": {
43
  "acc": 0.6275229357798165,
44
  "acc_stderr": 0.008455846866956086
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6401515151515151,
48
+ "acc_stderr": 0.009848484848484846,
49
+ "acc_norm": 0.6296296296296297,
50
+ "acc_norm_stderr": 0.009908978578665755
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.30887372013651876,
54
+ "acc_stderr": 0.013501770929344003,
55
+ "acc_norm": 0.32849829351535836,
56
+ "acc_norm_stderr": 0.013724978465537377
57
+ },
58
+ "sciq": {
59
+ "acc": 0.921,
60
+ "acc_stderr": 0.008534156773333445,
61
+ "acc_norm": 0.908,
62
+ "acc_norm_stderr": 0.00914437639315112
63
+ },
64
+ "piqa": {
65
+ "acc": 0.750272034820457,
66
+ "acc_stderr": 0.010099232969867486,
67
+ "acc_norm": 0.764961915125136,
68
+ "acc_norm_stderr": 0.009893146688805312
69
  }
70
  },
71
  "versions": {
 
78
  "rte": 0,
79
  "winogrande": 0,
80
  "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
4b284b42bc4/evaluation/4b284b42bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.331,
5
- "acc_stderr": 0.014888272588203931
6
- },
7
- "anli_r2": {
8
- "acc": 0.342,
9
- "acc_stderr": 0.01500870618212173
10
- },
11
- "anli_r3": {
12
- "acc": 0.34,
13
- "acc_stderr": 0.013680495725767784
14
- },
15
- "cb": {
16
- "acc": 0.5357142857142857,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.45393112410656267
19
- },
20
- "copa": {
21
- "acc": 0.75,
22
- "acc_stderr": 0.04351941398892446
23
- },
24
- "hellaswag": {
25
- "acc": 0.4833698466440948,
26
- "acc_stderr": 0.004987020679861267,
27
- "acc_norm": 0.63433578968333,
28
- "acc_norm_stderr": 0.004806316342709393
29
- },
30
- "rte": {
31
- "acc": 0.5776173285198556,
32
- "acc_stderr": 0.029731622646495887
33
- },
34
- "winogrande": {
35
- "acc": 0.5864246250986582,
36
- "acc_stderr": 0.013840971763195303
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7204703367183325,
40
- "acc_stderr": 0.01037770209970486
41
- },
42
- "boolq": {
43
- "acc": 0.5253822629969419,
44
- "acc_stderr": 0.0087337795418535
45
- },
46
- "arc_easy": {
47
- "acc": 0.6224747474747475,
48
- "acc_stderr": 0.00994722783346943,
49
- "acc_norm": 0.5462962962962963,
50
- "acc_norm_stderr": 0.010215708295494117
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27986348122866894,
54
- "acc_stderr": 0.013119040897725922,
55
- "acc_norm": 0.29266211604095566,
56
- "acc_norm_stderr": 0.01329591610361942
57
- },
58
- "sciq": {
59
- "acc": 0.837,
60
- "acc_stderr": 0.011686212712746849,
61
- "acc_norm": 0.757,
62
- "acc_norm_stderr": 0.013569640199177458
63
- },
64
- "piqa": {
65
- "acc": 0.7448313384113167,
66
- "acc_stderr": 0.010171571592521822,
67
- "acc_norm": 0.76550598476605,
68
- "acc_norm_stderr": 0.00988520314324054
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b42bc4/evaluation/4b284b42bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.31,
5
- "acc_stderr": 0.014632638658632902
6
- },
7
- "anli_r2": {
8
- "acc": 0.31,
9
- "acc_stderr": 0.014632638658632905
10
- },
11
- "anli_r3": {
12
- "acc": 0.3283333333333333,
13
- "acc_stderr": 0.013562032919529017
14
- },
15
- "cb": {
16
- "acc": 0.3392857142857143,
17
- "acc_stderr": 0.06384226561930825,
18
- "f1": 0.29749748849204566
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.4803823939454292,
26
- "acc_stderr": 0.004985939292819582,
27
- "acc_norm": 0.6294562836088429,
28
- "acc_norm_stderr": 0.004819633668832538
29
- },
30
- "rte": {
31
- "acc": 0.44765342960288806,
32
- "acc_stderr": 0.02993107036293953
33
- },
34
- "winogrande": {
35
- "acc": 0.5887924230465666,
36
- "acc_stderr": 0.013829128358676874
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7049706039551042,
40
- "acc_stderr": 0.010546232606962289
41
- },
42
- "boolq": {
43
- "acc": 0.5522935779816514,
44
- "acc_stderr": 0.008697094687974059
45
- },
46
- "arc_easy": {
47
- "acc": 0.6262626262626263,
48
- "acc_stderr": 0.009927267058259621,
49
- "acc_norm": 0.5934343434343434,
50
- "acc_norm_stderr": 0.010079056419223527
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2883959044368601,
54
- "acc_stderr": 0.013238394422428173,
55
- "acc_norm": 0.3148464163822526,
56
- "acc_norm_stderr": 0.01357265770308495
57
- },
58
- "sciq": {
59
- "acc": 0.892,
60
- "acc_stderr": 0.0098200016513457,
61
- "acc_norm": 0.869,
62
- "acc_norm_stderr": 0.010674874844837954
63
- },
64
- "piqa": {
65
- "acc": 0.7486398258977149,
66
- "acc_stderr": 0.010121156016819259,
67
- "acc_norm": 0.7633297062023939,
68
- "acc_norm_stderr": 0.009916841655042809
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }