lewtun HF Staff commited on
Commit
6ad0d0d
·
1 Parent(s): a84869a

Clean up false PoT

Browse files
eval_results/deepseek-ai/deepseek-math-7b-rl/main/aimo_kaggle_hard_pot/results_2024-05-27T12-05-20.563680.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 3546137.978264133,
9
- "end_time": 3546437.319502369,
10
- "total_evaluation_time_secondes": "299.3412382360548",
11
- "model_name": "deepseek-ai/deepseek-math-7b-rl",
12
- "model_sha": "f3cd419a172ff5a100e9a563ebfe7900d78e9740",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "12.93 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard_pot:v0|0": {
19
- "qem": 0.02,
20
- "qem_stderr": 0.02
21
- },
22
- "all": {
23
- "qem": 0.02,
24
- "qem_stderr": 0.02
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard_pot:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard_pot:v0": {
32
- "name": "aimo_kaggle_hard_pot:v0",
33
- "prompt_function": "kaggle_hard_pot_prompt_fn_v0",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_code_and_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard_pot:v0|0": {
64
- "hashes": {
65
- "hash_examples": "303213a38d9f7512",
66
- "hash_full_prompts": "3c670fa0e80bc301",
67
- "hash_input_tokens": "69dabd3aaebed332",
68
- "hash_cont_tokens": "f1e1dbb860f6f173"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 35,
73
- "non_padded": 15,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "cae48890a7c47904",
81
- "hash_full_prompts": "6f120f8d8bb938e6",
82
- "hash_input_tokens": "ff7094ff98f5f808",
83
- "hash_cont_tokens": "b3810939778104d4"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 35,
88
- "non_padded": 15,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/deepseek-math-7b-rl/main/aimo_kaggle_hard_pot/results_2024-05-27T12-36-27.035745.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 2511250.889234414,
9
- "end_time": 2511567.044527658,
10
- "total_evaluation_time_secondes": "316.1552932439372",
11
- "model_name": "deepseek-ai/deepseek-math-7b-rl",
12
- "model_sha": "f3cd419a172ff5a100e9a563ebfe7900d78e9740",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "12.93 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard_pot:v0|0": {
19
- "qem": 0.02,
20
- "qem_stderr": 0.02
21
- },
22
- "all": {
23
- "qem": 0.02,
24
- "qem_stderr": 0.02
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_hard_pot:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_hard_pot:v0": {
32
- "name": "aimo_kaggle_hard_pot:v0",
33
- "prompt_function": "kaggle_hard_pot_prompt_fn_v0",
34
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_code_and_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 50,
56
- "effective_num_docs": 50,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_hard_pot:v0|0": {
64
- "hashes": {
65
- "hash_examples": "303213a38d9f7512",
66
- "hash_full_prompts": "3c670fa0e80bc301",
67
- "hash_input_tokens": "69dabd3aaebed332",
68
- "hash_cont_tokens": "f1e1dbb860f6f173"
69
- },
70
- "truncated": 50,
71
- "non_truncated": 0,
72
- "padded": 35,
73
- "non_padded": 15,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "cae48890a7c47904",
81
- "hash_full_prompts": "6f120f8d8bb938e6",
82
- "hash_input_tokens": "ff7094ff98f5f808",
83
- "hash_cont_tokens": "b3810939778104d4"
84
- },
85
- "truncated": 50,
86
- "non_truncated": 0,
87
- "padded": 35,
88
- "non_padded": 15,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/deepseek-math-7b-rl/main/aimo_kaggle_hard_pot/results_2024-05-27T13-13-06.061324.json DELETED
@@ -1,193 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 2108140.565538177,
9
- "end_time": 2108762.006272666,
10
- "total_evaluation_time_secondes": "621.4407344893552",
11
- "model_name": "deepseek-ai/deepseek-math-7b-rl",
12
- "model_sha": "f3cd419a172ff5a100e9a563ebfe7900d78e9740",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "12.93 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_hard_pot:v0|0": {
19
- "qem": 0.02,
20
- "qem_stderr": 0.02
21
- },
22
- "custom|aimo_kaggle_hard_pot:v1|0": {
23
- "qem": 0.06,
24
- "qem_stderr": 0.033926691677251195
25
- },
26
- "custom|aimo_kaggle_hard_pot:v2|0": {
27
- "qem": 0.38,
28
- "qem_stderr": 0.06934092056863767
29
- },
30
- "custom|aimo_kaggle_hard_pot:_average|0": {
31
- "qem": 0.15333333333333335,
32
- "qem_stderr": 0.041089204081962954
33
- },
34
- "all": {
35
- "qem": 0.15333333333333335,
36
- "qem_stderr": 0.041089204081962954
37
- }
38
- },
39
- "versions": {
40
- "custom|aimo_kaggle_hard_pot:v0|0": 0,
41
- "custom|aimo_kaggle_hard_pot:v1|0": 0,
42
- "custom|aimo_kaggle_hard_pot:v2|0": 0
43
- },
44
- "config_tasks": {
45
- "custom|aimo_kaggle_hard_pot:v0": {
46
- "name": "aimo_kaggle_hard_pot:v0",
47
- "prompt_function": "kaggle_hard_pot_prompt_fn_v0",
48
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
49
- "hf_subset": "v0",
50
- "metric": [
51
- "quasi_exact_match_code_and_math"
52
- ],
53
- "hf_avail_splits": [
54
- "train"
55
- ],
56
- "evaluation_splits": [
57
- "train"
58
- ],
59
- "few_shots_split": null,
60
- "few_shots_select": null,
61
- "generation_size": 2048,
62
- "stop_sequence": null,
63
- "output_regex": null,
64
- "num_samples": null,
65
- "frozen": false,
66
- "suite": [
67
- "custom"
68
- ],
69
- "original_num_docs": 50,
70
- "effective_num_docs": 50,
71
- "trust_dataset": null,
72
- "must_remove_duplicate_docs": null,
73
- "version": 0
74
- },
75
- "custom|aimo_kaggle_hard_pot:v1": {
76
- "name": "aimo_kaggle_hard_pot:v1",
77
- "prompt_function": "kaggle_hard_pot_prompt_fn_v1",
78
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
79
- "hf_subset": "v0",
80
- "metric": [
81
- "quasi_exact_match_code_and_math"
82
- ],
83
- "hf_avail_splits": [
84
- "train"
85
- ],
86
- "evaluation_splits": [
87
- "train"
88
- ],
89
- "few_shots_split": null,
90
- "few_shots_select": null,
91
- "generation_size": 2048,
92
- "stop_sequence": null,
93
- "output_regex": null,
94
- "num_samples": null,
95
- "frozen": false,
96
- "suite": [
97
- "custom"
98
- ],
99
- "original_num_docs": 50,
100
- "effective_num_docs": 50,
101
- "trust_dataset": null,
102
- "must_remove_duplicate_docs": null,
103
- "version": 0
104
- },
105
- "custom|aimo_kaggle_hard_pot:v2": {
106
- "name": "aimo_kaggle_hard_pot:v2",
107
- "prompt_function": "kaggle_hard_pot_prompt_fn_v2",
108
- "hf_repo": "AI-MO/kaggle-validation-set-hard",
109
- "hf_subset": "v0",
110
- "metric": [
111
- "quasi_exact_match_code_and_math"
112
- ],
113
- "hf_avail_splits": [
114
- "train"
115
- ],
116
- "evaluation_splits": [
117
- "train"
118
- ],
119
- "few_shots_split": null,
120
- "few_shots_select": null,
121
- "generation_size": 2048,
122
- "stop_sequence": null,
123
- "output_regex": null,
124
- "num_samples": null,
125
- "frozen": false,
126
- "suite": [
127
- "custom"
128
- ],
129
- "original_num_docs": 50,
130
- "effective_num_docs": 50,
131
- "trust_dataset": null,
132
- "must_remove_duplicate_docs": null,
133
- "version": 0
134
- }
135
- },
136
- "summary_tasks": {
137
- "custom|aimo_kaggle_hard_pot:v0|0": {
138
- "hashes": {
139
- "hash_examples": "303213a38d9f7512",
140
- "hash_full_prompts": "3c670fa0e80bc301",
141
- "hash_input_tokens": "1bd4187ad6963415",
142
- "hash_cont_tokens": "4a5131314ac71945"
143
- },
144
- "truncated": 50,
145
- "non_truncated": 0,
146
- "padded": 37,
147
- "non_padded": 13,
148
- "effective_few_shots": 0.0,
149
- "num_truncated_few_shots": 0
150
- },
151
- "custom|aimo_kaggle_hard_pot:v1|0": {
152
- "hashes": {
153
- "hash_examples": "e4234b97ad92862f",
154
- "hash_full_prompts": "09e7759a96f64e59",
155
- "hash_input_tokens": "dc7f68cf14be3d61",
156
- "hash_cont_tokens": "835e43e6085c79ec"
157
- },
158
- "truncated": 50,
159
- "non_truncated": 0,
160
- "padded": 32,
161
- "non_padded": 18,
162
- "effective_few_shots": 0.0,
163
- "num_truncated_few_shots": 0
164
- },
165
- "custom|aimo_kaggle_hard_pot:v2|0": {
166
- "hashes": {
167
- "hash_examples": "6396eb8833e13ba0",
168
- "hash_full_prompts": "65f8b95ea99c7087",
169
- "hash_input_tokens": "bddb67459e7601c5",
170
- "hash_cont_tokens": "16c7b73b8b022a8d"
171
- },
172
- "truncated": 50,
173
- "non_truncated": 0,
174
- "padded": 30,
175
- "non_padded": 20,
176
- "effective_few_shots": 0.0,
177
- "num_truncated_few_shots": 0
178
- }
179
- },
180
- "summary_general": {
181
- "hashes": {
182
- "hash_examples": "648c9a107d279e1e",
183
- "hash_full_prompts": "7f4c38eb08b3bb41",
184
- "hash_input_tokens": "16f0fd76e0a2a165",
185
- "hash_cont_tokens": "c21eba382df462d7"
186
- },
187
- "truncated": 150,
188
- "non_truncated": 0,
189
- "padded": 99,
190
- "non_padded": 51,
191
- "num_truncated_few_shots": 0
192
- }
193
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/deepseek-math-7b-rl/main/aimo_kaggle_medium_pot/results_2024-05-27T12-05-24.547168.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 2509401.314190552,
9
- "end_time": 2509704.555886234,
10
- "total_evaluation_time_secondes": "303.2416956820525",
11
- "model_name": "deepseek-ai/deepseek-math-7b-rl",
12
- "model_sha": "f3cd419a172ff5a100e9a563ebfe7900d78e9740",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "12.93 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium_pot:v0|0": {
19
- "qem": 0.025,
20
- "qem_stderr": 0.024999999999999998
21
- },
22
- "all": {
23
- "qem": 0.025,
24
- "qem_stderr": 0.024999999999999998
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium_pot:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium_pot:v0": {
32
- "name": "aimo_kaggle_medium_pot:v0",
33
- "prompt_function": "kaggle_medium_pot_prompt_fn_v0",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_code_and_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium_pot:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2799c24461029dc3",
66
- "hash_full_prompts": "de6572b914d5bba6",
67
- "hash_input_tokens": "bb5427e5227cc6b4",
68
- "hash_cont_tokens": "38c9a1261865d264"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 31,
73
- "non_padded": 9,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "c72202b0a18ef5f9",
81
- "hash_full_prompts": "a0429b0dea3e33db",
82
- "hash_input_tokens": "8e6ef9684e80b71f",
83
- "hash_cont_tokens": "b1bd4ed4824c97cb"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 31,
88
- "non_padded": 9,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/deepseek-math-7b-rl/main/aimo_kaggle_medium_pot/results_2024-05-27T12-35-59.406866.json DELETED
@@ -1,91 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 4051098.729740778,
9
- "end_time": 4051401.342947718,
10
- "total_evaluation_time_secondes": "302.61320694023743",
11
- "model_name": "deepseek-ai/deepseek-math-7b-rl",
12
- "model_sha": "f3cd419a172ff5a100e9a563ebfe7900d78e9740",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "12.93 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium_pot:v0|0": {
19
- "qem": 0.025,
20
- "qem_stderr": 0.024999999999999998
21
- },
22
- "all": {
23
- "qem": 0.025,
24
- "qem_stderr": 0.024999999999999998
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle_medium_pot:v0|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle_medium_pot:v0": {
32
- "name": "aimo_kaggle_medium_pot:v0",
33
- "prompt_function": "kaggle_medium_pot_prompt_fn_v0",
34
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
35
- "hf_subset": "v0",
36
- "metric": [
37
- "quasi_exact_match_code_and_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "num_samples": null,
51
- "frozen": false,
52
- "suite": [
53
- "custom"
54
- ],
55
- "original_num_docs": 40,
56
- "effective_num_docs": 40,
57
- "trust_dataset": null,
58
- "must_remove_duplicate_docs": null,
59
- "version": 0
60
- }
61
- },
62
- "summary_tasks": {
63
- "custom|aimo_kaggle_medium_pot:v0|0": {
64
- "hashes": {
65
- "hash_examples": "2799c24461029dc3",
66
- "hash_full_prompts": "de6572b914d5bba6",
67
- "hash_input_tokens": "bb5427e5227cc6b4",
68
- "hash_cont_tokens": "38c9a1261865d264"
69
- },
70
- "truncated": 40,
71
- "non_truncated": 0,
72
- "padded": 31,
73
- "non_padded": 9,
74
- "effective_few_shots": 0.0,
75
- "num_truncated_few_shots": 0
76
- }
77
- },
78
- "summary_general": {
79
- "hashes": {
80
- "hash_examples": "c72202b0a18ef5f9",
81
- "hash_full_prompts": "a0429b0dea3e33db",
82
- "hash_input_tokens": "8e6ef9684e80b71f",
83
- "hash_cont_tokens": "b1bd4ed4824c97cb"
84
- },
85
- "truncated": 40,
86
- "non_truncated": 0,
87
- "padded": 31,
88
- "non_padded": 9,
89
- "num_truncated_few_shots": 0
90
- }
91
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/deepseek-math-7b-rl/main/aimo_kaggle_medium_pot/results_2024-05-27T13-07-46.038921.json DELETED
@@ -1,193 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 2111711.970974838,
9
- "end_time": 2112038.728973985,
10
- "total_evaluation_time_secondes": "326.7579991472885",
11
- "model_name": "deepseek-ai/deepseek-math-7b-rl",
12
- "model_sha": "f3cd419a172ff5a100e9a563ebfe7900d78e9740",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "12.93 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle_medium_pot:v0|0": {
19
- "qem": 0.025,
20
- "qem_stderr": 0.024999999999999998
21
- },
22
- "custom|aimo_kaggle_medium_pot:v1|0": {
23
- "qem": 0.0,
24
- "qem_stderr": 0.0
25
- },
26
- "custom|aimo_kaggle_medium_pot:v2|0": {
27
- "qem": 0.125,
28
- "qem_stderr": 0.05295740910852021
29
- },
30
- "custom|aimo_kaggle_medium_pot:_average|0": {
31
- "qem": 0.049999999999999996,
32
- "qem_stderr": 0.025985803036173403
33
- },
34
- "all": {
35
- "qem": 0.049999999999999996,
36
- "qem_stderr": 0.025985803036173403
37
- }
38
- },
39
- "versions": {
40
- "custom|aimo_kaggle_medium_pot:v0|0": 0,
41
- "custom|aimo_kaggle_medium_pot:v1|0": 0,
42
- "custom|aimo_kaggle_medium_pot:v2|0": 0
43
- },
44
- "config_tasks": {
45
- "custom|aimo_kaggle_medium_pot:v0": {
46
- "name": "aimo_kaggle_medium_pot:v0",
47
- "prompt_function": "kaggle_medium_pot_prompt_fn_v0",
48
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
49
- "hf_subset": "v0",
50
- "metric": [
51
- "quasi_exact_match_code_and_math"
52
- ],
53
- "hf_avail_splits": [
54
- "train"
55
- ],
56
- "evaluation_splits": [
57
- "train"
58
- ],
59
- "few_shots_split": null,
60
- "few_shots_select": null,
61
- "generation_size": 2048,
62
- "stop_sequence": null,
63
- "output_regex": null,
64
- "num_samples": null,
65
- "frozen": false,
66
- "suite": [
67
- "custom"
68
- ],
69
- "original_num_docs": 40,
70
- "effective_num_docs": 40,
71
- "trust_dataset": null,
72
- "must_remove_duplicate_docs": null,
73
- "version": 0
74
- },
75
- "custom|aimo_kaggle_medium_pot:v1": {
76
- "name": "aimo_kaggle_medium_pot:v1",
77
- "prompt_function": "kaggle_medium_pot_prompt_fn_v1",
78
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
79
- "hf_subset": "v0",
80
- "metric": [
81
- "quasi_exact_match_code_and_math"
82
- ],
83
- "hf_avail_splits": [
84
- "train"
85
- ],
86
- "evaluation_splits": [
87
- "train"
88
- ],
89
- "few_shots_split": null,
90
- "few_shots_select": null,
91
- "generation_size": 2048,
92
- "stop_sequence": null,
93
- "output_regex": null,
94
- "num_samples": null,
95
- "frozen": false,
96
- "suite": [
97
- "custom"
98
- ],
99
- "original_num_docs": 40,
100
- "effective_num_docs": 40,
101
- "trust_dataset": null,
102
- "must_remove_duplicate_docs": null,
103
- "version": 0
104
- },
105
- "custom|aimo_kaggle_medium_pot:v2": {
106
- "name": "aimo_kaggle_medium_pot:v2",
107
- "prompt_function": "kaggle_medium_pot_prompt_fn_v2",
108
- "hf_repo": "AI-MO/kaggle-validation-set-medium",
109
- "hf_subset": "v0",
110
- "metric": [
111
- "quasi_exact_match_code_and_math"
112
- ],
113
- "hf_avail_splits": [
114
- "train"
115
- ],
116
- "evaluation_splits": [
117
- "train"
118
- ],
119
- "few_shots_split": null,
120
- "few_shots_select": null,
121
- "generation_size": 2048,
122
- "stop_sequence": null,
123
- "output_regex": null,
124
- "num_samples": null,
125
- "frozen": false,
126
- "suite": [
127
- "custom"
128
- ],
129
- "original_num_docs": 40,
130
- "effective_num_docs": 40,
131
- "trust_dataset": null,
132
- "must_remove_duplicate_docs": null,
133
- "version": 0
134
- }
135
- },
136
- "summary_tasks": {
137
- "custom|aimo_kaggle_medium_pot:v0|0": {
138
- "hashes": {
139
- "hash_examples": "2799c24461029dc3",
140
- "hash_full_prompts": "de6572b914d5bba6",
141
- "hash_input_tokens": "5af82ee284ccce29",
142
- "hash_cont_tokens": "72723b206760d825"
143
- },
144
- "truncated": 40,
145
- "non_truncated": 0,
146
- "padded": 29,
147
- "non_padded": 11,
148
- "effective_few_shots": 0.0,
149
- "num_truncated_few_shots": 0
150
- },
151
- "custom|aimo_kaggle_medium_pot:v1|0": {
152
- "hashes": {
153
- "hash_examples": "806b2e2056b41f84",
154
- "hash_full_prompts": "427d5de6e4d90df2",
155
- "hash_input_tokens": "be3dcf9f8a2350d0",
156
- "hash_cont_tokens": "06f12390b65df63f"
157
- },
158
- "truncated": 40,
159
- "non_truncated": 0,
160
- "padded": 26,
161
- "non_padded": 14,
162
- "effective_few_shots": 0.0,
163
- "num_truncated_few_shots": 0
164
- },
165
- "custom|aimo_kaggle_medium_pot:v2|0": {
166
- "hashes": {
167
- "hash_examples": "d8534375acc5d427",
168
- "hash_full_prompts": "6bb90c129d6d6123",
169
- "hash_input_tokens": "22eeaedbeb3f56f0",
170
- "hash_cont_tokens": "b6edaeabcd3b05a5"
171
- },
172
- "truncated": 40,
173
- "non_truncated": 0,
174
- "padded": 31,
175
- "non_padded": 9,
176
- "effective_few_shots": 0.0,
177
- "num_truncated_few_shots": 0
178
- }
179
- },
180
- "summary_general": {
181
- "hashes": {
182
- "hash_examples": "623505a45a4910c2",
183
- "hash_full_prompts": "3aca0fad61a42921",
184
- "hash_input_tokens": "c7d2c8bb76f8ecb1",
185
- "hash_cont_tokens": "9e01c1ecd569a7f7"
186
- },
187
- "truncated": 120,
188
- "non_truncated": 0,
189
- "padded": 86,
190
- "non_padded": 34,
191
- "num_truncated_few_shots": 0
192
- }
193
- }