lewtun HF Staff commited on
Commit
f5a9775
·
1 Parent(s): 6ef6765

Delete aime25 part 1

Browse files
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/main/aime25_part1/results_2025-02-10T14-41-42.284958.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 522211.976529077,
9
- "end_time": 523789.472750289,
10
- "total_evaluation_time_secondes": "1577.49622121203",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|aime25:part1|0": {
18
- "extractive_match": 0.4,
19
- "extractive_match_stderr": 0.13093073414159542
20
- },
21
- "all": {
22
- "extractive_match": 0.4,
23
- "extractive_match_stderr": 0.13093073414159542
24
- }
25
- },
26
- "versions": {
27
- "custom|aime25:part1|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|aime25:part1": {
31
- "name": "aime25:part1",
32
- "prompt_function": "aime_prompt_fn",
33
- "hf_repo": "open-r1/aime_2025_1",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 15,
64
- "effective_num_docs": 15,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|aime25:part1|0": {
71
- "hashes": {
72
- "hash_examples": "fefe56b3589ab695",
73
- "hash_full_prompts": "05f1eb7c44368466",
74
- "hash_input_tokens": "b4165deea4738907",
75
- "hash_cont_tokens": "1d4a0e53cd074474"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 15,
79
- "padded": 0,
80
- "non_padded": 15,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "b2c4142b01e3f1a2",
88
- "hash_full_prompts": "60294767469e4aa7",
89
- "hash_input_tokens": "7b51eefd6817a0f4",
90
- "hash_cont_tokens": "769b9eb7d37aa3f0"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 15,
94
- "padded": 0,
95
- "non_padded": 15,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/main/aime25_part1/results_2025-02-10T14-25-44.819964.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 492050.107517627,
9
- "end_time": 492473.986006278,
10
- "total_evaluation_time_secondes": "423.87848865101114",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|aime25:part1|0": {
18
- "extractive_match": 0.26666666666666666,
19
- "extractive_match_stderr": 0.11818736805705578
20
- },
21
- "all": {
22
- "extractive_match": 0.26666666666666666,
23
- "extractive_match_stderr": 0.11818736805705578
24
- }
25
- },
26
- "versions": {
27
- "custom|aime25:part1|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|aime25:part1": {
31
- "name": "aime25:part1",
32
- "prompt_function": "aime_prompt_fn",
33
- "hf_repo": "open-r1/aime_2025_1",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 15,
64
- "effective_num_docs": 15,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|aime25:part1|0": {
71
- "hashes": {
72
- "hash_examples": "fefe56b3589ab695",
73
- "hash_full_prompts": "05f1eb7c44368466",
74
- "hash_input_tokens": "b4165deea4738907",
75
- "hash_cont_tokens": "028bb3d2bab7de44"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 15,
79
- "padded": 0,
80
- "non_padded": 15,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "b2c4142b01e3f1a2",
88
- "hash_full_prompts": "60294767469e4aa7",
89
- "hash_input_tokens": "7b51eefd6817a0f4",
90
- "hash_cont_tokens": "4c911cd98966edfd"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 15,
94
- "padded": 0,
95
- "non_padded": 15,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/main/aime25_part1/results_2025-02-10T14-26-33.362343.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 492078.59228321,
9
- "end_time": 492331.233111113,
10
- "total_evaluation_time_secondes": "252.64082790300017",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|aime25:part1|0": {
18
- "extractive_match": 0.2,
19
- "extractive_match_stderr": 0.10690449676496976
20
- },
21
- "all": {
22
- "extractive_match": 0.2,
23
- "extractive_match_stderr": 0.10690449676496976
24
- }
25
- },
26
- "versions": {
27
- "custom|aime25:part1|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|aime25:part1": {
31
- "name": "aime25:part1",
32
- "prompt_function": "aime_prompt_fn",
33
- "hf_repo": "open-r1/aime_2025_1",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 15,
64
- "effective_num_docs": 15,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|aime25:part1|0": {
71
- "hashes": {
72
- "hash_examples": "fefe56b3589ab695",
73
- "hash_full_prompts": "05f1eb7c44368466",
74
- "hash_input_tokens": "139d602d169004c6",
75
- "hash_cont_tokens": "8ae57e9bd06056f8"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 15,
79
- "padded": 0,
80
- "non_padded": 15,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "b2c4142b01e3f1a2",
88
- "hash_full_prompts": "60294767469e4aa7",
89
- "hash_input_tokens": "ed4de176b6678e94",
90
- "hash_cont_tokens": "f7d7b76d41d1ebdf"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 15,
94
- "padded": 0,
95
- "non_padded": 15,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/main/aime25_part1/results_2025-02-10T14-34-21.685638.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 440233.623058309,
9
- "end_time": 441323.005363222,
10
- "total_evaluation_time_secondes": "1089.382304912957",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|aime25:part1|0": {
18
- "extractive_match": 0.6,
19
- "extractive_match_stderr": 0.13093073414159542
20
- },
21
- "all": {
22
- "extractive_match": 0.6,
23
- "extractive_match_stderr": 0.13093073414159542
24
- }
25
- },
26
- "versions": {
27
- "custom|aime25:part1|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|aime25:part1": {
31
- "name": "aime25:part1",
32
- "prompt_function": "aime_prompt_fn",
33
- "hf_repo": "open-r1/aime_2025_1",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 15,
64
- "effective_num_docs": 15,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|aime25:part1|0": {
71
- "hashes": {
72
- "hash_examples": "fefe56b3589ab695",
73
- "hash_full_prompts": "05f1eb7c44368466",
74
- "hash_input_tokens": "139d602d169004c6",
75
- "hash_cont_tokens": "b91860e1c754fd67"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 15,
79
- "padded": 0,
80
- "non_padded": 15,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "b2c4142b01e3f1a2",
88
- "hash_full_prompts": "60294767469e4aa7",
89
- "hash_input_tokens": "ed4de176b6678e94",
90
- "hash_cont_tokens": "140ac5fba92f4e60"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 15,
94
- "padded": 0,
95
- "non_padded": 15,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/main/aime25_part1/results_2025-02-10T14-29-17.103312.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 512657.395598037,
9
- "end_time": 513085.318624264,
10
- "total_evaluation_time_secondes": "427.9230262269848",
11
- "model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|aime25:part1|0": {
18
- "extractive_match": 0.4,
19
- "extractive_match_stderr": 0.13093073414159542
20
- },
21
- "all": {
22
- "extractive_match": 0.4,
23
- "extractive_match_stderr": 0.13093073414159542
24
- }
25
- },
26
- "versions": {
27
- "custom|aime25:part1|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|aime25:part1": {
31
- "name": "aime25:part1",
32
- "prompt_function": "aime_prompt_fn",
33
- "hf_repo": "open-r1/aime_2025_1",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 15,
64
- "effective_num_docs": 15,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|aime25:part1|0": {
71
- "hashes": {
72
- "hash_examples": "fefe56b3589ab695",
73
- "hash_full_prompts": "05f1eb7c44368466",
74
- "hash_input_tokens": "139d602d169004c6",
75
- "hash_cont_tokens": "65053f5b67059a4f"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 15,
79
- "padded": 0,
80
- "non_padded": 15,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "b2c4142b01e3f1a2",
88
- "hash_full_prompts": "60294767469e4aa7",
89
- "hash_input_tokens": "ed4de176b6678e94",
90
- "hash_cont_tokens": "9a883faafc8e6ad9"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 15,
94
- "padded": 0,
95
- "non_padded": 15,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-r1/R2-Q7B-GR1-ALL-s1k-5e-5-weight-decay-1e-4/main/aime25_part1/results_2025-02-10T14-41-19.678571.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 492993.939085534,
9
- "end_time": 493408.845872955,
10
- "total_evaluation_time_secondes": "414.90678742097225",
11
- "model_name": "open-r1/R2-Q7B-GR1-ALL-s1k-5e-5-weight-decay-1e-4",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|aime25:part1|0": {
18
- "extractive_match": 0.4,
19
- "extractive_match_stderr": 0.13093073414159542
20
- },
21
- "all": {
22
- "extractive_match": 0.4,
23
- "extractive_match_stderr": 0.13093073414159542
24
- }
25
- },
26
- "versions": {
27
- "custom|aime25:part1|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|aime25:part1": {
31
- "name": "aime25:part1",
32
- "prompt_function": "aime_prompt_fn",
33
- "hf_repo": "open-r1/aime_2025_1",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 15,
64
- "effective_num_docs": 15,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|aime25:part1|0": {
71
- "hashes": {
72
- "hash_examples": "fefe56b3589ab695",
73
- "hash_full_prompts": "1422d222a53ed984",
74
- "hash_input_tokens": "7c22dfb269889f8d",
75
- "hash_cont_tokens": "22eed918963b518b"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 15,
79
- "padded": 0,
80
- "non_padded": 15,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "b2c4142b01e3f1a2",
88
- "hash_full_prompts": "23f99b9ce7f7477f",
89
- "hash_input_tokens": "52c835b66393d8ca",
90
- "hash_cont_tokens": "c0b186deac812972"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 15,
94
- "padded": 0,
95
- "non_padded": 15,
96
- "num_truncated_few_shots": 0
97
- }
98
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/open-thoughts/OpenThinker-7B/main/aime25_part1/results_2025-02-10T14-42-45.258313.json DELETED
@@ -1,98 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": -1,
6
- "max_samples": null,
7
- "job_id": 0,
8
- "start_time": 492559.690834334,
9
- "end_time": 493011.594694595,
10
- "total_evaluation_time_secondes": "451.9038602610235",
11
- "model_name": "open-thoughts/OpenThinker-7B",
12
- "model_sha": "",
13
- "model_dtype": null,
14
- "model_size": null
15
- },
16
- "results": {
17
- "custom|aime25:part1|0": {
18
- "extractive_match": 0.3333333333333333,
19
- "extractive_match_stderr": 0.12598815766974242
20
- },
21
- "all": {
22
- "extractive_match": 0.3333333333333333,
23
- "extractive_match_stderr": 0.12598815766974242
24
- }
25
- },
26
- "versions": {
27
- "custom|aime25:part1|0": 1
28
- },
29
- "config_tasks": {
30
- "custom|aime25:part1": {
31
- "name": "aime25:part1",
32
- "prompt_function": "aime_prompt_fn",
33
- "hf_repo": "open-r1/aime_2025_1",
34
- "hf_subset": "default",
35
- "metric": [
36
- {
37
- "metric_name": "extractive_match",
38
- "higher_is_better": true,
39
- "category": "3",
40
- "use_case": "1",
41
- "sample_level_fn": "sample_level_fn",
42
- "corpus_level_fn": "mean"
43
- }
44
- ],
45
- "hf_revision": null,
46
- "hf_filter": null,
47
- "hf_avail_splits": [
48
- "train"
49
- ],
50
- "trust_dataset": false,
51
- "evaluation_splits": [
52
- "train"
53
- ],
54
- "few_shots_split": null,
55
- "few_shots_select": null,
56
- "generation_size": 32768,
57
- "generation_grammar": null,
58
- "stop_sequence": [],
59
- "num_samples": null,
60
- "suite": [
61
- "custom"
62
- ],
63
- "original_num_docs": 15,
64
- "effective_num_docs": 15,
65
- "must_remove_duplicate_docs": false,
66
- "version": 1
67
- }
68
- },
69
- "summary_tasks": {
70
- "custom|aime25:part1|0": {
71
- "hashes": {
72
- "hash_examples": "fefe56b3589ab695",
73
- "hash_full_prompts": "6c368cbe5f6966ea",
74
- "hash_input_tokens": "557da98a930139d3",
75
- "hash_cont_tokens": "7e747dfb29323b6d"
76
- },
77
- "truncated": 0,
78
- "non_truncated": 15,
79
- "padded": 0,
80
- "non_padded": 15,
81
- "effective_few_shots": 0.0,
82
- "num_truncated_few_shots": 0
83
- }
84
- },
85
- "summary_general": {
86
- "hashes": {
87
- "hash_examples": "b2c4142b01e3f1a2",
88
- "hash_full_prompts": "f6f9b0b47aaa5ecf",
89
- "hash_input_tokens": "3b379384f2f8b84e",
90
- "hash_cont_tokens": "a242afece88f91fd"
91
- },
92
- "truncated": 0,
93
- "non_truncated": 15,
94
- "padded": 0,
95
- "non_padded": 15,
96
- "num_truncated_few_shots": 0
97
- }
98
- }