s1ghhh committed on
Commit 98595da · verified · 1 Parent(s): 390a971

Upload folder using huggingface_hub

check_length.py ADDED
@@ -0,0 +1,29 @@
+ from datasets import load_dataset
+ from transformers import AutoTokenizer
+
+ # Set up the model tokenizer (here Qwen2.5-0.5B-Instruct)
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
+
+ # Load the parquet dataset
+ dataset = load_dataset("parquet", data_files="/workspace/0525_zyw/verl/counting/mk_data/counting_dataset_qwen25_max2048.parquet")
+
+ # Select a split; loading a parquet file this way yields a default 'train' split
+ data = dataset["train"]
+
+ # Record the index and content of samples whose prompts exceed 2048 tokens
+ long_items = []
+
+ for idx, example in enumerate(data):
+     prompt = example.get("prompt", "")
+     tokens = tokenizer(prompt, truncation=False, return_tensors="pt")
+     input_len = tokens.input_ids.shape[1]
+
+     if input_len > 2048:
+         long_items.append({"index": idx, "length": input_len, "prompt": prompt})
+
+ print(f"Found {len(long_items)} items with more than 2048 tokens.")
+
+ # Optional: save the results to a JSON file
+ import json
+ with open("long_prompts.json", "w", encoding="utf-8") as f:
+     json.dump(long_items, f, ensure_ascii=False, indent=2)
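Note: the loop above tokenizes one sample at a time. Below is a minimal sketch of the same check done in batches with `datasets.map`, which is typically much faster on large files; the `data.parquet` path is a placeholder, and the `prompt` column name is carried over from the script.

from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
data = load_dataset("parquet", data_files="data.parquet")["train"]  # placeholder path

def add_length(batch):
    # Tokenize the whole batch at once and record one length per sample.
    enc = tokenizer(batch["prompt"], truncation=False)
    return {"token_len": [len(ids) for ids in enc["input_ids"]]}

data = data.map(add_length, batched=True)
long_subset = data.filter(lambda ex: ex["token_len"] > 2048)
print(f"Found {len(long_subset)} items with more than 2048 tokens.")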
get_parquet.py ADDED
@@ -0,0 +1,267 @@
+ import random
+ from datasets import load_dataset
+ from transformers import AutoTokenizer
+ import re
+ from tqdm import tqdm
+ import pandas as pd
+ import json
+ import concurrent.futures
+
+ # ---------- Helper Functions (Unchanged) ----------
+ def extract_think_and_rest(text):
+     """Extract the <think>...</think> blocks and the text remaining after the last one."""
+     think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
+     last_think_end = 0
+     for match in re.finditer(r"</think>", text):
+         last_think_end = match.end()
+     rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
+     return think_blocks, rest_text
+
+ def extract_think_sections(text: str):
+     think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
+     if think_match:
+         think_content = think_match.group(1).strip()
+         end_pos = think_match.end()
+         post_think_content = text[end_pos:].strip()
+         if not think_content:  # i.e. an empty <think></think>
+             raise ValueError("Empty think block")
+         return [think_content], post_think_content
+     else:
+         raise ValueError("Missing <think> block.")
+
+ def extract_think_and_solution_V2(text: str):
+     pattern = (
+         r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
+         r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
+     )
+     match = re.search(pattern, text, re.DOTALL)
+     if match:
+         think_content = match.group(1).strip()
+         post_think_content = match.group(2).strip()
+         if not think_content:
+             raise ValueError("Empty thought block in V2.")
+         return [think_content], post_think_content
+     else:
+         raise ValueError("Missing required <|begin_of_thought|> or <|begin_of_solution|> blocks.")
+
+ # ---------- Worker function for multithreading (Unchanged) ----------
+ def process_single_item(args):
+     item, dataset_name, tokenizer, format_tokenizer, max_prompt_token_len_config = args
+     try:
+         if dataset_name == "OpenR1-Math-220k":
+             problem = item["problem"].strip()
+             response_full = item["generations"][0].strip()
+             reasoning_blocks, answer = extract_think_sections(response_full)
+         elif dataset_name == "OpenThoughts-114k-math":
+             problem = item["problem"].strip()
+             response_full = item["conversations"][1]["value"].strip()
+             reasoning_blocks, answer = extract_think_and_solution_V2(response_full)
+         elif dataset_name == "reasoning-v1-20m":
+             problem = item.get("prompt", "").strip()
+             response_full = item.get("response", "").strip()
+             reasoning_blocks, answer = extract_think_sections(response_full)
+         elif dataset_name == "OpenThoughts-114k-Code_decontaminated":
+             problem = item["problem"].strip()
+             # response_full = item.get("response", "").strip()
+             # reasoning_blocks, answer = extract_think_sections(response_full)
+             reasoning_blocks = [item["deepseek_reasoning"]]
+             answer = item["deepseek_solution"]
+         elif dataset_name == "Medical-R1-Distill-Data":
+             problem = item["question"].strip()
+             # response_full = item.get("response", "").strip()
+             # reasoning_blocks, answer = extract_think_sections(response_full)
+             reasoning_blocks = [item["reasoning (reasoning_content)"]]
+             answer = item["response (content)"]
+         else:
+             return None
+
+         if not reasoning_blocks or not reasoning_blocks[0]:
+             return None
+
+         reasoning = reasoning_blocks[0].strip()
+         solution = answer.strip()
+
+         input_token_count = len(tokenizer.tokenize(problem))
+         output_token_count = len(tokenizer.tokenize(solution))
+         reasoning_token_count = len(tokenizer.tokenize(reasoning))
+
+         instruct_info = (
+             "Given a <Problem> and its corresponding <Solution>, your task is to predict how many tokens are consumed in the process of arriving at the final <Solution> to the problem. Generally speaking, the more complex the problem is, the more tokens are required.\n"
+             f"<Problem>\n{problem}\n</Problem>\n\n"
+             f"<Solution>\n{solution}\n</Solution>\n"
+             f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
+             "Please provide a detailed chain-of-thought reasoning process and include your thought process within <think> tags. "
+             "Your final answer should be enclosed within <answer> tags.\n\n"
+             "Please return the predicted number of tokens in JSON format: \n```json\n{\"count\": int}\n```\n\n"
+             "Example format:\n"
+             "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited by 1024 tokens] </think>\n"
+             "<answer> Summary of the thought process leading to the final token count and your predicted token count in json format: \n```json\n{\"count\": int}\n```\n [Limited by 512 tokens]\n"
+             "</answer>\n\n"
+             "Let me solve this step by step.\n"
+         )
+
+         cot_info = "<think>"
+
+         messages = [
+             {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
+             {"role": "user", "content": instruct_info.strip()},
+         ]
+
+         prompt = format_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+         prompt += cot_info
+
+         prompt_token_len = len(format_tokenizer.tokenize(prompt))
+
+         if prompt_token_len <= max_prompt_token_len_config - 10:
+             return {
+                 "prompt": prompt,
+                 "ground_truth": reasoning_token_count,
+                 "data_source": dataset_name,
+                 "prompt_token_len": prompt_token_len,
+             }
+         return None
+     except ValueError:
+         return None
+     except Exception as e:
+         print(f"Error processing item for {dataset_name}: {e}")
+         return None
+
+ # ---------- Parameter settings (unchanged in purpose, but test_size's role in splitting changes) ----------
+ # train_size = 100000  # This now primarily influences num_items_to_sample_raw
+ test_size = 1000  # This now primarily influences num_items_to_sample_raw; the actual test split is fixed
+ max_prompt_token_len = 4096
+ random_seed = 42  # This is the fixed random seed
+ NUM_THREADS = 16
+
+ # Set the fixed random seed for Python's `random` module
+ # This will affect `random.sample` used for splitting
+ random.seed(random_seed)
+
+ # ---------- Load and shuffle the datasets (logic largely unchanged; the seed is used by dataset.shuffle) ----------
+ datasets_config = {
+     # "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
+     # "reasoning-v1-20m": "/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data",
+     # "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
+     "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
+     # "Medical-R1-Distill-Data": "/workspace/0407_nips/data_preprocess/Medical-R1-Distill-Data"
+ }
+
+ for name, path in datasets_config.items():
+     print(f"Processing dataset: {name}")
+     try:
+         dataset_hf = load_dataset(path, trust_remote_code=True)["train"]
+     except Exception as e:
+         print(f"Error loading dataset {name} from {path}: {e}")
+         continue
+     print(len(dataset_hf))
+     # num_items_to_sample_raw = train_size + test_size + 1000
+
+     # actual_num_to_sample = min(num_items_to_sample_raw, len(dataset_hf))
+     # if actual_num_to_sample < num_items_to_sample_raw:
+     #     print(f"Warning: Dataset {name} has only {len(dataset_hf)} items. Sampling {actual_num_to_sample} instead of {num_items_to_sample_raw}.")
+
+     # if actual_num_to_sample == 0:
+     #     print(f"Skipping dataset {name} as it has no items or actual_num_to_sample is 0.")
+     #     continue
+
+     # # Shuffling raw dataset with the fixed seed
+     # dataset_selected = dataset_hf.shuffle(seed=random_seed).select(range(actual_num_to_sample))
+     # .select(range(102000))
+     dataset_selected = dataset_hf.shuffle(seed=random_seed)
+
+     tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
+     format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
+
+     processed_item_data_list = []
+     tasks_args_list = []
+
+     print(f"Preparing tasks for {name}...")
+     for item in dataset_selected:
+         tasks_args_list.append((item, name, tokenizer, format_tokenizer, max_prompt_token_len))
+
+     print(f"Submitting {len(tasks_args_list)} tasks to thread pool for {name}...")
+     with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
+         future_to_item_args = {executor.submit(process_single_item, args): args for args in tasks_args_list}
+
+         for future in tqdm(concurrent.futures.as_completed(future_to_item_args), total=len(tasks_args_list), desc=f"Processing items for {name}"):
+             try:
+                 result = future.result()
+                 if result:
+                     processed_item_data_list.append(result)
+             except Exception as exc:
+                 # item_arg_tuple = future_to_item_args[future]  # Uncomment if needed for debugging
+                 print(f'Item generated an exception during future.result(): {exc}')
+
+
+     records = []
+     count = 0
+     print(f"Collected {len(processed_item_data_list)} valid processed items for {name}. Assigning IDs...")
+     # The original code had a cap here. We keep it.
+     # This `num_items_to_sample_raw` acts as an upper limit on total records considered for splitting.
+     for item_data in processed_item_data_list:
+         # if len(records) >= num_items_to_sample_raw:
+         #     break
+         item_data["ids"] = f"{name}_{count}"
+         records.append(item_data)
+         count += 1
+
+     if not records:
+         print(f"No valid records generated for dataset {name} after filtering and ID assignment. Skipping saving.")
+         continue
+
+     # ---------- MODIFIED: Split into train and test sets ----------
+     # Test set is fixed at 1000 (or fewer if not enough data)
+     # Training set is everything else.
+     # random_seed is already set globally for the `random` module.
+
+     target_test_set_size = 1000  # Your requirement
+     num_available_records = len(records)
+
+     train_records = []
+     test_records = []
+
+     if num_available_records == 0:
+         print(f"No records available for splitting for {name}.")
+     elif num_available_records <= target_test_set_size:
+         # If we have 1000 or fewer records, all go to the test set, train is empty
+         print(f"Warning: Only {num_available_records} records available for {name}. All will be used for the test set.")
+         test_records = list(records)  # Make a copy
+         train_records = []
+     else:
+         # We have more than 1000 records. Sample 1000 for the test set.
+         # `random.sample` uses the seed set by `random.seed(random_seed)`
+         test_indices = sorted(random.sample(range(num_available_records), target_test_set_size))
+
+         current_test_idx_ptr = 0
+         for i in range(num_available_records):
+             if current_test_idx_ptr < len(test_indices) and i == test_indices[current_test_idx_ptr]:
+                 test_records.append(records[i])
+                 current_test_idx_ptr += 1
+             else:
+                 train_records.append(records[i])
+
+         # Sanity check
+         if len(test_records) != target_test_set_size:
+             print(f"Error: Test set size mismatch. Expected {target_test_set_size}, got {len(test_records)}")
+         if len(train_records) != num_available_records - target_test_set_size:
+             print(f"Error: Train set size mismatch. Expected {num_available_records - target_test_set_size}, got {len(train_records)}")
+
+
+     # ---------- Save (Unchanged other than variable names if needed) ----------
+     if train_records:
+         df_train = pd.DataFrame(train_records)
+         df_train.to_parquet(f"v2_train_counting_dataset_{name}_{len(df_train)}.parquet", index=False)
+     else:
+         print(f"No training records to save for {name}.")
+
+     if test_records:
+         df_test = pd.DataFrame(test_records)
+         df_test.to_parquet(f"v2_test_counting_dataset_{name}_{len(df_test)}.parquet", index=False)
+     else:
+         print(f"No test records to save for {name}.")
+
+     print(f"✅ Successfully processed dataset {name}")
+     print(f" Saved Train samples: {len(train_records)}, Test samples: {len(test_records)}")
+     print("-" * 30)
+
+ print("All datasets processed.")
get_parquet_single.py ADDED
@@ -0,0 +1,167 @@
+ import random
+ from datasets import load_dataset
+ from transformers import AutoTokenizer
+ import re
+ from tqdm import tqdm
+ import pandas as pd
+ import json
+
+ def extract_think_and_rest(text):
+     """Extract the <think>...</think> blocks and the text remaining after the last one."""
+     think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
+     last_think_end = 0
+     for match in re.finditer(r"</think>", text):
+         last_think_end = match.end()
+     rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
+     return think_blocks, rest_text
+
+ def extract_think_sections(text: str):
+     """Return ([think_content], post_think_content), or (None, text) when no <think> block exists."""
+     think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
+     if think_match:
+         think_content = think_match.group(1).strip()
+         end_pos = think_match.end()
+         post_think_content = text[end_pos:].strip()
+         # Return the think content as a one-element list so that the caller's
+         # reasoning_blocks[0] yields the whole block, not its first character.
+         return [think_content], post_think_content
+     # No <think> block: return None so the caller can skip the sample.
+     return None, text.strip()
+
+
+ def extract_think_and_solution_V2(text: str):
+     """Return ([thought], solution) for <|begin_of_thought|>/<|begin_of_solution|> markup, or (None, text)."""
+     pattern = (
+         r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
+         r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
+     )
+     match = re.search(pattern, text, re.DOTALL)
+     if match:
+         think_content = match.group(1).strip()
+         post_think_content = match.group(2).strip()
+         return [think_content], post_think_content
+     # Missing the required <|begin_of_thought|> or <|begin_of_solution|> blocks.
+     return None, text.strip()
+
+
+ # ---------- Parameter settings ----------
+ train_size = 110000  # number of training samples
+ test_size = 1000  # number of test samples
+ max_prompt_token_len = 2048
+ random_seed = 42
+ num_workers = 16
+ # ---------- Load and shuffle the datasets ----------
+
+ datasets_config = {
+     "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
+     # "reasoning-v1-20m": "/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data",
+     "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
+     # "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
+     # "Medical-R1-Distill-Data": "/workspace/0407_nips/data_preprocess/Medical-R1-Distill-Data"
+ }
+
+ for name, path in datasets_config.items():
+     print(f"{name}")
+     dataset = load_dataset(path)["train"]
+     # shuffled_dataset = dataset.shuffle(seed=42)
+
+     # dataset = load_dataset('/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data')["train"]
+     total_size = train_size + test_size + 1000
+     dataset = dataset.shuffle(seed=random_seed).select(range(total_size))
+
+     # ---------- Initialize the tokenizers ----------
+     tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
+     format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
+
+     records = []
+     count = 0
+
+     # ---------- Process samples ----------
+     for item in tqdm(dataset, desc="Processing samples"):
+         if len(records) >= total_size:
+             break
+         if name == "OpenR1-Math-220k":
+             problem = item["problem"].strip()
+             response_full = item["generations"][0].strip()
+
+             reasoning_blocks, answer = extract_think_sections(response_full)
+             if not reasoning_blocks:
+                 continue
+             reasoning = reasoning_blocks[0].strip()
+             solution = answer.strip()
+         elif name == "OpenThoughts-114k-math":
+             problem = item["problem"].strip()
+             response_full = item["conversations"][1]["value"].strip()
+
+             reasoning_blocks, answer = extract_think_and_solution_V2(response_full)
+             if not reasoning_blocks:
+                 continue
+             reasoning = reasoning_blocks[0].strip()
+             solution = answer.strip()
+         elif name == "reasoning-v1-20m":
+             problem = item.get("prompt", "").strip()
+             response_full = item.get("response", "").strip()
+
+             reasoning_blocks, answer = extract_think_sections(response_full)
+             if not reasoning_blocks:
+                 continue
+             reasoning = reasoning_blocks[0].strip()
+             solution = answer.strip()
+
+         input_token_count = len(tokenizer.tokenize(problem))
+         output_token_count = len(tokenizer.tokenize(solution))
+         reasoning_token_count = len(tokenizer.tokenize(reasoning))
+
+         instruct_info = (
+             "Your task is to estimate how many tokens a reasoning model would use to solve the following Problem and Solution.\n"
+             "Please return the predicted number of tokens in JSON format: ```json\n{\"count\": int}\n```\n\n"
+             f"<Problem>\n{problem}\n</Problem>\n\n"
+             f"<Solution>\n{solution}\n</Solution>\n"
+             f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
+             "Please provide a detailed chain-of-thought reasoning process and include your thought process within `<think>` tags. "
+             "Your final answer should be enclosed within `<answer>` tags.\n\n"
+             "Ensure that your counting logic is sound and that your explanation is token-efficient.\n\n"
+             "Example format:\n"
+             "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited by 1024 tokens] </think>\n"
+             "<answer> Summary of the thought process leading to the final token count and your predicted token count in json format: ```json\n{\"count\": int}\n``` [Limited by 512 tokens]\n"
+             "</answer>\n\n"
+         )
+
+         cot_info = "Let me solve this step by step.\n"
+
+         messages = [
+             {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
+             {"role": "user", "content": instruct_info.strip()},
+         ]
+
+         # Use the Qwen chat template, matching the tokenizer used for the length check below.
+         prompt = format_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+         prompt += cot_info
+
+         prompt_token_len = len(format_tokenizer.tokenize(prompt))
+
+         if prompt_token_len <= max_prompt_token_len - 10:
+             records.append({
+                 "prompt": prompt,
+                 "ground_truth": reasoning_token_count,
+                 "data_source": name,
+                 "ids": f"{name}_{count}",
+                 "prompt_token_len": prompt_token_len,
+             })
+             count += 1
+
+         if len(records) >= total_size:
+             break
+
+     # ---------- Split into train and test sets ----------
+     train_records = records[:train_size]
+     test_records = records[train_size:train_size + test_size]
+
+     # ---------- Save ----------
+     df_train = pd.DataFrame(train_records)
+     df_test = pd.DataFrame(test_records)
+
+     df_train.to_parquet(f"train_counting_dataset_{name}_{train_size}.parquet", index=False)
+     df_test.to_parquet(f"test_counting_dataset_{name}_{test_size}.parquet", index=False)
+
+     print("✅ Successfully generated the train and test datasets")
+     print(f"Train samples: {len(df_train)}, Test samples: {len(df_test)}")
load_check.py ADDED
@@ -0,0 +1,25 @@
+ import pandas as pd
+
+ # Show all columns without truncation
+ pd.set_option('display.max_columns', None)
+
+ # Show all rows (no effect on head(), but useful when displaying the whole df)
+ pd.set_option('display.max_rows', None)
+
+ # Do not truncate column contents; show full strings
+ pd.set_option('display.max_colwidth', None)
+
+ # Set a generous display width to avoid wrapping
+ pd.set_option('display.width', 1000)
+ # Read the parquet file (the engine, e.g. pyarrow or fastparquet, is detected automatically)
+ df = pd.read_parquet("/workspace/0525_zyw/verl/counting/mk_data/v2_train_counting_dataset_OpenR1-Math-220k_90258.parquet", engine="auto")
+
+ print(df.shape)
+ # print(df.columns)
+ # # Peek at the data
+ # print(df.loc[0])  # head() shows the first 5 rows by default; pass a count, e.g. df.head(10)
+ # for i, row in df.iterrows():
+ #     print(f"Row {i}:\n{row['prompt']}\n---")
+ #     if i > 1:  # only print the first few rows
+ #         break
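A minimal sketch (same file path as above) for summarizing the `prompt_token_len` and `data_source` columns instead of printing individual rows:

import pandas as pd

df = pd.read_parquet("/workspace/0525_zyw/verl/counting/mk_data/v2_train_counting_dataset_OpenR1-Math-220k_90258.parquet")
print(df["prompt_token_len"].describe())  # distribution of prompt lengths
print(df["data_source"].value_counts())   # rows per source dataset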
long_prompts.json ADDED
@@ -0,0 +1 @@
+ []
merge_train.py ADDED
@@ -0,0 +1,44 @@
+ import pandas as pd
+ import glob
+ import random
+
+ random_seed = 42
+ sample_size = 15000
+
+ # 1. Find all the train parquet files
+ parquet_files = glob.glob("v2_train_counting_dataset_*.parquet")
+
+ selected_parquet_files = []
+ for parquet_file in parquet_files:
+     if "v2_train_counting_dataset_OpenThoughts-114k-math_88120.parquet" not in parquet_file:
+         selected_parquet_files.append(parquet_file)
+ print("Parquet files found:", selected_parquet_files)
+
+ # 2. Merge all the data
+ all_data = []
+ for file in selected_parquet_files:
+     print(file)
+     df = pd.read_parquet(file)
+     all_data.append(df)
+ df_all = pd.concat(all_data, ignore_index=True)
+ print("Total rows after merging:", len(df_all))
+
+ # 3. Group by data_source and sample up to sample_size (15k) rows per group
+ sampled_dfs = []
+ for name, group in df_all.groupby("data_source"):
+     if len(group) > sample_size:
+         sampled = group.sample(n=sample_size, random_state=random_seed)
+     else:
+         sampled = group
+     sampled_dfs.append(sampled)
+     print(f"{name}: {len(group)} original rows, {len(sampled)} sampled")
+
+ # 4. Merge the sampled data
+ df_sampled = pd.concat(sampled_dfs, ignore_index=True)
+ print("Total rows after sampling:", len(df_sampled))
+
+ shuffled_df = df_sampled.sample(frac=1, random_state=random_seed).reset_index(drop=True)
+
+ # 5. Save
+ shuffled_df.to_parquet("merged_sampled_4datasets_15k_each.parquet", index=False)
+ print("Saved to merged_sampled_4datasets_15k_each.parquet")
v2_test_counting_dataset_Medical-R1-Distill-Data_1000.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89bc5389f4e66b52170d2e0c9922171cb7f63a679e4520ab99209bafda362819
+ size 1079992
v2_test_counting_dataset_OpenR1-Math-220k_1000.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2454554feb69d07e233ef4a3f40f76d9bf537ed052e8bbc7416c420b4dc92b3d
+ size 952195
v2_test_counting_dataset_OpenThoughts-114k-Code_decontaminated_1000.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:beda857c76c38d5a06916b487bafb2cffb76cb6fc31b6fe89698a6f54f8bcf2d
+ size 2202832
v2_test_counting_dataset_OpenThoughts-114k-math_1000.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90069eb9de435395355312e63597e014d3801b293e9bf5e2d9263ced6a67d19c
+ size 998292
v2_test_counting_dataset_reasoning-v1-20m_1000.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b98f56c5005e65956ac406021f6e9a98744da8aaec748ddf4e7086970e3cdc2
+ size 1665071
v2_train_counting_dataset_Medical-R1-Distill-Data_21000.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c1c02b93013d57b2dd8c874a13a86e2804b9e25931903945dd93bf0606312fb
+ size 21995473
v2_train_counting_dataset_OpenR1-Math-220k_90258.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b940e63fcaed0d5eff15e40123e63b1dad99ccf9e37071c7d8a308e760e90db9
+ size 84122607
v2_train_counting_dataset_OpenThoughts-114k-Code_decontaminated_15372.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:18a52d5aa00a9140ca4c0c20a03f8ec7edac9ffc62eb02036ecef0c0c7e12eb1
+ size 33672180
v2_train_counting_dataset_OpenThoughts-114k-math_88120.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a1b7889afc082e16c4e70524a67f99784b825e7a799cb9314ba1fae32688a2b
+ size 85576595
v2_train_counting_dataset_reasoning-v1-20m_100982.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7808bc50c3f2edf5d4d80aac04cdfb13820d1b5f65d85e17ae6f00d680ac4466
+ size 166027524
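These `.parquet` entries are Git LFS pointer files; the actual data lives in LFS storage. A minimal sketch (the `repo_id` is hypothetical, standing in for the repo this commit belongs to) for fetching one file from the Hub and loading it with pandas:

import pandas as pd
from huggingface_hub import hf_hub_download

local_path = hf_hub_download(
    repo_id="user/counting-dataset",  # hypothetical: the dataset repo for this commit
    filename="v2_test_counting_dataset_OpenR1-Math-220k_1000.parquet",
    repo_type="dataset",
)
df = pd.read_parquet(local_path)
print(df.shape)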