Upload folder using huggingface_hub
- check_length.py +29 -0
- get_parquet.py +267 -0
- get_parquet_single.py +167 -0
- load_check.py +25 -0
- long_prompts.json +1 -0
- merge_train.py +44 -0
- v2_test_counting_dataset_Medical-R1-Distill-Data_1000.parquet +3 -0
- v2_test_counting_dataset_OpenR1-Math-220k_1000.parquet +3 -0
- v2_test_counting_dataset_OpenThoughts-114k-Code_decontaminated_1000.parquet +3 -0
- v2_test_counting_dataset_OpenThoughts-114k-math_1000.parquet +3 -0
- v2_test_counting_dataset_reasoning-v1-20m_1000.parquet +3 -0
- v2_train_counting_dataset_Medical-R1-Distill-Data_21000.parquet +3 -0
- v2_train_counting_dataset_OpenR1-Math-220k_90258.parquet +3 -0
- v2_train_counting_dataset_OpenThoughts-114k-Code_decontaminated_15372.parquet +3 -0
- v2_train_counting_dataset_OpenThoughts-114k-math_88120.parquet +3 -0
- v2_train_counting_dataset_reasoning-v1-20m_100982.parquet +3 -0
check_length.py
ADDED
@@ -0,0 +1,29 @@
from datasets import load_dataset
from transformers import AutoTokenizer
import json

# Set up the model tokenizer (Qwen2.5-0.5B-Instruct)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# Load the parquet dataset
dataset = load_dataset("parquet", data_files="/workspace/0525_zyw/verl/counting/mk_data/counting_dataset_qwen25_max2048.parquet")

# Pick a split, e.g. the default 'train'
data = dataset["train"]

# Record the index, length, and content of samples exceeding 2048 tokens
long_items = []

for idx, example in enumerate(data):
    prompt = example.get("prompt", "")
    tokens = tokenizer(prompt, truncation=False, return_tensors="pt")
    input_len = tokens.input_ids.shape[1]

    if input_len > 2048:
        long_items.append({"index": idx, "length": input_len, "prompt": prompt})

print(f"Found {len(long_items)} items with more than 2048 tokens.")

# Optional: save the results to a JSON file
with open("long_prompts.json", "w", encoding="utf-8") as f:
    json.dump(long_items, f, ensure_ascii=False, indent=2)
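Note: iterating sample-by-sample with return_tensors="pt" is slow on large files. A minimal batched sketch of the same length check, assuming the same tokenizer, dataset object, and "prompt" column as above:

# Sketch: batched length check. Calling the tokenizer on a list of strings
# returns one input_ids list per string when no tensors are requested.
def count_tokens(batch):
    ids = tokenizer(batch["prompt"], truncation=False)["input_ids"]
    return {"prompt_len": [len(x) for x in ids]}

lengths = data.map(count_tokens, batched=True, batch_size=256)
too_long = [i for i, n in enumerate(lengths["prompt_len"]) if n > 2048]
print(f"Found {len(too_long)} items with more than 2048 tokens.")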
get_parquet.py
ADDED
@@ -0,0 +1,267 @@
import random
from datasets import load_dataset
from transformers import AutoTokenizer
import re
from tqdm import tqdm
import pandas as pd
import json
import concurrent.futures

# ---------- Helper Functions (Unchanged) ----------
def extract_think_and_rest(text):
    """Extract the <think>...</think> blocks and the remaining text."""
    think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    last_think_end = 0
    for match in re.finditer(r"</think>", text):
        last_think_end = match.end()
    rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
    return think_blocks, rest_text

def extract_think_sections(text: str):
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        end_pos = think_match.end()
        post_think_content = text[end_pos:].strip()
        if not think_content:  # i.e. an empty <think></think>
            raise ValueError("Empty think block.")
        return [think_content], post_think_content
    else:
        raise ValueError("Missing <think> block.")

def extract_think_and_solution_V2(text: str):
    pattern = (
        r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
        r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        think_content = match.group(1).strip()
        post_think_content = match.group(2).strip()
        if not think_content:
            raise ValueError("Empty thought block in V2.")
        return [think_content], post_think_content
    else:
        raise ValueError("Missing required <|begin_of_thought|> or <|begin_of_solution|> blocks.")

# ---------- Worker function for multithreading (Unchanged) ----------
def process_single_item(args):
    item, dataset_name, tokenizer, format_tokenizer, max_prompt_token_len_config = args
    try:
        if dataset_name == "OpenR1-Math-220k":
            problem = item["problem"].strip()
            response_full = item["generations"][0].strip()
            reasoning_blocks, answer = extract_think_sections(response_full)
        elif dataset_name == "OpenThoughts-114k-math":
            problem = item["problem"].strip()
            response_full = item["conversations"][1]["value"].strip()
            reasoning_blocks, answer = extract_think_and_solution_V2(response_full)
        elif dataset_name == "reasoning-v1-20m":
            problem = item.get("prompt", "").strip()
            response_full = item.get("response", "").strip()
            reasoning_blocks, answer = extract_think_sections(response_full)
        elif dataset_name == "OpenThoughts-114k-Code_decontaminated":
            problem = item["problem"].strip()
            reasoning_blocks = [item["deepseek_reasoning"]]
            answer = item["deepseek_solution"]
        elif dataset_name == "Medical-R1-Distill-Data":
            problem = item["question"].strip()
            reasoning_blocks = [item["reasoning (reasoning_content)"]]
            answer = item["response (content)"]
        else:
            return None

        if not reasoning_blocks or not reasoning_blocks[0]:
            return None

        reasoning = reasoning_blocks[0].strip()
        solution = answer.strip()

        input_token_count = len(tokenizer.tokenize(problem))
        output_token_count = len(tokenizer.tokenize(solution))
        reasoning_token_count = len(tokenizer.tokenize(reasoning))

        instruct_info = (
            "Given a <Problem> and its corresponding <Solution>, your task is to predict how many tokens are consumed in the process of arriving at the final <Solution> to the problem. Generally speaking, the more complex the problem is, the more tokens are required.\n"
            f"<Problem>\n{problem}\n</Problem>\n\n"
            f"<Solution>\n{solution}\n</Solution>\n"
            f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
            "Please provide a detailed chain-of-thought reasoning process and include your thought process within <think> tags. "
            "Your final answer should be enclosed within <answer> tags.\n\n"
            "Please return the predicted number of tokens in JSON format: \n```json\n{\"count\": int}\n```\n\n"
            "Example format:\n"
            "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited by 1024 tokens] </think>\n"
            "<answer> Summary of the thought process leading to the final token count and your predicted token count in json format: \n```json\n{\"count\": int}\n```\n [Limited by 512 tokens]\n"
            "</answer>\n\n"
            "Let me solve this step by step.\n"
        )

        cot_info = "<think>"

        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": instruct_info.strip()},
        ]

        prompt = format_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompt += cot_info

        prompt_token_len = len(format_tokenizer.tokenize(prompt))

        if prompt_token_len <= max_prompt_token_len_config - 10:
            return {
                "prompt": prompt,
                "ground_truth": reasoning_token_count,
                "data_source": dataset_name,
                "prompt_token_len": prompt_token_len,
            }
        return None
    except ValueError:
        return None
    except Exception as e:
        print(f"Error processing item for {dataset_name}: {e}")
        return None

# ---------- Parameter settings (unchanged in purpose, but test_size's role in splitting has changed) ----------
# train_size = 100000  # This previously influenced num_items_to_sample_raw
test_size = 1000  # This previously influenced num_items_to_sample_raw; the actual test split is now fixed
max_prompt_token_len = 4096
random_seed = 42  # This is the fixed random seed
NUM_THREADS = 16

# Set the fixed random seed for Python's `random` module.
# This affects `random.sample`, which is used for the train/test split.
random.seed(random_seed)

# ---------- Load and shuffle the datasets (logic largely unchanged; the seed is used by dataset.shuffle) ----------
datasets_config = {
    # "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
    # "reasoning-v1-20m": "/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data",
    # "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
    "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
    # "Medical-R1-Distill-Data": "/workspace/0407_nips/data_preprocess/Medical-R1-Distill-Data"
}

for name, path in datasets_config.items():
    print(f"Processing dataset: {name}")
    try:
        dataset_hf = load_dataset(path, trust_remote_code=True)["train"]
    except Exception as e:
        print(f"Error loading dataset {name} from {path}: {e}")
        continue
    print(len(dataset_hf))

    # The earlier per-dataset subsampling (num_items_to_sample_raw = train_size + test_size + 1000,
    # followed by .select(range(actual_num_to_sample))) is commented out;
    # the full dataset is shuffled with the fixed seed and processed.
    dataset_selected = dataset_hf.shuffle(seed=random_seed)

    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
    format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    processed_item_data_list = []
    tasks_args_list = []

    print(f"Preparing tasks for {name}...")
    for item in dataset_selected:
        tasks_args_list.append((item, name, tokenizer, format_tokenizer, max_prompt_token_len))

    print(f"Submitting {len(tasks_args_list)} tasks to thread pool for {name}...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        future_to_item_args = {executor.submit(process_single_item, args): args for args in tasks_args_list}

        for future in tqdm(concurrent.futures.as_completed(future_to_item_args), total=len(tasks_args_list), desc=f"Processing items for {name}"):
            try:
                result = future.result()
                if result:
                    processed_item_data_list.append(result)
            except Exception as exc:
                print(f'Item generated an exception during future.result(): {exc}')

    records = []
    count = 0
    print(f"Collected {len(processed_item_data_list)} valid processed items for {name}. Assigning IDs...")
    # The earlier cap on total records (num_items_to_sample_raw) is commented out;
    # every valid processed item gets an ID.
    for item_data in processed_item_data_list:
        item_data["ids"] = f"{name}_{count}"
        records.append(item_data)
        count += 1

    if not records:
        print(f"No valid records generated for dataset {name} after filtering and ID assignment. Skipping saving.")
        continue

    # ---------- MODIFIED: split into train and test sets ----------
    # The test set is fixed at 1000 records (or fewer if not enough data);
    # the training set is everything else.
    # random_seed is already set globally for the `random` module.
    target_test_set_size = 1000
    num_available_records = len(records)

    train_records = []
    test_records = []

    if num_available_records == 0:
        print(f"No records available for splitting for {name}.")
    elif num_available_records <= target_test_set_size:
        # With 1000 or fewer records, everything goes to the test set and train stays empty.
        print(f"Warning: Only {num_available_records} records available for {name}. All will be used for the test set.")
        test_records = list(records)  # make a copy
        train_records = []
    else:
        # More than 1000 records: sample 1000 indices for the test set.
        # `random.sample` uses the seed set by `random.seed(random_seed)`.
        test_indices = sorted(random.sample(range(num_available_records), target_test_set_size))

        current_test_idx_ptr = 0
        for i in range(num_available_records):
            if current_test_idx_ptr < len(test_indices) and i == test_indices[current_test_idx_ptr]:
                test_records.append(records[i])
                current_test_idx_ptr += 1
            else:
                train_records.append(records[i])

        # Sanity checks
        if len(test_records) != target_test_set_size:
            print(f"Error: Test set size mismatch. Expected {target_test_set_size}, got {len(test_records)}")
        if len(train_records) != num_available_records - target_test_set_size:
            print(f"Error: Train set size mismatch. Expected {num_available_records - target_test_set_size}, got {len(train_records)}")

    # ---------- Save ----------
    if train_records:
        df_train = pd.DataFrame(train_records)
        df_train.to_parquet(f"v2_train_counting_dataset_{name}_{len(df_train)}.parquet", index=False)
    else:
        print(f"No training records to save for {name}.")

    if test_records:
        df_test = pd.DataFrame(test_records)
        df_test.to_parquet(f"v2_test_counting_dataset_{name}_{len(df_test)}.parquet", index=False)
    else:
        print(f"No test records to save for {name}.")

    print(f"✅ Successfully processed dataset {name}")
    print(f"   Saved Train samples: {len(train_records)}, Test samples: {len(test_records)}")
    print("-" * 30)

print("All datasets processed.")
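Note: the fixed-seed split above walks a sorted index list with a pointer; a set-membership partition gives the same result and is easier to verify in isolation. A minimal standalone sketch with hypothetical toy records:

import random

random.seed(42)
records = [{"ids": f"toy_{i}"} for i in range(10)]
target_test_set_size = 3

# random.sample with a fixed seed makes the chosen test indices reproducible
test_idx = set(random.sample(range(len(records)), target_test_set_size))
test_records = [r for i, r in enumerate(records) if i in test_idx]
train_records = [r for i, r in enumerate(records) if i not in test_idx]
print(len(train_records), len(test_records))  # 7 3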
get_parquet_single.py
ADDED
@@ -0,0 +1,167 @@
import random
from datasets import load_dataset
from transformers import AutoTokenizer
import re
from tqdm import tqdm
import pandas as pd
import json

def extract_think_and_rest(text):
    """Extract the <think>...</think> blocks and the remaining text."""
    think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    last_think_end = 0
    for match in re.finditer(r"</think>", text):
        last_think_end = match.end()
    rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
    return think_blocks, rest_text

def extract_think_sections(text: str):
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        post_think_content = text[think_match.end():].strip()
        # Wrap the think content in a list so callers can index block 0
        # (indexing a bare string would silently yield its first character).
        return [think_content], post_think_content
    # No <think> block: signal failure so the processing loop can skip the sample.
    return None, text.strip()


def extract_think_and_solution_V2(text: str):
    pattern = (
        r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
        r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        think_content = match.group(1).strip()
        post_think_content = match.group(2).strip()
        return [think_content], post_think_content
    # Missing <|begin_of_thought|>/<|begin_of_solution|> blocks: skip upstream.
    return None, text.strip()


# ---------- Parameter settings ----------
train_size = 110000  # number of training samples
test_size = 1000     # number of test samples
max_prompt_token_len = 2048
random_seed = 42
num_workers = 16

# ---------- Load and shuffle the datasets ----------
datasets_config = {
    "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
    # "reasoning-v1-20m": "/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data",
    "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
    # "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
    # "Medical-R1-Distill-Data": "/workspace/0407_nips/data_preprocess/Medical-R1-Distill-Data"
}

for name, path in datasets_config.items():
    print(f"{name}")
    dataset = load_dataset(path)["train"]

    # Extra 1000 items as headroom for samples dropped by filtering
    total_size = train_size + test_size + 1000
    dataset = dataset.shuffle(seed=random_seed).select(range(total_size))

    # ---------- Initialize tokenizers ----------
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
    format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    records = []
    count = 0

    # ---------- Process samples ----------
    for item in tqdm(dataset, desc="Processing samples"):
        if len(records) >= total_size:
            break
        if name == "OpenR1-Math-220k":
            problem = item["problem"].strip()
            response_full = item["generations"][0].strip()

            reasoning_blocks, answer = extract_think_sections(response_full)
            if not reasoning_blocks:
                continue
            reasoning = reasoning_blocks[0].strip()
            solution = answer.strip()
        elif name == "OpenThoughts-114k-math":
            problem = item["problem"].strip()
            response_full = item["conversations"][1]["value"].strip()

            reasoning_blocks, answer = extract_think_and_solution_V2(response_full)
            if not reasoning_blocks:
                continue
            reasoning = reasoning_blocks[0].strip()
            solution = answer.strip()
        elif name == "reasoning-v1-20m":
            problem = item.get("prompt", "").strip()
            response_full = item.get("response", "").strip()

            reasoning_blocks, answer = extract_think_sections(response_full)
            if not reasoning_blocks:
                continue
            reasoning = reasoning_blocks[0].strip()
            solution = answer.strip()
        else:
            continue  # unhandled data source

        input_token_count = len(tokenizer.tokenize(problem))
        output_token_count = len(tokenizer.tokenize(solution))
        reasoning_token_count = len(tokenizer.tokenize(reasoning))

        instruct_info = (
            "Your task is to estimate how many tokens a reasoning model would use to solve the following Problem and Solution.\n"
            "Please return the predicted number of tokens in JSON format: ```json\n{\"count\": int}\n```\n\n"
            f"<Problem>\n{problem}\n</Problem>\n\n"
            f"<Solution>\n{solution}\n</Solution>\n"
            f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
            "Please provide a detailed chain-of-thought reasoning process and include your thought process within `<think>` tags. "
            "Your final answer should be enclosed within `<answer>` tags.\n\n"
            "Ensure that your counting logic is sound and that your explanation is token-efficient.\n\n"
            "Example format:\n"
            "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited by 1024 tokens] </think>\n"
            "<answer> Summary of the thought process leading to the final token count and your predicted token count in json format: ```json\n{\"count\": int}\n``` [Limited by 512 tokens]\n"
            "</answer>\n\n"
        )

        cot_info = "Let me solve this step by step.\n"

        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": instruct_info.strip()},
        ]

        # Build the prompt with the same Qwen chat tokenizer that measures its length below
        prompt = format_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompt += cot_info

        prompt_token_len = len(format_tokenizer.tokenize(prompt))

        if prompt_token_len <= max_prompt_token_len - 10:
            records.append({
                "prompt": prompt,
                "ground_truth": reasoning_token_count,
                "data_source": name,
                "ids": f"{name}_{count}",
                "prompt_token_len": prompt_token_len,
            })
            count += 1

        if len(records) >= total_size:
            break

    # ---------- Split into train and test sets ----------
    train_records = records[:train_size]
    test_records = records[train_size:train_size + test_size]

    # ---------- Save ----------
    df_train = pd.DataFrame(train_records)
    df_test = pd.DataFrame(test_records)

    df_train.to_parquet(f"train_counting_dataset_{name}_{train_size}.parquet", index=False)
    df_test.to_parquet(f"test_counting_dataset_{name}_{test_size}.parquet", index=False)

    print("✅ Train and test datasets generated successfully")
    print(f"Train samples: {len(df_train)}, Test samples: {len(df_test)}")
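Note: a quick sanity check of the extraction helpers on synthetic strings (hypothetical inputs, not drawn from the datasets), assuming the list-returning signatures above:

blocks, rest = extract_think_sections("<think>add up the steps</think>The answer is 42.")
print(blocks[0])  # add up the steps
print(rest)       # The answer is 42.

blocks, rest = extract_think_sections("no tags here")
print(blocks)     # None -> the processing loop skips such samples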
load_check.py
ADDED
@@ -0,0 +1,25 @@
import pandas as pd

# Show all columns without omission
pd.set_option('display.max_columns', None)

# Show all rows (no effect on head(), but useful when printing a whole df)
pd.set_option('display.max_rows', None)

# Do not truncate column contents; show full strings
pd.set_option('display.max_colwidth', None)

# Set a large display width to prevent line wrapping
pd.set_option('display.width', 1000)
# Read the parquet file (the engine, e.g. pyarrow or fastparquet, is auto-detected)
df = pd.read_parquet("/workspace/0525_zyw/verl/counting/mk_data/v2_train_counting_dataset_OpenR1-Math-220k_90258.parquet", engine="auto")

print(df.shape)
# print(df.columns)
# # Inspect the first rows
# print(df.loc[0])  # or df.head(10) for the first 10 rows
# for i, row in df.iterrows():
#     print(f"Row {i}:\n{row['prompt']}\n---")
#     if i > 1:  # only print the first few rows
#         break
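Note: beyond df.shape, per-source counts and prompt-length statistics are a quick way to validate a file; a minimal sketch, assuming the data_source and prompt_token_len columns written by get_parquet.py:

print(df["data_source"].value_counts())   # records per source dataset
print(df["prompt_token_len"].describe())  # min/median/max prompt lengths
print(df.loc[0, "prompt"][:500])          # first 500 characters of one prompt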
long_prompts.json
ADDED
@@ -0,0 +1 @@
[]
merge_train.py
ADDED
@@ -0,0 +1,44 @@
import pandas as pd
import glob
import random

random_seed = 42
sample_size = 15000

# 1. Find all train parquet files
parquet_files = glob.glob("v2_train_counting_dataset_*.parquet")

selected_parquet_files = []
for parquet_file in parquet_files:
    if "v2_train_counting_dataset_OpenThoughts-114k-math_88120.parquet" not in parquet_file:
        selected_parquet_files.append(parquet_file)
print("Parquet files found:", selected_parquet_files)

# 2. Merge all data
all_data = []
for file in selected_parquet_files:
    print(file)
    df = pd.read_parquet(file)
    all_data.append(df)
df_all = pd.concat(all_data, ignore_index=True)
print("Total records after merging:", len(df_all))

# 3. Group by data_source and sample up to 15k per group
sampled_dfs = []
for name, group in df_all.groupby("data_source"):
    if len(group) > sample_size:
        sampled = group.sample(n=sample_size, random_state=random_seed)
    else:
        sampled = group
    sampled_dfs.append(sampled)
    print(f"{name}: {len(group)} original records, {len(sampled)} sampled")

# 4. Concatenate the sampled data
df_sampled = pd.concat(sampled_dfs, ignore_index=True)
print("Total records after sampling:", len(df_sampled))

shuffled_df = df_sampled.sample(frac=1, random_state=random_seed).reset_index(drop=True)

# 5. Save
shuffled_df.to_parquet("merged_sampled_4datasets_15k_each.parquet", index=False)
print("Saved to merged_sampled_4datasets_15k_each.parquet")
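Note: the merged file can be re-read to confirm the per-source caps; a minimal check, assuming the data_source column is present:

check = pd.read_parquet("merged_sampled_4datasets_15k_each.parquet")
print(len(check))                           # total records after sampling
print(check.groupby("data_source").size())  # should be at most 15000 per source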
v2_test_counting_dataset_Medical-R1-Distill-Data_1000.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:89bc5389f4e66b52170d2e0c9922171cb7f63a679e4520ab99209bafda362819
size 1079992
v2_test_counting_dataset_OpenR1-Math-220k_1000.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2454554feb69d07e233ef4a3f40f76d9bf537ed052e8bbc7416c420b4dc92b3d
size 952195
v2_test_counting_dataset_OpenThoughts-114k-Code_decontaminated_1000.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:beda857c76c38d5a06916b487bafb2cffb76cb6fc31b6fe89698a6f54f8bcf2d
size 2202832
v2_test_counting_dataset_OpenThoughts-114k-math_1000.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:90069eb9de435395355312e63597e014d3801b293e9bf5e2d9263ced6a67d19c
size 998292
v2_test_counting_dataset_reasoning-v1-20m_1000.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6b98f56c5005e65956ac406021f6e9a98744da8aaec748ddf4e7086970e3cdc2
size 1665071
v2_train_counting_dataset_Medical-R1-Distill-Data_21000.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c1c02b93013d57b2dd8c874a13a86e2804b9e25931903945dd93bf0606312fb
size 21995473
v2_train_counting_dataset_OpenR1-Math-220k_90258.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b940e63fcaed0d5eff15e40123e63b1dad99ccf9e37071c7d8a308e760e90db9
size 84122607
v2_train_counting_dataset_OpenThoughts-114k-Code_decontaminated_15372.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:18a52d5aa00a9140ca4c0c20a03f8ec7edac9ffc62eb02036ecef0c0c7e12eb1
size 33672180
v2_train_counting_dataset_OpenThoughts-114k-math_88120.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5a1b7889afc082e16c4e70524a67f99784b825e7a799cb9314ba1fae32688a2b
size 85576595
v2_train_counting_dataset_reasoning-v1-20m_100982.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7808bc50c3f2edf5d4d80aac04cdfb13820d1b5f65d85e17ae6f00d680ac4466
size 166027524