import random
import re
import json

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer


def extract_think_and_rest(text):
    """Extract all <think>...</think> blocks and the text after the last one."""
    think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    last_think_end = 0
    for match in re.finditer(r"</think>", text):
        last_think_end = match.end()
    rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
    return think_blocks, rest_text


def extract_think_sections(text: str):
    """Extract the first <think>...</think> block and the text that follows it."""
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        post_think_content = text[think_match.end():].strip()
        return think_content, post_think_content
    raise ValueError("Missing required <think>...</think> block.")


def extract_think_and_solution_V2(text: str):
    """Extract the thought and solution blocks from OpenThoughts-style generations."""
    pattern = (
        r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
        r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return match.group(1).strip(), match.group(2).strip()
    raise ValueError("Missing required <|begin_of_thought|> or <|begin_of_solution|> blocks.")


# ---------- Parameters ----------
train_size = 110000  # number of training samples
test_size = 1000     # number of test samples
max_prompt_token_len = 2048
random_seed = 42
num_workers = 16

# ---------- Datasets to load and shuffle ----------
datasets_config = {
    "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
    # "reasoning-v1-20m": "/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data",
    "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
    # "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
    # "Medical-R1-Distill-Data": "/workspace/0407_nips/data_preprocess/Medical-R1-Distill-Data",
}

# ---------- Initialize tokenizers ----------
# `tokenizer` counts tokens as the reasoning model sees them;
# `format_tokenizer` builds and measures the chat prompt for the Qwen model.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

for name, path in datasets_config.items():
    print(f"{name}")
    dataset = load_dataset(path)["train"]
    total_size = train_size + test_size + 1000  # extra buffer for samples filtered out below
    dataset = dataset.shuffle(seed=random_seed).select(range(total_size))

    records = []
    count = 0

    # ---------- Process samples ----------
    for item in tqdm(dataset, desc="Processing samples"):
        if len(records) >= total_size:
            break

        try:
            if name == "OpenR1-Math-220k":
                problem = item["problem"].strip()
                response_full = item["generations"][0].strip()
                reasoning, solution = extract_think_sections(response_full)
            elif name == "OpenThoughts-114k-math":
                problem = item["problem"].strip()
                response_full = item["conversations"][1]["value"].strip()
                reasoning, solution = extract_think_and_solution_V2(response_full)
            elif name == "reasoning-v1-20m":
                problem = item.get("prompt", "").strip()
                response_full = item.get("response", "").strip()
                reasoning, solution = extract_think_sections(response_full)
            else:
                continue
        except ValueError:
            # Skip samples whose generations lack the expected tag structure.
            continue
        if not reasoning:
            continue

        input_token_count = len(tokenizer.tokenize(problem))
        output_token_count = len(tokenizer.tokenize(solution))
        reasoning_token_count = len(tokenizer.tokenize(reasoning))

        instruct_info = (
            "Your task is to estimate how many tokens a reasoning model would use to solve the following Problem and Solution.\n"
            "Please return the predicted number of tokens in JSON format: ```json\n{\"count\": int}\n```\n\n"
            f"<Problem>\n{problem}\n</Problem>\n\n"
            f"<Solution>\n{solution}\n</Solution>\n\n"
            f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
            "Please provide a detailed chain-of-thought reasoning process and include your thought process within `<think>` tags. "
            "Your final answer should be enclosed within `<answer>` tags.\n\n"
            "Ensure that your counting logic is sound and that your explanation is token-efficient.\n\n"
            "Example format:\n"
            "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited by 1024 tokens] </think>\n"
            "<answer> Summary of the thought process leading to the final token count and your predicted token count in json format: ```json\n{\"count\": int}\n``` [Limited by 512 tokens] </answer>\n\n"
        )
        cot_info = "Let me solve this step by step.\n"

        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": instruct_info.strip()},
        ]
        # Build the prompt with the same tokenizer that measures its length,
        # so the budget check below matches what the Qwen model will see.
        prompt = format_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompt += cot_info
        prompt_token_len = len(format_tokenizer.tokenize(prompt))

        if prompt_token_len <= max_prompt_token_len - 10:
            records.append({
                "prompt": prompt,
                "ground_truth": reasoning_token_count,
                "data_source": name,
                "ids": f"{name}_{count}",
                "prompt_token_len": prompt_token_len,
            })
            count += 1

    # ---------- Split into train and test sets ----------
    train_records = records[:train_size]
    test_records = records[train_size:train_size + test_size]

    # ---------- Save ----------
    df_train = pd.DataFrame(train_records)
    df_test = pd.DataFrame(test_records)
    df_train.to_parquet(f"train_counting_dataset_{name}_{train_size}.parquet", index=False)
    df_test.to_parquet(f"test_counting_dataset_{name}_{test_size}.parquet", index=False)

    print("✅ Successfully generated train and test datasets")
    print(f"Train samples: {len(df_train)}, Test samples: {len(df_test)}")
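

# ---------- Optional sanity check (illustrative sketch, not part of the original pipeline) ----------
# A minimal example of reloading the generated parquet files and re-checking the
# invariants enforced above. The dataset name and sizes below are assumptions;
# adjust them to match the run you want to inspect. This helper is defined but
# never called, so it does not change the script's behavior.
def sanity_check(name="OpenR1-Math-220k", train_size=110000, test_size=1000,
                 max_prompt_token_len=2048):
    df_train = pd.read_parquet(f"train_counting_dataset_{name}_{train_size}.parquet")
    df_test = pd.read_parquet(f"test_counting_dataset_{name}_{test_size}.parquet")
    # Every record should respect the prompt budget enforced during generation.
    assert (df_train["prompt_token_len"] <= max_prompt_token_len - 10).all()
    # "ground_truth" is the reasoning-token count the model is trained to predict.
    print(df_train["ground_truth"].describe())
    print(df_test[["data_source", "ids", "prompt_token_len"]].head())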