import re

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer


def extract_think_and_rest(text):
    """Extract the contents of all <think>...</think> blocks and the text after the last block."""
    think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    last_think_end = 0
    for match in re.finditer(r"</think>", text):
        last_think_end = match.end()
    rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
    return think_blocks, rest_text
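
# Example behavior of extract_think_and_rest (hypothetical input):
#   extract_think_and_rest("<think>a</think><think>b</think>Answer: 42")
#   -> (["a", "b"], "Answer: 42")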


def extract_think_sections(text: str):
    """Split a response into the first <think>...</think> block and the text after it."""
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        post_think_content = text[think_match.end():].strip()
        return think_content, post_think_content
    # No <think> block: return None so callers can skip the sample instead of crashing.
    return None, text.strip()
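
# Example behavior of extract_think_sections (hypothetical input):
#   extract_think_sections("<think>reasoning...</think>The answer is 7.")
#   -> ("reasoning...", "The answer is 7.")
#   extract_think_sections("no tags")  ->  (None, "no tags")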


def extract_think_and_solution_V2(text: str):
    """Split a response delimited by <|begin_of_thought|> / <|begin_of_solution|> markers."""
    pattern = (
        r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
        r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        think_content = match.group(1).strip()
        post_think_content = match.group(2).strip()
        return think_content, post_think_content
    # Missing markers: return None so callers can skip the sample instead of crashing.
    return None, text.strip()
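
# Example behavior of extract_think_and_solution_V2 (hypothetical input):
#   extract_think_and_solution_V2(
#       "<|begin_of_thought|>steps<|end_of_thought|>"
#       "<|begin_of_solution|>x = 3<|end_of_solution|>")
#   -> ("steps", "x = 3")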


train_size = 110000
test_size = 1000
max_prompt_token_len = 2048
random_seed = 42
num_workers = 16  # reserved; not used below

datasets_config = {
    "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
    "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
}
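# NOTE: the paths above are local snapshots; point them at your own copies
# (or at the hub dataset IDs) before running.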

for name, path in datasets_config.items():
    print(f"{name}")
    dataset = load_dataset(path)["train"]

    # Oversample by 1000 beyond train+test so that samples dropped by the
    # parsing and length filters below can be backfilled.
    total_size = train_size + test_size + 1000
    dataset = dataset.shuffle(seed=random_seed).select(range(total_size))

    # `tokenizer` (DeepSeek-R1) measures problem/solution/reasoning lengths in
    # the reasoning model's vocabulary; `format_tokenizer` (Qwen) measures the
    # final formatted prompt for length filtering.
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
    format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    records = []
    count = 0

    for item in tqdm(dataset, desc="Processing samples"):
        if len(records) >= total_size:
            break

        if name == "OpenR1-Math-220k":
            problem = item["problem"].strip()
            response_full = item["generations"][0].strip()
            reasoning, answer = extract_think_sections(response_full)
        elif name == "OpenThoughts-114k-math":
            problem = item["problem"].strip()
            response_full = item["conversations"][1]["value"].strip()
            reasoning, answer = extract_think_and_solution_V2(response_full)
        elif name == "reasoning-v1-20m":
            problem = item.get("prompt", "").strip()
            response_full = item.get("response", "").strip()
            reasoning, answer = extract_think_sections(response_full)
        else:
            # Unknown dataset: nothing to parse.
            continue

        # Skip samples whose reasoning block could not be parsed.
        if reasoning is None:
            continue
        solution = answer.strip()

        input_token_count = len(tokenizer.tokenize(problem))
        output_token_count = len(tokenizer.tokenize(solution))
        reasoning_token_count = len(tokenizer.tokenize(reasoning))

        instruct_info = (
            "Your task is to estimate how many tokens a reasoning model would use to solve the following Problem and Solution.\n"
            "Please return the predicted number of tokens in JSON format: ```json\n{\"count\": int}\n```\n\n"
            f"<Problem>\n{problem}\n</Problem>\n\n"
            f"<Solution>\n{solution}\n</Solution>\n"
            f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
            "Please provide a detailed chain-of-thought reasoning process and include your thought process within `<think>` tags. "
            "Your final answer should be enclosed within `<answer>` tags.\n\n"
            "Ensure that your counting logic is sound and that your explanation is token-efficient.\n\n"
            "Example format:\n"
            "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited by 1024 tokens] </think>\n"
            "<answer> Summary of the thought process leading to the final token count and your predicted token count in json format: ```json\n{\"count\": int}\n``` [Limited by 512 tokens]\n"
            "</answer>\n\n"
        )
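
        # The prompt reveals the problem, the solution, and their token counts;
        # the model must predict the length of the hidden reasoning trace.
        # `reasoning_token_count` above becomes the `ground_truth` label below.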

        cot_info = "Let me solve this step by step.\n"

        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": instruct_info.strip()},
        ]

        # Render with the Qwen chat template (the prompt targets the Qwen model,
        # matching the system message and the length filter below) and pre-fill
        # the assistant turn so generation continues from cot_info.
        prompt = format_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompt += cot_info

        prompt_token_len = len(format_tokenizer.tokenize(prompt))
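
        # Filter with ~10 tokens of headroom below max_prompt_token_len as a
        # safety margin.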
        if prompt_token_len <= max_prompt_token_len - 10:
            records.append({
                "prompt": prompt,
                "ground_truth": reasoning_token_count,
                "data_source": name,
                "ids": f"{name}_{count}",
                "prompt_token_len": prompt_token_len,
            })
            count += 1

        if len(records) >= total_size:
            break

    # The first train_size records become the training split, the next
    # test_size the test split; any remainder is the unused oversampling buffer.
    train_records = records[:train_size]
    test_records = records[train_size:train_size + test_size]

    df_train = pd.DataFrame(train_records)
    df_test = pd.DataFrame(test_records)

    df_train.to_parquet(f"train_counting_dataset_{name}_{train_size}.parquet", index=False)
    df_test.to_parquet(f"test_counting_dataset_{name}_{test_size}.parquet", index=False)
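
    # Optional sanity check: reload a shard and eyeball one record.
    # df = pd.read_parquet(f"train_counting_dataset_{name}_{train_size}.parquet")
    # print(df.iloc[0]["ids"], df.iloc[0]["ground_truth"])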

    print("✅ Successfully generated training and test datasets")
    print(f"Train samples: {len(df_train)}, Test samples: {len(df_test)}")
|