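"""Build token-count prediction datasets from reasoning-model generations.

For each configured source dataset, this script extracts the chain-of-thought
("think") segment and the final solution from each sample, builds a prompt
asking a model to predict the reasoning token count, and saves train/test
splits as parquet files.
"""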
import re

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer

def extract_think_and_rest(text):
    """Extract all <think>...</think> blocks and the text after the last one."""
    think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    last_think_end = 0
    for match in re.finditer(r"</think>", text):
        last_think_end = match.end()
    rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
    return think_blocks, rest_text
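
# A quick usage sketch (assumed tag format):
#   extract_think_and_rest("<think>a</think><think>b</think>done")
#   -> (["a", "b"], "done")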

def extract_think_sections(text: str):
    """Extract the first <think>...</think> block and the text that follows it."""
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        post_think_content = text[think_match.end():].strip()
    else:
        # No <think> block: return None so callers can skip the sample.
        think_content = None
        post_think_content = text.strip()
    return think_content, post_think_content
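
# A quick usage sketch:
#   extract_think_sections("<think>plan the steps</think>The answer is 5.")
#   -> ("plan the steps", "The answer is 5.")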


def extract_think_and_solution_V2(text: str):
    """Extract the thought and solution bodies from OpenThoughts-style markup."""
    pattern = (
        r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
        r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        think_content = match.group(1).strip()
        post_think_content = match.group(2).strip()
    else:
        # Missing <|begin_of_thought|>/<|begin_of_solution|> blocks:
        # return None so callers can skip the sample.
        think_content = None
        post_think_content = text.strip()
    return think_content, post_think_content
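
# A quick usage sketch:
#   extract_think_and_solution_V2(
#       "<|begin_of_thought|>plan<|end_of_thought|>"
#       "<|begin_of_solution|>answer<|end_of_solution|>")
#   -> ("plan", "answer")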


# ---------- Parameters ----------
train_size = 110000  # number of training samples
test_size = 1000     # number of test samples
max_prompt_token_len = 2048
random_seed = 42
num_workers = 16
# ---------- Load and shuffle datasets ----------

# Local dataset directories; load_dataset() reads the data files inside each.
datasets_config = {
    "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
    # "reasoning-v1-20m": "/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data",
    "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
    # "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
    # "Medical-R1-Distill-Data": "/workspace/0407_nips/data_preprocess/Medical-R1-Distill-Data"
}

for name, path in datasets_config.items():
    print(f"{name}")
    dataset = load_dataset(path)["train"]

    # Select a buffer of extra samples to offset records filtered out below.
    total_size = train_size + test_size + 1000
    dataset = dataset.shuffle(seed=random_seed).select(range(total_size))

    # ---------- Initialize tokenizers ----------
    # `tokenizer` measures problem/solution/reasoning token counts;
    # `format_tokenizer` measures the final formatted prompt length.
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
    format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    records = []
    count = 0

    # ---------- Process samples ----------
    for item in tqdm(dataset, desc="Processing samples"):
        # Stop once enough records have been collected for both splits.
        if len(records) >= train_size + test_size:
            break
        if name == "OpenR1-Math-220k":
            problem = item["problem"].strip()
            response_full = item["generations"][0].strip()

            # extract_think_sections returns (reasoning_str, solution_str),
            # already stripped; reasoning is None when no <think> block exists.
            reasoning, solution = extract_think_sections(response_full)
            if not reasoning:
                continue
        elif name == "OpenThoughts-114k-math":
            problem = item["problem"].strip()
            response_full = item["conversations"][1]["value"].strip()

            reasoning, solution = extract_think_and_solution_V2(response_full)
            if not reasoning:
                continue
        elif name == "reasoning-v1-20m":
            problem = item.get("prompt", "").strip()
            response_full = item.get("response", "").strip()

            reasoning, solution = extract_think_sections(response_full)
            if not reasoning:
                continue
        else:
            # Skip any dataset without a dedicated parsing branch.
            continue

        input_token_count = len(tokenizer.tokenize(problem))
        output_token_count = len(tokenizer.tokenize(solution))
        reasoning_token_count = len(tokenizer.tokenize(reasoning))

        instruct_info = (
            "Your task is to estimate how many tokens a reasoning model would use to solve the following Problem and produce the given Solution.\n"
            "Please return the predicted number of tokens in JSON format: ```json\n{\"count\": int}\n```\n\n"
            f"<Problem>\n{problem}\n</Problem>\n\n"
            f"<Solution>\n{solution}\n</Solution>\n"
            f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
            "Please provide a detailed chain-of-thought reasoning process and include your thought process within `<think>` tags. "
            "Your final answer should be enclosed within `<answer>` tags.\n\n"
            "Ensure that your counting logic is sound and that your explanation is token-efficient.\n\n"
            "Example format:\n"
            "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited to 1024 tokens] </think>\n"
            "<answer> Summary of the thought process leading to the final token count, and your predicted token count in JSON format: ```json\n{\"count\": int}\n``` [Limited to 512 tokens]\n"
            "</answer>\n\n"
        )

        cot_info = "Let me solve this step by step.\n"

        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": instruct_info.strip()},
        ]

        prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompt += cot_info

        prompt_token_len = len(format_tokenizer.tokenize(prompt))

        # Keep records whose prompt leaves a small margin under the length budget.
        if prompt_token_len <= max_prompt_token_len - 10:
            records.append({
                "prompt": prompt,
                "ground_truth": reasoning_token_count,
                "data_source": name,
                "ids": f"{name}_{count}",
                "prompt_token_len": prompt_token_len,
            })
            count += 1

        if len(records) >= train_size + test_size:
            break

    # ---------- Split into train and test sets ----------
    train_records = records[:train_size]
    test_records = records[train_size:train_size + test_size]

    # ---------- Save ----------
    df_train = pd.DataFrame(train_records)
    df_test = pd.DataFrame(test_records)

    df_train.to_parquet(f"train_counting_dataset_{name}_{train_size}.parquet", index=False)
    df_test.to_parquet(f"test_counting_dataset_{name}_{test_size}.parquet", index=False)

    print("✅ 成功生成训练和测试数据集")
    print(f"Train samples: {len(df_train)}, Test samples: {len(df_test)}")