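"""Build token-count prediction datasets from reasoning-model generations.

For each configured source dataset, this script extracts the chain-of-thought
("think") segment and the final solution from each sample, builds a prompt
asking a model to predict the reasoning token count, and saves train/test
splits as parquet files.
"""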
import re

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer

def extract_think_and_rest(text):
    """Extract all <think>...</think> blocks and the text after the last one."""
    think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    last_think_end = 0
    for match in re.finditer(r"</think>", text):
        last_think_end = match.end()
    rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
    return think_blocks, rest_text
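
# A quick usage sketch (assumed tag format):
#   extract_think_and_rest("<think>a</think><think>b</think>done")
#   -> (["a", "b"], "done")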

def extract_think_sections(text: str):
    """Extract the first <think>...</think> block and the text that follows it."""
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        post_think_content = text[think_match.end():].strip()
    else:
        # No <think> block: return None so callers can skip the sample.
        think_content = None
        post_think_content = text.strip()
    return think_content, post_think_content
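
# A quick usage sketch:
#   extract_think_sections("<think>plan the steps</think>The answer is 5.")
#   -> ("plan the steps", "The answer is 5.")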


def extract_think_and_solution_V2(text: str):
    """Extract the thought and solution bodies from OpenThoughts-style markup."""
    pattern = (
        r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
        r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        think_content = match.group(1).strip()
        post_think_content = match.group(2).strip()
    else:
        # Missing <|begin_of_thought|>/<|begin_of_solution|> blocks:
        # return None so callers can skip the sample.
        think_content = None
        post_think_content = text.strip()
    return think_content, post_think_content
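
# A quick usage sketch:
#   extract_think_and_solution_V2(
#       "<|begin_of_thought|>plan<|end_of_thought|>"
#       "<|begin_of_solution|>answer<|end_of_solution|>")
#   -> ("plan", "answer")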


# ---------- Parameters ----------
train_size = 110000  # number of training samples
test_size = 1000     # number of test samples
max_prompt_token_len = 2048
random_seed = 42
num_workers = 16
# ---------- Load and shuffle datasets ----------

# Local dataset directories; load_dataset() reads the data files inside each.
datasets_config = {
    "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
    # "reasoning-v1-20m": "/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data",
    "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
    # "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
    # "Medical-R1-Distill-Data": "/workspace/0407_nips/data_preprocess/Medical-R1-Distill-Data"
}

for name, path in datasets_config.items():
    print(f"{name}")
    dataset = load_dataset(path)["train"]

    # Select a buffer of extra samples to offset records filtered out below.
    total_size = train_size + test_size + 1000
    dataset = dataset.shuffle(seed=random_seed).select(range(total_size))

    # ---------- Initialize tokenizers ----------
    # `tokenizer` measures problem/solution/reasoning token counts;
    # `format_tokenizer` measures the final formatted prompt length.
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
    format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    records = []
    count = 0

    # ---------- Process samples ----------
    for item in tqdm(dataset, desc="Processing samples"):
        # Stop once enough records have been collected for both splits.
        if len(records) >= train_size + test_size:
            break
        if name == "OpenR1-Math-220k":
            problem = item["problem"].strip()
            response_full = item["generations"][0].strip()

            # extract_think_sections returns (reasoning_str, solution_str),
            # already stripped; reasoning is None when no <think> block exists.
            reasoning, solution = extract_think_sections(response_full)
            if not reasoning:
                continue
        elif name == "OpenThoughts-114k-math":
            problem = item["problem"].strip()
            response_full = item["conversations"][1]["value"].strip()

            reasoning, solution = extract_think_and_solution_V2(response_full)
            if not reasoning:
                continue
        elif name == "reasoning-v1-20m":
            problem = item.get("prompt", "").strip()
            response_full = item.get("response", "").strip()

            reasoning, solution = extract_think_sections(response_full)
            if not reasoning:
                continue
        else:
            # Skip any dataset without a dedicated parsing branch.
            continue

        input_token_count = len(tokenizer.tokenize(problem))
        output_token_count = len(tokenizer.tokenize(solution))
        reasoning_token_count = len(tokenizer.tokenize(reasoning))

        instruct_info = (
            "Your task is to estimate how many tokens a reasoning model would use to solve the following Problem and produce the given Solution.\n"
            "Please return the predicted number of tokens in JSON format: ```json\n{\"count\": int}\n```\n\n"
            f"<Problem>\n{problem}\n</Problem>\n\n"
            f"<Solution>\n{solution}\n</Solution>\n"
            f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
            "Please provide a detailed chain-of-thought reasoning process and include your thought process within `<think>` tags. "
            "Your final answer should be enclosed within `<answer>` tags.\n\n"
            "Ensure that your counting logic is sound and that your explanation is token-efficient.\n\n"
            "Example format:\n"
            "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited to 1024 tokens] </think>\n"
            "<answer> Summary of the thought process leading to the final token count, and your predicted token count in JSON format: ```json\n{\"count\": int}\n``` [Limited to 512 tokens]\n"
            "</answer>\n\n"
        )

        cot_info = "Let me solve this step by step.\n"

        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": instruct_info.strip()},
        ]

        prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompt += cot_info

        prompt_token_len = len(format_tokenizer.tokenize(prompt))

        # Keep records whose prompt leaves a small margin under the length budget.
        if prompt_token_len <= max_prompt_token_len - 10:
            records.append({
                "prompt": prompt,
                "ground_truth": reasoning_token_count,
                "data_source": name,
                "ids": f"{name}_{count}",
                "prompt_token_len": prompt_token_len,
            })
            count += 1

        if len(records) >= train_size + test_size:
            break

    # ---------- Split into train and test sets ----------
    train_records = records[:train_size]
    test_records = records[train_size:train_size + test_size]

    # ---------- Save ----------
    df_train = pd.DataFrame(train_records)
    df_test = pd.DataFrame(test_records)

    df_train.to_parquet(f"train_counting_dataset_{name}_{train_size}.parquet", index=False)
    df_test.to_parquet(f"test_counting_dataset_{name}_{test_size}.parquet", index=False)

    print("✅ 成功生成训练和测试数据集")
    print(f"Train samples: {len(df_train)}, Test samples: {len(df_test)}")