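"""Build token-count-prediction datasets.

For each configured source dataset, this script extracts (problem, reasoning, solution)
triples with the helpers below, builds a chat prompt that asks a model to predict how many
reasoning tokens were consumed to reach the solution, and writes fixed-seed train/test
splits to parquet files.
"""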
import random
from datasets import load_dataset
from transformers import AutoTokenizer
import re
from tqdm import tqdm
import pandas as pd
import json
import concurrent.futures

# ---------- Helper Functions ----------
def extract_think_and_rest(text):
    """提取 <think>...</think> 中的部分和剩余部分"""
    think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    last_think_end = 0
    for match in re.finditer(r"</think>", text):
        last_think_end = match.end()
    rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
    return think_blocks, rest_text

def extract_think_sections(text: str):
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        end_pos = think_match.end()
        post_think_content = text[end_pos:].strip()
        if not think_content:  # i.e. an empty <think></think> block
            raise ValueError("Empty think block")
        return [think_content], post_think_content
    else:
        raise ValueError("Missing <think> block.")

def extract_think_and_solution_V2(text: str):
    pattern = (
        r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
        r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        think_content = match.group(1).strip()
        post_think_content = match.group(2).strip()
        if not think_content:
            raise ValueError("Empty thought block in V2.")
        return [think_content], post_think_content
    else:
        raise ValueError("Missing required <|begin_of_thought|> or <|begin_of_solution|> blocks.")

# ---------- Worker function for multithreading ----------
def process_single_item(args):
    item, dataset_name, tokenizer, format_tokenizer, max_prompt_token_len_config = args
    try:
        if dataset_name == "OpenR1-Math-220k":
            problem = item["problem"].strip()
            response_full = item["generations"][0].strip()
            reasoning_blocks, answer = extract_think_sections(response_full)
        elif dataset_name == "OpenThoughts-114k-math":
            problem = item["problem"].strip()
            response_full = item["conversations"][1]["value"].strip()
            reasoning_blocks, answer = extract_think_and_solution_V2(response_full)
        elif dataset_name == "reasoning-v1-20m":
            problem = item.get("prompt", "").strip()
            response_full = item.get("response", "").strip()
            reasoning_blocks, answer = extract_think_sections(response_full)
        elif dataset_name == "OpenThoughts-114k-Code_decontaminated":
            problem = item["problem"].strip()
            # response_full = item.get("response", "").strip()
            # reasoning_blocks, answer = extract_think_sections(response_full)
            reasoning_blocks = [item["deepseek_reasoning"]]
            answer = item["deepseek_solution"]
        elif dataset_name == "Medical-R1-Distill-Data":
            problem = item["question"].strip()
            # response_full = item.get("response", "").strip()
            # reasoning_blocks, answer = extract_think_sections(response_full)
            reasoning_blocks = [item["reasoning (reasoning_content)"]]
            answer = item["response (content)"]
        else:
            return None

        if not reasoning_blocks or not reasoning_blocks[0]:
            return None

        reasoning = reasoning_blocks[0].strip()
        solution = answer.strip()

        input_token_count = len(tokenizer.tokenize(problem))
        output_token_count = len(tokenizer.tokenize(solution))
        reasoning_token_count = len(tokenizer.tokenize(reasoning))

        instruct_info = (
            "Given a <Problem> and its corresponding <Solution>, your task is to predict how many tokens are consumed in the process of arriving at the final <Solution> to the problem. Generally speaking, the more complex the problem is, the more tokens are required.\n"
            f"<Problem>\n{problem}\n</Problem>\n\n"
            f"<Solution>\n{solution}\n</Solution>\n"
            f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
            "Please provide a detailed chain-of-thought reasoning process and include your thought process within <think> tags. "
            "Your final answer should be enclosed within <answer> tags.\n\n"
            "Please return the predicted number of tokens in JSON format: \n```json\n{\"count\": int}\n```\n\n"
            "Example format:\n"
            "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited by 1024 tokens] </think>\n"
            "<answer> Summary of the thought process leading to the final token count and your predicted token count in json format: \n```json\n{\"count\": int}\n```\n [Limited by 512 tokens]\n"
            "</answer>\n\n"
            "Let me solve this step by step.\n"
        )

        cot_info = "<think>"

        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": instruct_info.strip()},
        ]

        prompt = format_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompt += cot_info

        prompt_token_len = len(format_tokenizer.tokenize(prompt))

        if prompt_token_len <= max_prompt_token_len_config - 10:
            return {
                "prompt": prompt,
                "ground_truth": reasoning_token_count,
                "data_source": dataset_name,
                "prompt_token_len": prompt_token_len,
            }
        return None
    except ValueError:
        return None
    except Exception as e:
        print(f"Error processing item for {dataset_name}: {e}")
        return None
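
# On success, process_single_item returns a record like the following (values illustrative):
#   {"prompt": "<|im_start|>system ... <think>", "ground_truth": 1842,
#    "data_source": "OpenThoughts-114k-Code_decontaminated", "prompt_token_len": 913}
# where "ground_truth" is the DeepSeek-R1 token count of the reasoning trace. The "ids"
# field is attached later, after all records for a dataset have been collected.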

# ---------- Parameter settings ----------
# train_size = 100000  # previously used to size the raw sample; no longer needed
test_size = 1000      # kept for reference; the actual test split size is set by target_test_set_size below
max_prompt_token_len = 4096
random_seed = 42      # fixed random seed for reproducibility
NUM_THREADS = 16

# Seed Python's `random` module so that `random.sample` (used for the train/test split)
# is reproducible.
random.seed(random_seed)

# ---------- Load and shuffle datasets (the fixed seed is passed to dataset.shuffle) ----------
datasets_config = {
    # "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
    # "reasoning-v1-20m": "/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data",
    # "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
    "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
    # "Medical-R1-Distill-Data": "/workspace/0407_nips/data_preprocess/Medical-R1-Distill-Data"
}

for name, path in datasets_config.items():
    print(f"Processing dataset: {name}")
    try:
        dataset_hf = load_dataset(path, trust_remote_code=True)["train"]
    except Exception as e:
        print(f"Error loading dataset {name} from {path}: {e}")
        continue
    print(len(dataset_hf))
    # num_items_to_sample_raw = train_size + test_size + 1000
    
    # actual_num_to_sample = min(num_items_to_sample_raw, len(dataset_hf))
    # if actual_num_to_sample < num_items_to_sample_raw:
    #     print(f"Warning: Dataset {name} has only {len(dataset_hf)} items. Sampling {actual_num_to_sample} instead of {num_items_to_sample_raw}.")

    # if actual_num_to_sample == 0:
    #     print(f"Skipping dataset {name} as it has no items or actual_num_to_sample is 0.")
    #     continue

    # # Shuffling raw dataset with the fixed seed
    # dataset_selected = dataset_hf.shuffle(seed=random_seed).select(range(actual_num_to_sample))
    # .select(range(102000))
    dataset_selected = dataset_hf.shuffle(seed=random_seed)

    # The counting tokenizer (DeepSeek-R1) measures problem/solution/reasoning lengths;
    # the format tokenizer (Qwen2.5-0.5B-Instruct) builds the chat prompt. They could be
    # loaded once outside the dataset loop, but reloading per dataset is harmless.
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
    format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    processed_item_data_list = []
    tasks_args_list = []

    print(f"Preparing tasks for {name}...")
    for item in dataset_selected:
        tasks_args_list.append((item, name, tokenizer, format_tokenizer, max_prompt_token_len))

    print(f"Submitting {len(tasks_args_list)} tasks to thread pool for {name}...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        future_to_item_args = {executor.submit(process_single_item, args): args for args in tasks_args_list}
        
        for future in tqdm(concurrent.futures.as_completed(future_to_item_args), total=len(tasks_args_list), desc=f"Processing items for {name}"):
            try:
                result = future.result()
                if result:
                    processed_item_data_list.append(result)
            except Exception as exc:
                # item_arg_tuple = future_to_item_args[future] # Uncomment if needed for debugging
                print(f'Item generated an exception during future.result(): {exc}')


    records = []
    count = 0
    print(f"Collected {len(processed_item_data_list)} valid processed items for {name}. Assigning IDs...")
    # An earlier version capped the number of records via `num_items_to_sample_raw`;
    # that cap is disabled below, so every valid record is kept for splitting.
    for item_data in processed_item_data_list:
        # if len(records) >= num_items_to_sample_raw:
        #     break
        item_data["ids"] = f"{name}_{count}"
        records.append(item_data)
        count += 1
    
    if not records:
        print(f"No valid records generated for dataset {name} after filtering and ID assignment. Skipping saving.")
        continue

    # ---------- Split into train and test sets ----------
    # The test set is fixed at 1000 records (or fewer if not enough data are available);
    # the training set is everything else. `random_seed` has already been applied to the
    # global `random` module, so the split is reproducible.

    target_test_set_size = 1000  # fixed test set size
    num_available_records = len(records)
    
    train_records = []
    test_records = []

    if num_available_records == 0:
        print(f"No records available for splitting for {name}.")
    elif num_available_records <= target_test_set_size:
        # If we have 1000 or fewer records, all go to test set, train is empty
        print(f"Warning: Only {num_available_records} records available for {name}. All will be used for the test set.")
        test_records = list(records) # Make a copy
        train_records = []
    else:
        # We have more than 1000 records. Sample 1000 for test set.
        # `random.sample` uses the seed set by `random.seed(random_seed)`
        test_indices = sorted(random.sample(range(num_available_records), target_test_set_size))
        
        current_test_idx_ptr = 0
        for i in range(num_available_records):
            if current_test_idx_ptr < len(test_indices) and i == test_indices[current_test_idx_ptr]:
                test_records.append(records[i])
                current_test_idx_ptr += 1
            else:
                train_records.append(records[i])
        
        # Sanity check
        if len(test_records) != target_test_set_size:
             print(f"Error: Test set size mismatch. Expected {target_test_set_size}, got {len(test_records)}")
        if len(train_records) != num_available_records - target_test_set_size:
             print(f"Error: Train set size mismatch. Expected {num_available_records - target_test_set_size}, got {len(train_records)}")


    # ---------- Save train/test splits to parquet ----------
    if train_records:
        df_train = pd.DataFrame(train_records)
        df_train.to_parquet(f"v2_train_counting_dataset_{name}_{len(df_train)}.parquet", index=False)
    else:
        print(f"No training records to save for {name}.")

    if test_records:
        df_test = pd.DataFrame(test_records)
        df_test.to_parquet(f"v2_test_counting_dataset_{name}_{len(df_test)}.parquet", index=False)
    else:
        print(f"No test records to save for {name}.")
        
    print(f"✅ Successfully processed dataset {name}")
    print(f"  Saved Train samples: {len(train_records)}, Test samples: {len(test_records)}")
    print("-" * 30)

print("All datasets processed.")