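"""Build token-count-prediction datasets.

For each configured source dataset, this script extracts (problem, reasoning, solution)
triples with the helpers below, builds a chat prompt that asks a model to predict how many
reasoning tokens were consumed to reach the solution, and writes fixed-seed train/test
splits to parquet files.
"""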
import random
from datasets import load_dataset
from transformers import AutoTokenizer
import re
from tqdm import tqdm
import pandas as pd
import json
import concurrent.futures

# ---------- Helper Functions ----------
def extract_think_and_rest(text):
    """提取 <think>...</think> 中的部分和剩余部分"""
    think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    last_think_end = 0
    for match in re.finditer(r"</think>", text):
        last_think_end = match.end()
    rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
    return think_blocks, rest_text

def extract_think_sections(text: str):
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        end_pos = think_match.end()
        post_think_content = text[end_pos:].strip()
        if not think_content:  # i.e. an empty <think></think> block
            raise ValueError("Empty think block")
        return [think_content], post_think_content
    else:
        raise ValueError("Missing <think> block.")

def extract_think_and_solution_V2(text: str):
    pattern = (
        r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
        r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        think_content = match.group(1).strip()
        post_think_content = match.group(2).strip()
        if not think_content:
            raise ValueError("Empty thought block in V2.")
        return [think_content], post_think_content
    else:
        raise ValueError("Missing required <|begin_of_thought|> or <|begin_of_solution|> blocks.")

# ---------- Worker function for multithreading ----------
def process_single_item(args):
    item, dataset_name, tokenizer, format_tokenizer, max_prompt_token_len_config = args
    try:
        if dataset_name == "OpenR1-Math-220k":
            problem = item["problem"].strip()
            response_full = item["generations"][0].strip()
            reasoning_blocks, answer = extract_think_sections(response_full)
        elif dataset_name == "OpenThoughts-114k-math":
            problem = item["problem"].strip()
            response_full = item["conversations"][1]["value"].strip()
            reasoning_blocks, answer = extract_think_and_solution_V2(response_full)
        elif dataset_name == "reasoning-v1-20m":
            problem = item.get("prompt", "").strip()
            response_full = item.get("response", "").strip()
            reasoning_blocks, answer = extract_think_sections(response_full)
        elif dataset_name == "OpenThoughts-114k-Code_decontaminated":
            problem = item["problem"].strip()
            # response_full = item.get("response", "").strip()
            # reasoning_blocks, answer = extract_think_sections(response_full)
            reasoning_blocks = [item["deepseek_reasoning"]]
            answer = item["deepseek_solution"]
        elif dataset_name == "Medical-R1-Distill-Data":
            problem = item["question"].strip()
            # response_full = item.get("response", "").strip()
            # reasoning_blocks, answer = extract_think_sections(response_full)
            reasoning_blocks = [item["reasoning (reasoning_content)"]]
            answer = item["response (content)"]
        else:
            return None

        if not reasoning_blocks or not reasoning_blocks[0]:
            return None

        reasoning = reasoning_blocks[0].strip()
        solution = answer.strip()

        input_token_count = len(tokenizer.tokenize(problem))
        output_token_count = len(tokenizer.tokenize(solution))
        reasoning_token_count = len(tokenizer.tokenize(reasoning))

        instruct_info = (
            "Given a <Problem> and its corresponding <Solution>, your task is to predict how many tokens are consumed in the process of arriving at the final <Solution> to the problem. Generally speaking, the more complex the problem is, the more tokens are required.\n"
            f"<Problem>\n{problem}\n</Problem>\n\n"
            f"<Solution>\n{solution}\n</Solution>\n"
            f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
            "Please provide a detailed chain-of-thought reasoning process and include your thought process within <think> tags. "
            "Your final answer should be enclosed within <answer> tags.\n\n"
            "Please return the predicted number of tokens in JSON format: \n```json\n{\"count\": int}\n```\n\n"
            "Example format:\n"
            "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited by 1024 tokens] </think>\n"
            "<answer> Summary of the thought process leading to the final token count and your predicted token count in json format: \n```json\n{\"count\": int}\n```\n [Limited by 512 tokens]\n"
            "</answer>\n\n"
            "Let me solve this step by step.\n"
        )

        cot_info = "<think>"

        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": instruct_info.strip()},
        ]

        prompt = format_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompt += cot_info

        prompt_token_len = len(format_tokenizer.tokenize(prompt))

        if prompt_token_len <= max_prompt_token_len_config - 10:
            return {
                "prompt": prompt,
                "ground_truth": reasoning_token_count,
                "data_source": dataset_name,
                "prompt_token_len": prompt_token_len,
            }
        return None
    except ValueError:
        return None
    except Exception as e:
        print(f"Error processing item for {dataset_name}: {e}")
        return None
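
# On success, process_single_item returns a record like the following (values illustrative):
#   {"prompt": "<|im_start|>system ... <think>", "ground_truth": 1842,
#    "data_source": "OpenThoughts-114k-Code_decontaminated", "prompt_token_len": 913}
# where "ground_truth" is the DeepSeek-R1 token count of the reasoning trace. The "ids"
# field is attached later, after all records for a dataset have been collected.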

# ---------- Parameter settings ----------
# train_size = 100000  # previously used to size the raw sample; no longer needed
test_size = 1000      # kept for reference; the actual test split size is set by target_test_set_size below
max_prompt_token_len = 4096
random_seed = 42      # fixed random seed for reproducibility
NUM_THREADS = 16

# Seed Python's `random` module so that `random.sample` (used for the train/test split)
# is reproducible.
random.seed(random_seed)

# ---------- Load and shuffle datasets (the fixed seed is passed to dataset.shuffle) ----------
datasets_config = {
    # "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
    # "reasoning-v1-20m": "/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data",
    # "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
    "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
    # "Medical-R1-Distill-Data": "/workspace/0407_nips/data_preprocess/Medical-R1-Distill-Data"
}

for name, path in datasets_config.items():
    print(f"Processing dataset: {name}")
    try:
        dataset_hf = load_dataset(path, trust_remote_code=True)["train"]
    except Exception as e:
        print(f"Error loading dataset {name} from {path}: {e}")
        continue
    print(len(dataset_hf))
    # num_items_to_sample_raw = train_size + test_size + 1000
    
    # actual_num_to_sample = min(num_items_to_sample_raw, len(dataset_hf))
    # if actual_num_to_sample < num_items_to_sample_raw:
    #     print(f"Warning: Dataset {name} has only {len(dataset_hf)} items. Sampling {actual_num_to_sample} instead of {num_items_to_sample_raw}.")

    # if actual_num_to_sample == 0:
    #     print(f"Skipping dataset {name} as it has no items or actual_num_to_sample is 0.")
    #     continue

    # # Shuffling raw dataset with the fixed seed
    # dataset_selected = dataset_hf.shuffle(seed=random_seed).select(range(actual_num_to_sample))
    # .select(range(102000))
    dataset_selected = dataset_hf.shuffle(seed=random_seed)

    # The counting tokenizer (DeepSeek-R1) measures problem/solution/reasoning lengths;
    # the format tokenizer (Qwen2.5-0.5B-Instruct) builds the chat prompt. They could be
    # loaded once outside the dataset loop, but reloading per dataset is harmless.
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
    format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    processed_item_data_list = []
    tasks_args_list = []

    print(f"Preparing tasks for {name}...")
    for item in dataset_selected:
        tasks_args_list.append((item, name, tokenizer, format_tokenizer, max_prompt_token_len))

    print(f"Submitting {len(tasks_args_list)} tasks to thread pool for {name}...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        future_to_item_args = {executor.submit(process_single_item, args): args for args in tasks_args_list}
        
        for future in tqdm(concurrent.futures.as_completed(future_to_item_args), total=len(tasks_args_list), desc=f"Processing items for {name}"):
            try:
                result = future.result()
                if result:
                    processed_item_data_list.append(result)
            except Exception as exc:
                # item_arg_tuple = future_to_item_args[future] # Uncomment if needed for debugging
                print(f'Item generated an exception during future.result(): {exc}')


    records = []
    count = 0
    print(f"Collected {len(processed_item_data_list)} valid processed items for {name}. Assigning IDs...")
    # An earlier version capped the number of records via `num_items_to_sample_raw`;
    # that cap is disabled below, so every valid record is kept for splitting.
    for item_data in processed_item_data_list:
        # if len(records) >= num_items_to_sample_raw:
        #     break
        item_data["ids"] = f"{name}_{count}"
        records.append(item_data)
        count += 1
    
    if not records:
        print(f"No valid records generated for dataset {name} after filtering and ID assignment. Skipping saving.")
        continue

    # ---------- Split into train and test sets ----------
    # The test set is fixed at 1000 records (or fewer if not enough data are available);
    # the training set is everything else. `random_seed` has already been applied to the
    # global `random` module, so the split is reproducible.

    target_test_set_size = 1000  # fixed test set size
    num_available_records = len(records)
    
    train_records = []
    test_records = []

    if num_available_records == 0:
        print(f"No records available for splitting for {name}.")
    elif num_available_records <= target_test_set_size:
        # If we have 1000 or fewer records, all go to test set, train is empty
        print(f"Warning: Only {num_available_records} records available for {name}. All will be used for the test set.")
        test_records = list(records) # Make a copy
        train_records = []
    else:
        # We have more than 1000 records. Sample 1000 for test set.
        # `random.sample` uses the seed set by `random.seed(random_seed)`
        test_indices = sorted(random.sample(range(num_available_records), target_test_set_size))
        
        current_test_idx_ptr = 0
        for i in range(num_available_records):
            if current_test_idx_ptr < len(test_indices) and i == test_indices[current_test_idx_ptr]:
                test_records.append(records[i])
                current_test_idx_ptr += 1
            else:
                train_records.append(records[i])
        
        # Sanity check
        if len(test_records) != target_test_set_size:
             print(f"Error: Test set size mismatch. Expected {target_test_set_size}, got {len(test_records)}")
        if len(train_records) != num_available_records - target_test_set_size:
             print(f"Error: Train set size mismatch. Expected {num_available_records - target_test_set_size}, got {len(train_records)}")


    # ---------- Save train/test splits to parquet ----------
    if train_records:
        df_train = pd.DataFrame(train_records)
        df_train.to_parquet(f"v2_train_counting_dataset_{name}_{len(df_train)}.parquet", index=False)
    else:
        print(f"No training records to save for {name}.")

    if test_records:
        df_test = pd.DataFrame(test_records)
        df_test.to_parquet(f"v2_test_counting_dataset_{name}_{len(df_test)}.parquet", index=False)
    else:
        print(f"No test records to save for {name}.")
        
    print(f"✅ Successfully processed dataset {name}")
    print(f"  Saved Train samples: {len(train_records)}, Test samples: {len(test_records)}")
    print("-" * 30)

print("All datasets processed.")