import concurrent.futures
import json
import random
import re

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer


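# Overview (descriptive comment added for readability): this script converts
# reasoning-distillation datasets into "token counting" examples. Each prompt
# shows a <Problem> and its <Solution> (plus their token counts) and asks the
# model to predict how many tokens the original chain-of-thought consumed;
# the measured reasoning token count is stored as the ground truth.

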
def extract_think_and_rest(text):
    """Extract all <think>...</think> blocks and the text after the last one."""
    think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    last_think_end = 0
    for match in re.finditer(r"</think>", text):
        last_think_end = match.end()
    rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
    return think_blocks, rest_text


def extract_think_sections(text: str):
    """Extract the first <think>...</think> block and the text that follows it."""
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        end_pos = think_match.end()
        post_think_content = text[end_pos:].strip()
        if not think_content:
            raise ValueError("Empty think block.")
        return [think_content], post_think_content
    else:
        raise ValueError("Missing <think> block.")


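# Illustrative use of extract_think_sections (hypothetical input, not drawn
# from the datasets):
#   extract_think_sections("<think>step 1 ... step n</think>Final answer.")
#   -> (["step 1 ... step n"], "Final answer.")

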
def extract_think_and_solution_V2(text: str):
    """Extract the thought/solution pair from <|begin_of_thought|>/<|begin_of_solution|> markup."""
    pattern = (
        r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
        r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        think_content = match.group(1).strip()
        post_think_content = match.group(2).strip()
        if not think_content:
            raise ValueError("Empty thought block in V2.")
        return [think_content], post_think_content
    else:
        raise ValueError("Missing required <|begin_of_thought|> or <|begin_of_solution|> blocks.")


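# Illustrative use of extract_think_and_solution_V2 (hypothetical input):
#   extract_think_and_solution_V2(
#       "<|begin_of_thought|>reasoning steps<|end_of_thought|>"
#       "<|begin_of_solution|>final answer<|end_of_solution|>"
#   )
#   -> (["reasoning steps"], "final answer")

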
def process_single_item(args):
    """Build one token-count-prediction example, or return None if the item is unusable."""
    item, dataset_name, tokenizer, format_tokenizer, max_prompt_token_len_config = args
    try:
        # Each dataset stores the problem, reasoning trace, and final solution differently.
        if dataset_name == "OpenR1-Math-220k":
            problem = item["problem"].strip()
            response_full = item["generations"][0].strip()
            reasoning_blocks, answer = extract_think_sections(response_full)
        elif dataset_name == "OpenThoughts-114k-math":
            problem = item["problem"].strip()
            response_full = item["conversations"][1]["value"].strip()
            reasoning_blocks, answer = extract_think_and_solution_V2(response_full)
        elif dataset_name == "reasoning-v1-20m":
            problem = item.get("prompt", "").strip()
            response_full = item.get("response", "").strip()
            reasoning_blocks, answer = extract_think_sections(response_full)
        elif dataset_name == "OpenThoughts-114k-Code_decontaminated":
            problem = item["problem"].strip()
            # Reasoning and solution are already split into separate columns here.
            reasoning_blocks = [item["deepseek_reasoning"]]
            answer = item["deepseek_solution"]
        elif dataset_name == "Medical-R1-Distill-Data":
            problem = item["question"].strip()
            reasoning_blocks = [item["reasoning (reasoning_content)"]]
            answer = item["response (content)"]
        else:
            return None

        if not reasoning_blocks or not reasoning_blocks[0]:
            return None

        reasoning = reasoning_blocks[0].strip()
        solution = answer.strip()

        # Token counts are measured with the reasoning model's tokenizer.
        input_token_count = len(tokenizer.tokenize(problem))
        output_token_count = len(tokenizer.tokenize(solution))
        reasoning_token_count = len(tokenizer.tokenize(reasoning))

        instruct_info = (
            "Given a <Problem> and its corresponding <Solution>, your task is to predict how many tokens are consumed in the process of arriving at the final <Solution> to the problem. Generally speaking, the more complex the problem is, the more tokens are required.\n"
            f"<Problem>\n{problem}\n</Problem>\n\n"
            f"<Solution>\n{solution}\n</Solution>\n"
            f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
            "Please provide a detailed chain-of-thought reasoning process and include your thought process within <think> tags. "
            "Your final answer should be enclosed within <answer> tags.\n\n"
            "Please return the predicted number of tokens in JSON format: \n```json\n{\"count\": int}\n```\n\n"
            "Example format:\n"
            "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited by 1024 tokens] </think>\n"
            "<answer> Summary of the thought process leading to the final token count and your predicted token count in json format: \n```json\n{\"count\": int}\n```\n [Limited by 512 tokens]\n"
            "</answer>\n\n"
            "Let me solve this step by step.\n"
        )

        # Prime the generation with an opening <think> tag.
        cot_info = "<think>"

        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": instruct_info.strip()},
        ]

        prompt = format_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompt += cot_info

        prompt_token_len = len(format_tokenizer.tokenize(prompt))

        # Keep a 10-token safety margin below the configured prompt-length limit.
        if prompt_token_len <= max_prompt_token_len_config - 10:
            return {
                "prompt": prompt,
                "ground_truth": reasoning_token_count,
                "data_source": dataset_name,
                "prompt_token_len": prompt_token_len,
            }
        return None
    except ValueError:
        # Raised by the extractors when a reasoning block is missing or empty.
        return None
    except Exception as e:
        print(f"Error processing item for {dataset_name}: {e}")
        return None


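# Global configuration: per-dataset test-set size, maximum prompt length
# (measured in format_tokenizer tokens, with a 10-token margin applied in
# process_single_item), RNG seed, and thread-pool width.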
test_size = 1000
max_prompt_token_len = 4096
random_seed = 42
NUM_THREADS = 16

random.seed(random_seed)

# Map of dataset name -> local path. process_single_item also understands
# "OpenR1-Math-220k", "OpenThoughts-114k-math", "reasoning-v1-20m", and
# "Medical-R1-Distill-Data"; add their paths here to include them.
datasets_config = {
    "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
}

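# Per-dataset pipeline: load, shuffle, build prompts in a thread pool,
# assign IDs, split into train/test, and save each split as parquet.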
for name, path in datasets_config.items():
    print(f"Processing dataset: {name}")
    try:
        dataset_hf = load_dataset(path, trust_remote_code=True)["train"]
    except Exception as e:
        print(f"Error loading dataset {name} from {path}: {e}")
        continue
    print(len(dataset_hf))

    dataset_selected = dataset_hf.shuffle(seed=random_seed)

    # tokenizer measures reasoning/solution lengths; format_tokenizer applies the
    # Qwen chat template used to build the final prompt.
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
    format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    processed_item_data_list = []
    tasks_args_list = []

    print(f"Preparing tasks for {name}...")
    for item in dataset_selected:
        tasks_args_list.append((item, name, tokenizer, format_tokenizer, max_prompt_token_len))

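    # Tokenization dominates the per-item cost; a thread pool keeps things simple
    # and avoids pickling the tokenizers, as a process pool would require.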
    print(f"Submitting {len(tasks_args_list)} tasks to thread pool for {name}...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        future_to_item_args = {executor.submit(process_single_item, args): args for args in tasks_args_list}

        for future in tqdm(concurrent.futures.as_completed(future_to_item_args), total=len(tasks_args_list), desc=f"Processing items for {name}"):
            try:
                result = future.result()
                if result:
                    processed_item_data_list.append(result)
            except Exception as exc:
                print(f'Item generated an exception during future.result(): {exc}')

    records = []
    count = 0
    print(f"Collected {len(processed_item_data_list)} valid processed items for {name}. Assigning IDs...")

    for item_data in processed_item_data_list:
        item_data["ids"] = f"{name}_{count}"
        records.append(item_data)
        count += 1

    if not records:
        print(f"No valid records generated for dataset {name} after filtering and ID assignment. Skipping saving.")
        continue

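    # Hold out a fixed-size random test split by index; everything else goes to train.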
    target_test_set_size = test_size
    num_available_records = len(records)

    train_records = []
    test_records = []

    if num_available_records == 0:
        print(f"No records available for splitting for {name}.")
    elif num_available_records <= target_test_set_size:
        print(f"Warning: Only {num_available_records} records available for {name}. All will be used for the test set.")
        test_records = list(records)
        train_records = []
    else:
        test_indices = sorted(random.sample(range(num_available_records), target_test_set_size))

        current_test_idx_ptr = 0
        for i in range(num_available_records):
            if current_test_idx_ptr < len(test_indices) and i == test_indices[current_test_idx_ptr]:
                test_records.append(records[i])
                current_test_idx_ptr += 1
            else:
                train_records.append(records[i])

        # Sanity-check the split sizes.
        if len(test_records) != target_test_set_size:
            print(f"Error: Test set size mismatch. Expected {target_test_set_size}, got {len(test_records)}")
        if len(train_records) != num_available_records - target_test_set_size:
            print(f"Error: Train set size mismatch. Expected {num_available_records - target_test_set_size}, got {len(train_records)}")

    if train_records:
        df_train = pd.DataFrame(train_records)
        df_train.to_parquet(f"v2_train_counting_dataset_{name}_{len(df_train)}.parquet", index=False)
    else:
        print(f"No training records to save for {name}.")

    if test_records:
        df_test = pd.DataFrame(test_records)
        df_test.to_parquet(f"v2_test_counting_dataset_{name}_{len(df_test)}.parquet", index=False)
    else:
        print(f"No test records to save for {name}.")

    print(f"✅ Successfully processed dataset {name}")
    print(f" Saved Train samples: {len(train_records)}, Test samples: {len(test_records)}")
    print("-" * 30)

print("All datasets processed.")