# predictive_auditing_data/get_parquet.py
import random
from datasets import load_dataset
from transformers import AutoTokenizer
import re
from tqdm import tqdm
import pandas as pd
import json
import concurrent.futures
# ---------- Helper Functions (Unchanged) ----------
def extract_think_and_rest(text):
    """Extract the <think>...</think> blocks and the text remaining after the last </think>."""
    think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    last_think_end = 0
    for match in re.finditer(r"</think>", text):
        last_think_end = match.end()
    rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
    return think_blocks, rest_text
def extract_think_sections(text: str):
    """Extract the single <think>...</think> block and the text that follows it."""
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        end_pos = think_match.end()
        post_think_content = text[end_pos:].strip()
        if not think_content:  # i.e. an empty <think></think> block
            raise ValueError("Empty think block")
        return [think_content], post_think_content
    else:
        raise ValueError("Missing <think> block.")
def extract_think_and_solution_V2(text: str):
    """Extract the thought and solution blocks delimited by <|begin_of_thought|>/<|begin_of_solution|> markers."""
    pattern = (
        r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
        r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        think_content = match.group(1).strip()
        post_think_content = match.group(2).strip()
        if not think_content:
            raise ValueError("Empty thought block in V2.")
        return [think_content], post_think_content
    else:
        raise ValueError("Missing required <|begin_of_thought|> or <|begin_of_solution|> blocks.")
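# Illustrative usage of the extractors (hypothetical inputs, not taken from the datasets):
#   extract_think_sections("<think> add 2 and 2 </think> 4")
#       -> (["add 2 and 2"], "4")
#   extract_think_and_solution_V2("<|begin_of_thought|>T<|end_of_thought|><|begin_of_solution|>S<|end_of_solution|>")
#       -> (["T"], "S")
# Both raise ValueError on empty or missing blocks, which process_single_item() treats as "skip this item".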
# ---------- Worker function for multithreading (Unchanged) ----------
def process_single_item(args):
    item, dataset_name, tokenizer, format_tokenizer, max_prompt_token_len_config = args
    try:
        if dataset_name == "OpenR1-Math-220k":
            problem = item["problem"].strip()
            response_full = item["generations"][0].strip()
            reasoning_blocks, answer = extract_think_sections(response_full)
        elif dataset_name == "OpenThoughts-114k-math":
            problem = item["problem"].strip()
            response_full = item["conversations"][1]["value"].strip()
            reasoning_blocks, answer = extract_think_and_solution_V2(response_full)
        elif dataset_name == "reasoning-v1-20m":
            problem = item.get("prompt", "").strip()
            response_full = item.get("response", "").strip()
            reasoning_blocks, answer = extract_think_sections(response_full)
        elif dataset_name == "OpenThoughts-114k-Code_decontaminated":
            problem = item["problem"].strip()
            # response_full = item.get("response", "").strip()
            # reasoning_blocks, answer = extract_think_sections(response_full)
            reasoning_blocks = [item["deepseek_reasoning"]]
            answer = item["deepseek_solution"]
        elif dataset_name == "Medical-R1-Distill-Data":
            problem = item["question"].strip()
            # response_full = item.get("response", "").strip()
            # reasoning_blocks, answer = extract_think_sections(response_full)
            reasoning_blocks = [item["reasoning (reasoning_content)"]]
            answer = item["response (content)"]
        else:
            return None
        if not reasoning_blocks or not reasoning_blocks[0]:
            return None
        reasoning = reasoning_blocks[0].strip()
        solution = answer.strip()
        input_token_count = len(tokenizer.tokenize(problem))
        output_token_count = len(tokenizer.tokenize(solution))
        reasoning_token_count = len(tokenizer.tokenize(reasoning))
        instruct_info = (
            "Given a <Problem> and its corresponding <Solution>, your task is to predict how many tokens are consumed in the process of arriving at the final <Solution> to the problem. Generally speaking, the more complex the problem is, the more tokens are required.\n"
            f"<Problem>\n{problem}\n</Problem>\n\n"
            f"<Solution>\n{solution}\n</Solution>\n"
            f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
            "Please provide a detailed chain-of-thought reasoning process and include your thought process within <think> tags. "
            "Your final answer should be enclosed within <answer> tags.\n\n"
            "Please return the predicted number of tokens in JSON format: \n```json\n{\"count\": int}\n```\n\n"
            "Example format:\n"
            "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited by 1024 tokens] </think>\n"
            "<answer> Summary of the thought process leading to the final token count and your predicted token count in json format: \n```json\n{\"count\": int}\n```\n [Limited by 512 tokens]\n"
            "</answer>\n\n"
            "Let me solve this step by step.\n"
        )
        cot_info = "<think>"
        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": instruct_info.strip()},
        ]
        prompt = format_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompt += cot_info
        prompt_token_len = len(format_tokenizer.tokenize(prompt))
        if prompt_token_len <= max_prompt_token_len_config - 10:
            return {
                "prompt": prompt,
                "ground_truth": reasoning_token_count,
                "data_source": dataset_name,
                "prompt_token_len": prompt_token_len,
            }
        return None
    except ValueError:
        return None
    except Exception as e:
        print(f"Error processing item for {dataset_name}: {e}")
        return None
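# Illustrative output of process_single_item (hypothetical values, for reference only):
#   {"prompt": "<formatted Qwen chat prompt ending in <think>", "ground_truth": 3841,
#    "data_source": "OpenThoughts-114k-Code_decontaminated", "prompt_token_len": 1212}
# Over-long prompts, malformed items, and unknown dataset names all return None and are dropped downstream.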
# ---------- Parameter settings (unchanged in purpose, but test_size's role in splitting changes) ----------
# train_size = 100000 # This now primarily influences num_items_to_sample_raw
test_size = 1000 # This now primarily influences num_items_to_sample_raw; actual test split is fixed
max_prompt_token_len = 4096
random_seed = 42 # This is the fixed random seed
NUM_THREADS = 16
# Set the fixed random seed for Python's `random` module
# This will affect `random.sample` used for splitting
random.seed(random_seed)
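# With the seed fixed here, dataset.shuffle(seed=random_seed) and random.sample(...) below are
# both deterministic, so re-running the script reproduces the same shuffles and train/test splits
# (assuming the underlying dataset files are unchanged).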
# ---------- Load and shuffle datasets (logic largely unchanged; the seed is used by dataset.shuffle) ----------
datasets_config = {
    # "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
    # "reasoning-v1-20m": "/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data",
    # "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
    "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
    # "Medical-R1-Distill-Data": "/workspace/0407_nips/data_preprocess/Medical-R1-Distill-Data"
}
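# Note: the paths above point to local copies under /workspace; point them at wherever the raw
# datasets are stored before running the script.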
for name, path in datasets_config.items():
    print(f"Processing dataset: {name}")
    try:
        dataset_hf = load_dataset(path, trust_remote_code=True)["train"]
    except Exception as e:
        print(f"Error loading dataset {name} from {path}: {e}")
        continue
    print(f"Loaded {len(dataset_hf)} items for {name}.")
    # num_items_to_sample_raw = train_size + test_size + 1000
    # actual_num_to_sample = min(num_items_to_sample_raw, len(dataset_hf))
    # if actual_num_to_sample < num_items_to_sample_raw:
    #     print(f"Warning: Dataset {name} has only {len(dataset_hf)} items. Sampling {actual_num_to_sample} instead of {num_items_to_sample_raw}.")
    # if actual_num_to_sample == 0:
    #     print(f"Skipping dataset {name} as it has no items or actual_num_to_sample is 0.")
    #     continue
    # # Shuffling raw dataset with the fixed seed
    # dataset_selected = dataset_hf.shuffle(seed=random_seed).select(range(actual_num_to_sample))
    # .select(range(102000))
    dataset_selected = dataset_hf.shuffle(seed=random_seed)
    # The DeepSeek-R1 tokenizer measures problem/solution/reasoning lengths (the ground truth),
    # while the Qwen2.5 tokenizer formats and measures the final training prompt.
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
    format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
    processed_item_data_list = []
    tasks_args_list = []
    print(f"Preparing tasks for {name}...")
    for item in dataset_selected:
        tasks_args_list.append((item, name, tokenizer, format_tokenizer, max_prompt_token_len))
    print(f"Submitting {len(tasks_args_list)} tasks to thread pool for {name}...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        future_to_item_args = {executor.submit(process_single_item, args): args for args in tasks_args_list}
        for future in tqdm(concurrent.futures.as_completed(future_to_item_args), total=len(tasks_args_list), desc=f"Processing items for {name}"):
            try:
                result = future.result()
                if result:
                    processed_item_data_list.append(result)
            except Exception as exc:
                # item_arg_tuple = future_to_item_args[future]  # Uncomment if needed for debugging
                print(f'Item generated an exception during future.result(): {exc}')
    records = []
    count = 0
    print(f"Collected {len(processed_item_data_list)} valid processed items for {name}. Assigning IDs...")
    # The original cap (`num_items_to_sample_raw`) on the number of records considered for
    # splitting is disabled below; every valid processed item receives an ID.
    for item_data in processed_item_data_list:
        # if len(records) >= num_items_to_sample_raw:
        #     break
        item_data["ids"] = f"{name}_{count}"
        records.append(item_data)
        count += 1
    if not records:
        print(f"No valid records generated for dataset {name} after filtering and ID assignment. Skipping saving.")
        continue
    # ---------- MODIFIED: Split into train and test sets ----------
    # The test set is fixed at 1000 records (or fewer if not enough data are available);
    # the training set is everything else.
    # random_seed has already been set globally for the `random` module.
    target_test_set_size = 1000  # fixed size required for the test split
    num_available_records = len(records)
    train_records = []
    test_records = []
    if num_available_records == 0:
        print(f"No records available for splitting for {name}.")
    elif num_available_records <= target_test_set_size:
        # With 1000 or fewer records, everything goes to the test set and the train set stays empty.
        print(f"Warning: Only {num_available_records} records available for {name}. All will be used for the test set.")
        test_records = list(records)  # make a copy
        train_records = []
    else:
        # More than 1000 records: sample 1000 indices for the test set.
        # `random.sample` uses the seed set by `random.seed(random_seed)`.
        test_indices = sorted(random.sample(range(num_available_records), target_test_set_size))
        current_test_idx_ptr = 0
        for i in range(num_available_records):
            if current_test_idx_ptr < len(test_indices) and i == test_indices[current_test_idx_ptr]:
                test_records.append(records[i])
                current_test_idx_ptr += 1
            else:
                train_records.append(records[i])
        # Sanity check
        if len(test_records) != target_test_set_size:
            print(f"Error: Test set size mismatch. Expected {target_test_set_size}, got {len(test_records)}")
        if len(train_records) != num_available_records - target_test_set_size:
            print(f"Error: Train set size mismatch. Expected {num_available_records - target_test_set_size}, got {len(train_records)}")
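    # Sketch of a behaviorally similar alternative: partition by a sampled index set, e.g.
    #   test_idx = set(random.sample(range(num_available_records), target_test_set_size))
    #   test_records = [r for i, r in enumerate(records) if i in test_idx]
    #   train_records = [r for i, r in enumerate(records) if i not in test_idx]
    # Like the loop above, this preserves the original record order within both splits.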
    # ---------- Save (unchanged other than variable names if needed) ----------
    if train_records:
        df_train = pd.DataFrame(train_records)
        df_train.to_parquet(f"v2_train_counting_dataset_{name}_{len(df_train)}.parquet", index=False)
    else:
        print(f"No training records to save for {name}.")
    if test_records:
        df_test = pd.DataFrame(test_records)
        df_test.to_parquet(f"v2_test_counting_dataset_{name}_{len(df_test)}.parquet", index=False)
    else:
        print(f"No test records to save for {name}.")
    print(f"✅ Successfully processed dataset {name}")
    print(f" Saved Train samples: {len(train_records)}, Test samples: {len(test_records)}")
    print("-" * 30)
print("All datasets processed.")