# predictive_auditing_data/get_parquet.py
import random
from datasets import load_dataset
from transformers import AutoTokenizer
import re
from tqdm import tqdm
import pandas as pd
import json
import concurrent.futures
# ---------- Helper Functions (Unchanged) ----------
def extract_think_and_rest(text):
    """Extract the <think>...</think> blocks and the text remaining after the last </think>."""
    think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    last_think_end = 0
    for match in re.finditer(r"</think>", text):
        last_think_end = match.end()
    rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
    return think_blocks, rest_text
def extract_think_sections(text: str):
    """Extract the single <think>...</think> block and the text that follows it."""
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        end_pos = think_match.end()
        post_think_content = text[end_pos:].strip()
        if not think_content:  # i.e. an empty <think></think> block
            raise ValueError("Empty think block")
        return [think_content], post_think_content
    else:
        raise ValueError("Missing <think> block.")
def extract_think_and_solution_V2(text: str):
    """Extract the thought and solution blocks delimited by <|begin_of_thought|>/<|begin_of_solution|> markers."""
    pattern = (
        r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
        r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        think_content = match.group(1).strip()
        post_think_content = match.group(2).strip()
        if not think_content:
            raise ValueError("Empty thought block in V2.")
        return [think_content], post_think_content
    else:
        raise ValueError("Missing required <|begin_of_thought|> or <|begin_of_solution|> blocks.")
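# Illustrative usage of the extractors (hypothetical inputs, not taken from the datasets):
#   extract_think_sections("<think> add 2 and 2 </think> 4")
#       -> (["add 2 and 2"], "4")
#   extract_think_and_solution_V2("<|begin_of_thought|>T<|end_of_thought|><|begin_of_solution|>S<|end_of_solution|>")
#       -> (["T"], "S")
# Both raise ValueError on empty or missing blocks, which process_single_item() treats as "skip this item".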
# ---------- Worker function for multithreading (Unchanged) ----------
def process_single_item(args):
    item, dataset_name, tokenizer, format_tokenizer, max_prompt_token_len_config = args
    try:
        if dataset_name == "OpenR1-Math-220k":
            problem = item["problem"].strip()
            response_full = item["generations"][0].strip()
            reasoning_blocks, answer = extract_think_sections(response_full)
        elif dataset_name == "OpenThoughts-114k-math":
            problem = item["problem"].strip()
            response_full = item["conversations"][1]["value"].strip()
            reasoning_blocks, answer = extract_think_and_solution_V2(response_full)
        elif dataset_name == "reasoning-v1-20m":
            problem = item.get("prompt", "").strip()
            response_full = item.get("response", "").strip()
            reasoning_blocks, answer = extract_think_sections(response_full)
        elif dataset_name == "OpenThoughts-114k-Code_decontaminated":
            problem = item["problem"].strip()
            # response_full = item.get("response", "").strip()
            # reasoning_blocks, answer = extract_think_sections(response_full)
            reasoning_blocks = [item["deepseek_reasoning"]]
            answer = item["deepseek_solution"]
        elif dataset_name == "Medical-R1-Distill-Data":
            problem = item["question"].strip()
            # response_full = item.get("response", "").strip()
            # reasoning_blocks, answer = extract_think_sections(response_full)
            reasoning_blocks = [item["reasoning (reasoning_content)"]]
            answer = item["response (content)"]
        else:
            return None
        if not reasoning_blocks or not reasoning_blocks[0]:
            return None
        reasoning = reasoning_blocks[0].strip()
        solution = answer.strip()
        input_token_count = len(tokenizer.tokenize(problem))
        output_token_count = len(tokenizer.tokenize(solution))
        reasoning_token_count = len(tokenizer.tokenize(reasoning))
        instruct_info = (
            "Given a <Problem> and its corresponding <Solution>, your task is to predict how many tokens are consumed in the process of arriving at the final <Solution> to the problem. Generally speaking, the more complex the problem is, the more tokens are required.\n"
            f"<Problem>\n{problem}\n</Problem>\n\n"
            f"<Solution>\n{solution}\n</Solution>\n"
            f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
            "Please provide a detailed chain-of-thought reasoning process and include your thought process within <think> tags. "
            "Your final answer should be enclosed within <answer> tags.\n\n"
            "Please return the predicted number of tokens in JSON format: \n```json\n{\"count\": int}\n```\n\n"
            "Example format:\n"
            "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited by 1024 tokens] </think>\n"
            "<answer> Summary of the thought process leading to the final token count and your predicted token count in json format: \n```json\n{\"count\": int}\n```\n [Limited by 512 tokens]\n"
            "</answer>\n\n"
            "Let me solve this step by step.\n"
        )
        cot_info = "<think>"
        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": instruct_info.strip()},
        ]
        prompt = format_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompt += cot_info
        prompt_token_len = len(format_tokenizer.tokenize(prompt))
        if prompt_token_len <= max_prompt_token_len_config - 10:
            return {
                "prompt": prompt,
                "ground_truth": reasoning_token_count,
                "data_source": dataset_name,
                "prompt_token_len": prompt_token_len,
            }
        return None
    except ValueError:
        return None
    except Exception as e:
        print(f"Error processing item for {dataset_name}: {e}")
        return None
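# Illustrative output of process_single_item (hypothetical values, for reference only):
#   {"prompt": "<formatted Qwen chat prompt ending in <think>", "ground_truth": 3841,
#    "data_source": "OpenThoughts-114k-Code_decontaminated", "prompt_token_len": 1212}
# Over-long prompts, malformed items, and unknown dataset names all return None and are dropped downstream.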
# ---------- Parameter settings (unchanged in purpose, but test_size's role in splitting changes) ----------
# train_size = 100000 # This now primarily influences num_items_to_sample_raw
test_size = 1000 # This now primarily influences num_items_to_sample_raw; actual test split is fixed
max_prompt_token_len = 4096
random_seed = 42 # This is the fixed random seed
NUM_THREADS = 16
# Set the fixed random seed for Python's `random` module
# This will affect `random.sample` used for splitting
random.seed(random_seed)
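# With the seed fixed here, dataset.shuffle(seed=random_seed) and random.sample(...) below are
# both deterministic, so re-running the script reproduces the same shuffles and train/test splits
# (assuming the underlying dataset files are unchanged).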
# ---------- Load and shuffle datasets (logic largely unchanged; the seed is used by dataset.shuffle) ----------
datasets_config = {
    # "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
    # "reasoning-v1-20m": "/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data",
    # "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
    "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
    # "Medical-R1-Distill-Data": "/workspace/0407_nips/data_preprocess/Medical-R1-Distill-Data"
}
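# Note: the paths above point to local copies under /workspace; point them at wherever the raw
# datasets are stored before running the script.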
for name, path in datasets_config.items():
    print(f"Processing dataset: {name}")
    try:
        dataset_hf = load_dataset(path, trust_remote_code=True)["train"]
    except Exception as e:
        print(f"Error loading dataset {name} from {path}: {e}")
        continue
    print(f"Loaded {len(dataset_hf)} items for {name}.")
    # num_items_to_sample_raw = train_size + test_size + 1000
    # actual_num_to_sample = min(num_items_to_sample_raw, len(dataset_hf))
    # if actual_num_to_sample < num_items_to_sample_raw:
    #     print(f"Warning: Dataset {name} has only {len(dataset_hf)} items. Sampling {actual_num_to_sample} instead of {num_items_to_sample_raw}.")
    # if actual_num_to_sample == 0:
    #     print(f"Skipping dataset {name} as it has no items or actual_num_to_sample is 0.")
    #     continue
    # # Shuffling raw dataset with the fixed seed
    # dataset_selected = dataset_hf.shuffle(seed=random_seed).select(range(actual_num_to_sample))
    # .select(range(102000))
    dataset_selected = dataset_hf.shuffle(seed=random_seed)
    # The DeepSeek-R1 tokenizer measures problem/solution/reasoning lengths (the ground truth),
    # while the Qwen2.5 tokenizer formats and measures the final training prompt.
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
    format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
    processed_item_data_list = []
    tasks_args_list = []
    print(f"Preparing tasks for {name}...")
    for item in dataset_selected:
        tasks_args_list.append((item, name, tokenizer, format_tokenizer, max_prompt_token_len))
    print(f"Submitting {len(tasks_args_list)} tasks to thread pool for {name}...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        future_to_item_args = {executor.submit(process_single_item, args): args for args in tasks_args_list}
        for future in tqdm(concurrent.futures.as_completed(future_to_item_args), total=len(tasks_args_list), desc=f"Processing items for {name}"):
            try:
                result = future.result()
                if result:
                    processed_item_data_list.append(result)
            except Exception as exc:
                # item_arg_tuple = future_to_item_args[future]  # Uncomment if needed for debugging
                print(f'Item generated an exception during future.result(): {exc}')
    records = []
    count = 0
    print(f"Collected {len(processed_item_data_list)} valid processed items for {name}. Assigning IDs...")
    # The original cap (`num_items_to_sample_raw`) on the number of records considered for
    # splitting is disabled below; every valid processed item receives an ID.
    for item_data in processed_item_data_list:
        # if len(records) >= num_items_to_sample_raw:
        #     break
        item_data["ids"] = f"{name}_{count}"
        records.append(item_data)
        count += 1
    if not records:
        print(f"No valid records generated for dataset {name} after filtering and ID assignment. Skipping saving.")
        continue
    # ---------- MODIFIED: Split into train and test sets ----------
    # The test set is fixed at 1000 records (or fewer if not enough data are available);
    # the training set is everything else.
    # random_seed has already been set globally for the `random` module.
    target_test_set_size = 1000  # fixed size required for the test split
    num_available_records = len(records)
    train_records = []
    test_records = []
    if num_available_records == 0:
        print(f"No records available for splitting for {name}.")
    elif num_available_records <= target_test_set_size:
        # With 1000 or fewer records, everything goes to the test set and the train set stays empty.
        print(f"Warning: Only {num_available_records} records available for {name}. All will be used for the test set.")
        test_records = list(records)  # make a copy
        train_records = []
    else:
        # More than 1000 records: sample 1000 indices for the test set.
        # `random.sample` uses the seed set by `random.seed(random_seed)`.
        test_indices = sorted(random.sample(range(num_available_records), target_test_set_size))
        current_test_idx_ptr = 0
        for i in range(num_available_records):
            if current_test_idx_ptr < len(test_indices) and i == test_indices[current_test_idx_ptr]:
                test_records.append(records[i])
                current_test_idx_ptr += 1
            else:
                train_records.append(records[i])
        # Sanity check
        if len(test_records) != target_test_set_size:
            print(f"Error: Test set size mismatch. Expected {target_test_set_size}, got {len(test_records)}")
        if len(train_records) != num_available_records - target_test_set_size:
            print(f"Error: Train set size mismatch. Expected {num_available_records - target_test_set_size}, got {len(train_records)}")
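    # Sketch of a behaviorally similar alternative: partition by a sampled index set, e.g.
    #   test_idx = set(random.sample(range(num_available_records), target_test_set_size))
    #   test_records = [r for i, r in enumerate(records) if i in test_idx]
    #   train_records = [r for i, r in enumerate(records) if i not in test_idx]
    # Like the loop above, this preserves the original record order within both splits.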
    # ---------- Save (unchanged other than variable names if needed) ----------
    if train_records:
        df_train = pd.DataFrame(train_records)
        df_train.to_parquet(f"v2_train_counting_dataset_{name}_{len(df_train)}.parquet", index=False)
    else:
        print(f"No training records to save for {name}.")
    if test_records:
        df_test = pd.DataFrame(test_records)
        df_test.to_parquet(f"v2_test_counting_dataset_{name}_{len(df_test)}.parquet", index=False)
    else:
        print(f"No test records to save for {name}.")
    print(f"✅ Successfully processed dataset {name}")
    print(f" Saved Train samples: {len(train_records)}, Test samples: {len(test_records)}")
    print("-" * 30)
print("All datasets processed.")