# predictive_auditing_data/get_parquet_single.py
import random
from datasets import load_dataset
from transformers import AutoTokenizer
import re
from tqdm import tqdm
import pandas as pd
import json

def extract_think_and_rest(text):
    """Extract all <think>...</think> blocks and the text after the last one."""
    think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    last_think_end = 0
    for match in re.finditer(r"</think>", text):
        last_think_end = match.end()
    rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
    return think_blocks, rest_text
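
# Sketch of the expected behavior on a made-up input (the strings below are
# ours, not drawn from any dataset):
#   extract_think_and_rest("<think>2+2=4</think>The answer is 4.")
#   -> (["2+2=4"], "The answer is 4.")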

def extract_think_sections(text: str):
    """Return (first <think> block, text after it), or (None, stripped text) when no block exists."""
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        end_pos = think_match.end()
        post_think_content = text[end_pos:].strip()
    else:
        # No <think> block: signal failure with None so callers can skip the sample.
        think_content = None
        post_think_content = text.strip()
    return think_content, post_think_content
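
# Sketch of the expected behavior on made-up inputs:
#   extract_think_sections("<think>2+2=4</think>The answer is 4.")
#   -> ("2+2=4", "The answer is 4.")
#   extract_think_sections("no tags here")
#   -> (None, "no tags here")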

def extract_think_and_solution_V2(text: str):
    """Return (thought, solution) from OpenThoughts-style delimiters, or (None, stripped text)."""
    pattern = (
        r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
        r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        think_content = match.group(1).strip()
        post_think_content = match.group(2).strip()
    else:
        # Missing <|begin_of_thought|> or <|begin_of_solution|> block: signal
        # failure with None so callers can skip the sample.
        think_content = None
        post_think_content = text.strip()
    return think_content, post_think_content
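
# Sketch of the expected behavior on a made-up input:
#   extract_think_and_solution_V2(
#       "<|begin_of_thought|>try x=3<|end_of_thought|>"
#       "<|begin_of_solution|>x = 3<|end_of_solution|>")
#   -> ("try x=3", "x = 3")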

# ---------- Parameters ----------
train_size = 110000  # number of training samples
test_size = 1000     # number of test samples
max_prompt_token_len = 2048
random_seed = 42
num_workers = 16

# ---------- Load and shuffle the datasets ----------
datasets_config = {
    "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
    # "reasoning-v1-20m": "/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data",
    "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
    # "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
    # "Medical-R1-Distill-Data": "/workspace/0407_nips/data_preprocess/Medical-R1-Distill-Data",
}
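
# The paths above point at local snapshots. If they are unavailable, the same
# corpora can presumably be pulled from the Hub instead, e.g. (unverified
# dataset ID, shown only as an illustration):
#   dataset = load_dataset("open-r1/OpenR1-Math-220k")["train"]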

# ---------- Initialize tokenizers ----------
# tokenizer (DeepSeek-R1) measures problem/solution/reasoning lengths, i.e. the
# ground-truth token counts; format_tokenizer (Qwen) measures the formatted
# prompt, which is what the prediction model will consume.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

for name, path in datasets_config.items():
    print(f"Processing dataset: {name}")
    dataset = load_dataset(path)["train"]
    # Oversample beyond train+test so the prompt-length filter below still
    # leaves enough records.
    total_size = train_size + test_size + 1000
    dataset = dataset.shuffle(seed=random_seed).select(range(total_size))

    records = []
    count = 0

    # ---------- Process samples ----------
    for item in tqdm(dataset, desc="Processing samples"):
        if len(records) >= total_size:
            break
        if name == "OpenR1-Math-220k":
            problem = item["problem"].strip()
            response_full = item["generations"][0].strip()
            reasoning, solution = extract_think_sections(response_full)
        elif name == "OpenThoughts-114k-math":
            problem = item["problem"].strip()
            response_full = item["conversations"][1]["value"].strip()
            reasoning, solution = extract_think_and_solution_V2(response_full)
        elif name == "reasoning-v1-20m":
            problem = item.get("prompt", "").strip()
            response_full = item.get("response", "").strip()
            reasoning, solution = extract_think_sections(response_full)
        else:
            continue  # unknown dataset name
        if not reasoning:
            continue  # sample had no parsable reasoning block

        input_token_count = len(tokenizer.tokenize(problem))
        output_token_count = len(tokenizer.tokenize(solution))
        reasoning_token_count = len(tokenizer.tokenize(reasoning))

        instruct_info = (
            "Your task is to estimate how many tokens a reasoning model would use to solve the following Problem and Solution.\n"
            "Please return the predicted number of tokens in JSON format: ```json\n{\"count\": int}\n```\n\n"
            f"<Problem>\n{problem}\n</Problem>\n\n"
            f"<Solution>\n{solution}\n</Solution>\n"
            f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
            "Please provide a detailed chain-of-thought reasoning process and include your thought process within `<think>` tags. "
            "Your final answer should be enclosed within `<answer>` tags.\n\n"
            "Ensure that your counting logic is sound and that your explanation is token-efficient.\n\n"
            "Example format:\n"
            "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited by 1024 tokens] </think>\n"
            "<answer> Summary of the thought process leading to the final token count and your predicted token count in json format: ```json\n{\"count\": int}\n``` [Limited by 512 tokens]\n"
            "</answer>\n\n"
        )
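
        # A completion that followed the requested format would look roughly
        # like this (made-up numbers, shown only as an illustration):
        #   <think> The solution is short but the algebra is fiddly ... </think>
        #   <answer> ... ```json
        #   {"count": 742}
        #   ``` </answer>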
        cot_info = "Let me solve this step by step.\n"
        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": instruct_info.strip()},
        ]
        # Format with the Qwen chat template: the system prompt targets Qwen,
        # and the prompt length is budgeted with the Qwen tokenizer below.
        prompt = format_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompt += cot_info
        prompt_token_len = len(format_tokenizer.tokenize(prompt))

        # Keep a small safety margin below the hard prompt-length budget.
        if prompt_token_len <= max_prompt_token_len - 10:
            records.append({
                "prompt": prompt,
                "ground_truth": reasoning_token_count,
                "data_source": name,
                "ids": f"{name}_{count}",
                "prompt_token_len": prompt_token_len,
            })
            count += 1

        if len(records) >= total_size:
            break

    # ---------- Split into train and test sets ----------
    train_records = records[:train_size]
    test_records = records[train_size:train_size + test_size]

    # ---------- Save ----------
    df_train = pd.DataFrame(train_records)
    df_test = pd.DataFrame(test_records)
    df_train.to_parquet(f"train_counting_dataset_{name}_{train_size}.parquet", index=False)
    df_test.to_parquet(f"test_counting_dataset_{name}_{test_size}.parquet", index=False)
    print("✅ Successfully generated the train and test datasets")
    print(f"Train samples: {len(df_train)}, Test samples: {len(df_test)}")