# predictive_auditing_data/get_parquet_single.py
import random
from datasets import load_dataset
from transformers import AutoTokenizer
import re
from tqdm import tqdm
import pandas as pd
import json

def extract_think_and_rest(text):
    """Extract all <think>...</think> blocks and the text after the last one."""
    think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    last_think_end = 0
    for match in re.finditer(r"</think>", text):
        last_think_end = match.end()
    rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
    return think_blocks, rest_text
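
# Sketch of the expected behavior on a made-up input (the strings below are
# ours, not drawn from any dataset):
#   extract_think_and_rest("<think>2+2=4</think>The answer is 4.")
#   -> (["2+2=4"], "The answer is 4.")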

def extract_think_sections(text: str):
    """Return (first <think> block, text after it), or (None, stripped text) when no block exists."""
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        end_pos = think_match.end()
        post_think_content = text[end_pos:].strip()
    else:
        # No <think> block: signal failure with None so callers can skip the sample.
        think_content = None
        post_think_content = text.strip()
    return think_content, post_think_content
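
# Sketch of the expected behavior on made-up inputs:
#   extract_think_sections("<think>2+2=4</think>The answer is 4.")
#   -> ("2+2=4", "The answer is 4.")
#   extract_think_sections("no tags here")
#   -> (None, "no tags here")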

def extract_think_and_solution_V2(text: str):
    """Return (thought, solution) from OpenThoughts-style delimiters, or (None, stripped text)."""
    pattern = (
        r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
        r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        think_content = match.group(1).strip()
        post_think_content = match.group(2).strip()
    else:
        # Missing <|begin_of_thought|> or <|begin_of_solution|> block: signal
        # failure with None so callers can skip the sample.
        think_content = None
        post_think_content = text.strip()
    return think_content, post_think_content
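
# Sketch of the expected behavior on a made-up input:
#   extract_think_and_solution_V2(
#       "<|begin_of_thought|>try x=3<|end_of_thought|>"
#       "<|begin_of_solution|>x = 3<|end_of_solution|>")
#   -> ("try x=3", "x = 3")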

# ---------- Parameters ----------
train_size = 110000  # number of training samples
test_size = 1000     # number of test samples
max_prompt_token_len = 2048
random_seed = 42
num_workers = 16

# ---------- Load and shuffle the datasets ----------
datasets_config = {
    "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
    # "reasoning-v1-20m": "/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data",
    "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
    # "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
    # "Medical-R1-Distill-Data": "/workspace/0407_nips/data_preprocess/Medical-R1-Distill-Data",
}
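
# The paths above point at local snapshots. If they are unavailable, the same
# corpora can presumably be pulled from the Hub instead, e.g. (unverified
# dataset ID, shown only as an illustration):
#   dataset = load_dataset("open-r1/OpenR1-Math-220k")["train"]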

# ---------- Initialize tokenizers ----------
# tokenizer (DeepSeek-R1) measures problem/solution/reasoning lengths, i.e. the
# ground-truth token counts; format_tokenizer (Qwen) measures the formatted
# prompt, which is what the prediction model will consume.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

for name, path in datasets_config.items():
    print(f"Processing dataset: {name}")
    dataset = load_dataset(path)["train"]
    # Oversample beyond train+test so the prompt-length filter below still
    # leaves enough records.
    total_size = train_size + test_size + 1000
    dataset = dataset.shuffle(seed=random_seed).select(range(total_size))

    records = []
    count = 0

    # ---------- Process samples ----------
    for item in tqdm(dataset, desc="Processing samples"):
        if len(records) >= total_size:
            break
        if name == "OpenR1-Math-220k":
            problem = item["problem"].strip()
            response_full = item["generations"][0].strip()
            reasoning, solution = extract_think_sections(response_full)
        elif name == "OpenThoughts-114k-math":
            problem = item["problem"].strip()
            response_full = item["conversations"][1]["value"].strip()
            reasoning, solution = extract_think_and_solution_V2(response_full)
        elif name == "reasoning-v1-20m":
            problem = item.get("prompt", "").strip()
            response_full = item.get("response", "").strip()
            reasoning, solution = extract_think_sections(response_full)
        else:
            continue  # unknown dataset name
        if not reasoning:
            continue  # sample had no parsable reasoning block

        input_token_count = len(tokenizer.tokenize(problem))
        output_token_count = len(tokenizer.tokenize(solution))
        reasoning_token_count = len(tokenizer.tokenize(reasoning))

        instruct_info = (
            "Your task is to estimate how many tokens a reasoning model would use to solve the following Problem and Solution.\n"
            "Please return the predicted number of tokens in JSON format: ```json\n{\"count\": int}\n```\n\n"
            f"<Problem>\n{problem}\n</Problem>\n\n"
            f"<Solution>\n{solution}\n</Solution>\n"
            f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
            "Please provide a detailed chain-of-thought reasoning process and include your thought process within `<think>` tags. "
            "Your final answer should be enclosed within `<answer>` tags.\n\n"
            "Ensure that your counting logic is sound and that your explanation is token-efficient.\n\n"
            "Example format:\n"
            "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited by 1024 tokens] </think>\n"
            "<answer> Summary of the thought process leading to the final token count and your predicted token count in json format: ```json\n{\"count\": int}\n``` [Limited by 512 tokens]\n"
            "</answer>\n\n"
        )
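
        # A completion that followed the requested format would look roughly
        # like this (made-up numbers, shown only as an illustration):
        #   <think> The solution is short but the algebra is fiddly ... </think>
        #   <answer> ... ```json
        #   {"count": 742}
        #   ``` </answer>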
        cot_info = "Let me solve this step by step.\n"
        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": instruct_info.strip()},
        ]
        # Format with the Qwen chat template: the system prompt targets Qwen,
        # and the prompt length is budgeted with the Qwen tokenizer below.
        prompt = format_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompt += cot_info
        prompt_token_len = len(format_tokenizer.tokenize(prompt))

        # Keep a small safety margin below the hard prompt-length budget.
        if prompt_token_len <= max_prompt_token_len - 10:
            records.append({
                "prompt": prompt,
                "ground_truth": reasoning_token_count,
                "data_source": name,
                "ids": f"{name}_{count}",
                "prompt_token_len": prompt_token_len,
            })
            count += 1

        if len(records) >= total_size:
            break

    # ---------- Split into train and test sets ----------
    train_records = records[:train_size]
    test_records = records[train_size:train_size + test_size]

    # ---------- Save ----------
    df_train = pd.DataFrame(train_records)
    df_test = pd.DataFrame(test_records)
    df_train.to_parquet(f"train_counting_dataset_{name}_{train_size}.parquet", index=False)
    df_test.to_parquet(f"test_counting_dataset_{name}_{test_size}.parquet", index=False)
    print("✅ Successfully generated the train and test datasets")
    print(f"Train samples: {len(df_train)}, Test samples: {len(df_test)}")