Upload folder using huggingface_hub
- check_length.py +29 -0
- get_parquet.py +267 -0
- get_parquet_single.py +167 -0
- load_check.py +25 -0
- long_prompts.json +1 -0
- merge_train.py +44 -0
- v2_test_counting_dataset_Medical-R1-Distill-Data_1000.parquet +3 -0
- v2_test_counting_dataset_OpenR1-Math-220k_1000.parquet +3 -0
- v2_test_counting_dataset_OpenThoughts-114k-Code_decontaminated_1000.parquet +3 -0
- v2_test_counting_dataset_OpenThoughts-114k-math_1000.parquet +3 -0
- v2_test_counting_dataset_reasoning-v1-20m_1000.parquet +3 -0
- v2_train_counting_dataset_Medical-R1-Distill-Data_21000.parquet +3 -0
- v2_train_counting_dataset_OpenR1-Math-220k_90258.parquet +3 -0
- v2_train_counting_dataset_OpenThoughts-114k-Code_decontaminated_15372.parquet +3 -0
- v2_train_counting_dataset_OpenThoughts-114k-math_88120.parquet +3 -0
- v2_train_counting_dataset_reasoning-v1-20m_100982.parquet +3 -0
check_length.py
ADDED
@@ -0,0 +1,29 @@
from datasets import load_dataset
from transformers import AutoTokenizer
import json

# Set up the model tokenizer (Qwen2.5-0.5B-Instruct)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# Load the parquet dataset
dataset = load_dataset("parquet", data_files="/workspace/0525_zyw/verl/counting/mk_data/counting_dataset_qwen25_max2048.parquet")

# Pick a split, e.g. the default 'train'
data = dataset["train"]

# Record the index, length, and content of samples exceeding 2048 tokens
long_items = []

for idx, example in enumerate(data):
    prompt = example.get("prompt", "")
    tokens = tokenizer(prompt, truncation=False, return_tensors="pt")
    input_len = tokens.input_ids.shape[1]

    if input_len > 2048:
        long_items.append({"index": idx, "length": input_len, "prompt": prompt})

print(f"Found {len(long_items)} items with more than 2048 tokens.")

# Optional: save the results to a JSON file
with open("long_prompts.json", "w", encoding="utf-8") as f:
    json.dump(long_items, f, ensure_ascii=False, indent=2)
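Note: iterating sample-by-sample with return_tensors="pt" is slow on large files. A minimal batched sketch of the same length check, assuming the same tokenizer, dataset object, and "prompt" column as above:

# Sketch: batched length check. Calling the tokenizer on a list of strings
# returns one input_ids list per string when no tensors are requested.
def count_tokens(batch):
    ids = tokenizer(batch["prompt"], truncation=False)["input_ids"]
    return {"prompt_len": [len(x) for x in ids]}

lengths = data.map(count_tokens, batched=True, batch_size=256)
too_long = [i for i, n in enumerate(lengths["prompt_len"]) if n > 2048]
print(f"Found {len(too_long)} items with more than 2048 tokens.")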
get_parquet.py
ADDED
@@ -0,0 +1,267 @@
import random
from datasets import load_dataset
from transformers import AutoTokenizer
import re
from tqdm import tqdm
import pandas as pd
import json
import concurrent.futures

# ---------- Helper Functions (Unchanged) ----------
def extract_think_and_rest(text):
    """Extract the <think>...</think> blocks and the remaining text."""
    think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    last_think_end = 0
    for match in re.finditer(r"</think>", text):
        last_think_end = match.end()
    rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
    return think_blocks, rest_text

def extract_think_sections(text: str):
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        end_pos = think_match.end()
        post_think_content = text[end_pos:].strip()
        if not think_content:  # i.e. an empty <think></think>
            raise ValueError("Empty think block.")
        return [think_content], post_think_content
    else:
        raise ValueError("Missing <think> block.")

def extract_think_and_solution_V2(text: str):
    pattern = (
        r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
        r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        think_content = match.group(1).strip()
        post_think_content = match.group(2).strip()
        if not think_content:
            raise ValueError("Empty thought block in V2.")
        return [think_content], post_think_content
    else:
        raise ValueError("Missing required <|begin_of_thought|> or <|begin_of_solution|> blocks.")

# ---------- Worker function for multithreading (Unchanged) ----------
def process_single_item(args):
    item, dataset_name, tokenizer, format_tokenizer, max_prompt_token_len_config = args
    try:
        if dataset_name == "OpenR1-Math-220k":
            problem = item["problem"].strip()
            response_full = item["generations"][0].strip()
            reasoning_blocks, answer = extract_think_sections(response_full)
        elif dataset_name == "OpenThoughts-114k-math":
            problem = item["problem"].strip()
            response_full = item["conversations"][1]["value"].strip()
            reasoning_blocks, answer = extract_think_and_solution_V2(response_full)
        elif dataset_name == "reasoning-v1-20m":
            problem = item.get("prompt", "").strip()
            response_full = item.get("response", "").strip()
            reasoning_blocks, answer = extract_think_sections(response_full)
        elif dataset_name == "OpenThoughts-114k-Code_decontaminated":
            problem = item["problem"].strip()
            reasoning_blocks = [item["deepseek_reasoning"]]
            answer = item["deepseek_solution"]
        elif dataset_name == "Medical-R1-Distill-Data":
            problem = item["question"].strip()
            reasoning_blocks = [item["reasoning (reasoning_content)"]]
            answer = item["response (content)"]
        else:
            return None

        if not reasoning_blocks or not reasoning_blocks[0]:
            return None

        reasoning = reasoning_blocks[0].strip()
        solution = answer.strip()

        input_token_count = len(tokenizer.tokenize(problem))
        output_token_count = len(tokenizer.tokenize(solution))
        reasoning_token_count = len(tokenizer.tokenize(reasoning))

        instruct_info = (
            "Given a <Problem> and its corresponding <Solution>, your task is to predict how many tokens are consumed in the process of arriving at the final <Solution> to the problem. Generally speaking, the more complex the problem is, the more tokens are required.\n"
            f"<Problem>\n{problem}\n</Problem>\n\n"
            f"<Solution>\n{solution}\n</Solution>\n"
            f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
            "Please provide a detailed chain-of-thought reasoning process and include your thought process within <think> tags. "
            "Your final answer should be enclosed within <answer> tags.\n\n"
            "Please return the predicted number of tokens in JSON format: \n```json\n{\"count\": int}\n```\n\n"
            "Example format:\n"
            "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited by 1024 tokens] </think>\n"
            "<answer> Summary of the thought process leading to the final token count and your predicted token count in json format: \n```json\n{\"count\": int}\n```\n [Limited by 512 tokens]\n"
            "</answer>\n\n"
            "Let me solve this step by step.\n"
        )

        cot_info = "<think>"

        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": instruct_info.strip()},
        ]

        prompt = format_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompt += cot_info

        prompt_token_len = len(format_tokenizer.tokenize(prompt))

        if prompt_token_len <= max_prompt_token_len_config - 10:
            return {
                "prompt": prompt,
                "ground_truth": reasoning_token_count,
                "data_source": dataset_name,
                "prompt_token_len": prompt_token_len,
            }
        return None
    except ValueError:
        return None
    except Exception as e:
        print(f"Error processing item for {dataset_name}: {e}")
        return None

# ---------- Parameter settings (unchanged in purpose, but test_size's role in splitting has changed) ----------
# train_size = 100000  # This previously influenced num_items_to_sample_raw
test_size = 1000  # This previously influenced num_items_to_sample_raw; the actual test split is now fixed
max_prompt_token_len = 4096
random_seed = 42  # This is the fixed random seed
NUM_THREADS = 16

# Set the fixed random seed for Python's `random` module.
# This affects `random.sample`, which is used for the train/test split.
random.seed(random_seed)

# ---------- Load and shuffle the datasets (logic largely unchanged; the seed is used by dataset.shuffle) ----------
datasets_config = {
    # "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
    # "reasoning-v1-20m": "/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data",
    # "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
    "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
    # "Medical-R1-Distill-Data": "/workspace/0407_nips/data_preprocess/Medical-R1-Distill-Data"
}

for name, path in datasets_config.items():
    print(f"Processing dataset: {name}")
    try:
        dataset_hf = load_dataset(path, trust_remote_code=True)["train"]
    except Exception as e:
        print(f"Error loading dataset {name} from {path}: {e}")
        continue
    print(len(dataset_hf))

    # The earlier per-dataset subsampling (num_items_to_sample_raw = train_size + test_size + 1000,
    # followed by .select(range(actual_num_to_sample))) is commented out;
    # the full dataset is shuffled with the fixed seed and processed.
    dataset_selected = dataset_hf.shuffle(seed=random_seed)

    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
    format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    processed_item_data_list = []
    tasks_args_list = []

    print(f"Preparing tasks for {name}...")
    for item in dataset_selected:
        tasks_args_list.append((item, name, tokenizer, format_tokenizer, max_prompt_token_len))

    print(f"Submitting {len(tasks_args_list)} tasks to thread pool for {name}...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        future_to_item_args = {executor.submit(process_single_item, args): args for args in tasks_args_list}

        for future in tqdm(concurrent.futures.as_completed(future_to_item_args), total=len(tasks_args_list), desc=f"Processing items for {name}"):
            try:
                result = future.result()
                if result:
                    processed_item_data_list.append(result)
            except Exception as exc:
                print(f'Item generated an exception during future.result(): {exc}')

    records = []
    count = 0
    print(f"Collected {len(processed_item_data_list)} valid processed items for {name}. Assigning IDs...")
    # The earlier cap on total records (num_items_to_sample_raw) is commented out;
    # every valid processed item gets an ID.
    for item_data in processed_item_data_list:
        item_data["ids"] = f"{name}_{count}"
        records.append(item_data)
        count += 1

    if not records:
        print(f"No valid records generated for dataset {name} after filtering and ID assignment. Skipping saving.")
        continue

    # ---------- MODIFIED: split into train and test sets ----------
    # The test set is fixed at 1000 records (or fewer if not enough data);
    # the training set is everything else.
    # random_seed is already set globally for the `random` module.
    target_test_set_size = 1000
    num_available_records = len(records)

    train_records = []
    test_records = []

    if num_available_records == 0:
        print(f"No records available for splitting for {name}.")
    elif num_available_records <= target_test_set_size:
        # With 1000 or fewer records, everything goes to the test set and train stays empty.
        print(f"Warning: Only {num_available_records} records available for {name}. All will be used for the test set.")
        test_records = list(records)  # make a copy
        train_records = []
    else:
        # More than 1000 records: sample 1000 indices for the test set.
        # `random.sample` uses the seed set by `random.seed(random_seed)`.
        test_indices = sorted(random.sample(range(num_available_records), target_test_set_size))

        current_test_idx_ptr = 0
        for i in range(num_available_records):
            if current_test_idx_ptr < len(test_indices) and i == test_indices[current_test_idx_ptr]:
                test_records.append(records[i])
                current_test_idx_ptr += 1
            else:
                train_records.append(records[i])

        # Sanity checks
        if len(test_records) != target_test_set_size:
            print(f"Error: Test set size mismatch. Expected {target_test_set_size}, got {len(test_records)}")
        if len(train_records) != num_available_records - target_test_set_size:
            print(f"Error: Train set size mismatch. Expected {num_available_records - target_test_set_size}, got {len(train_records)}")

    # ---------- Save ----------
    if train_records:
        df_train = pd.DataFrame(train_records)
        df_train.to_parquet(f"v2_train_counting_dataset_{name}_{len(df_train)}.parquet", index=False)
    else:
        print(f"No training records to save for {name}.")

    if test_records:
        df_test = pd.DataFrame(test_records)
        df_test.to_parquet(f"v2_test_counting_dataset_{name}_{len(df_test)}.parquet", index=False)
    else:
        print(f"No test records to save for {name}.")

    print(f"✅ Successfully processed dataset {name}")
    print(f"   Saved Train samples: {len(train_records)}, Test samples: {len(test_records)}")
    print("-" * 30)

print("All datasets processed.")
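Note: the fixed-seed split above walks a sorted index list with a pointer; a set-membership partition gives the same result and is easier to verify in isolation. A minimal standalone sketch with hypothetical toy records:

import random

random.seed(42)
records = [{"ids": f"toy_{i}"} for i in range(10)]
target_test_set_size = 3

# random.sample with a fixed seed makes the chosen test indices reproducible
test_idx = set(random.sample(range(len(records)), target_test_set_size))
test_records = [r for i, r in enumerate(records) if i in test_idx]
train_records = [r for i, r in enumerate(records) if i not in test_idx]
print(len(train_records), len(test_records))  # 7 3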
get_parquet_single.py
ADDED
@@ -0,0 +1,167 @@
import random
from datasets import load_dataset
from transformers import AutoTokenizer
import re
from tqdm import tqdm
import pandas as pd
import json

def extract_think_and_rest(text):
    """Extract the <think>...</think> blocks and the remaining text."""
    think_blocks = re.findall(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    last_think_end = 0
    for match in re.finditer(r"</think>", text):
        last_think_end = match.end()
    rest_text = text[last_think_end:].strip() if last_think_end else text.strip()
    return think_blocks, rest_text

def extract_think_sections(text: str):
    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_match:
        think_content = think_match.group(1).strip()
        post_think_content = text[think_match.end():].strip()
        # Wrap the think content in a list so callers can index block 0
        # (indexing a bare string would silently yield its first character).
        return [think_content], post_think_content
    # No <think> block: signal failure so the processing loop can skip the sample.
    return None, text.strip()


def extract_think_and_solution_V2(text: str):
    pattern = (
        r"<\|begin_of_thought\|>(.*?)<\|end_of_thought\|>\s*"
        r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>"
    )
    match = re.search(pattern, text, re.DOTALL)
    if match:
        think_content = match.group(1).strip()
        post_think_content = match.group(2).strip()
        return [think_content], post_think_content
    # Missing <|begin_of_thought|>/<|begin_of_solution|> blocks: skip upstream.
    return None, text.strip()


# ---------- Parameter settings ----------
train_size = 110000  # number of training samples
test_size = 1000     # number of test samples
max_prompt_token_len = 2048
random_seed = 42
num_workers = 16

# ---------- Load and shuffle the datasets ----------
datasets_config = {
    "OpenR1-Math-220k": "/workspace/0407_nips/data_preprocess/OpenR1-Math-220k/data",
    # "reasoning-v1-20m": "/workspace/0407_nips/data_preprocess/reasoning-v1-20m/data",
    "OpenThoughts-114k-math": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-math/data",
    # "OpenThoughts-114k-Code_decontaminated": "/workspace/0407_nips/data_preprocess/OpenThoughts-114k-Code_decontaminated/data",
    # "Medical-R1-Distill-Data": "/workspace/0407_nips/data_preprocess/Medical-R1-Distill-Data"
}

for name, path in datasets_config.items():
    print(f"{name}")
    dataset = load_dataset(path)["train"]

    # Extra 1000 items as headroom for samples dropped by filtering
    total_size = train_size + test_size + 1000
    dataset = dataset.shuffle(seed=random_seed).select(range(total_size))

    # ---------- Initialize tokenizers ----------
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")
    format_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    records = []
    count = 0

    # ---------- Process samples ----------
    for item in tqdm(dataset, desc="Processing samples"):
        if len(records) >= total_size:
            break
        if name == "OpenR1-Math-220k":
            problem = item["problem"].strip()
            response_full = item["generations"][0].strip()

            reasoning_blocks, answer = extract_think_sections(response_full)
            if not reasoning_blocks:
                continue
            reasoning = reasoning_blocks[0].strip()
            solution = answer.strip()
        elif name == "OpenThoughts-114k-math":
            problem = item["problem"].strip()
            response_full = item["conversations"][1]["value"].strip()

            reasoning_blocks, answer = extract_think_and_solution_V2(response_full)
            if not reasoning_blocks:
                continue
            reasoning = reasoning_blocks[0].strip()
            solution = answer.strip()
        elif name == "reasoning-v1-20m":
            problem = item.get("prompt", "").strip()
            response_full = item.get("response", "").strip()

            reasoning_blocks, answer = extract_think_sections(response_full)
            if not reasoning_blocks:
                continue
            reasoning = reasoning_blocks[0].strip()
            solution = answer.strip()
        else:
            continue  # unhandled data source

        input_token_count = len(tokenizer.tokenize(problem))
        output_token_count = len(tokenizer.tokenize(solution))
        reasoning_token_count = len(tokenizer.tokenize(reasoning))

        instruct_info = (
            "Your task is to estimate how many tokens a reasoning model would use to solve the following Problem and Solution.\n"
            "Please return the predicted number of tokens in JSON format: ```json\n{\"count\": int}\n```\n\n"
            f"<Problem>\n{problem}\n</Problem>\n\n"
            f"<Solution>\n{solution}\n</Solution>\n"
            f"The Problem has {input_token_count} tokens, and the Solution has {output_token_count} tokens.\n\n\n"
            "Please provide a detailed chain-of-thought reasoning process and include your thought process within `<think>` tags. "
            "Your final answer should be enclosed within `<answer>` tags.\n\n"
            "Ensure that your counting logic is sound and that your explanation is token-efficient.\n\n"
            "Example format:\n"
            "<think> Step-by-step reasoning, including self-reflection and corrections if necessary. [Limited by 1024 tokens] </think>\n"
            "<answer> Summary of the thought process leading to the final token count and your predicted token count in json format: ```json\n{\"count\": int}\n``` [Limited by 512 tokens]\n"
            "</answer>\n\n"
        )

        cot_info = "Let me solve this step by step.\n"

        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": instruct_info.strip()},
        ]

        # Build the prompt with the same Qwen chat tokenizer that measures its length below
        prompt = format_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompt += cot_info

        prompt_token_len = len(format_tokenizer.tokenize(prompt))

        if prompt_token_len <= max_prompt_token_len - 10:
            records.append({
                "prompt": prompt,
                "ground_truth": reasoning_token_count,
                "data_source": name,
                "ids": f"{name}_{count}",
                "prompt_token_len": prompt_token_len,
            })
            count += 1

        if len(records) >= total_size:
            break

    # ---------- Split into train and test sets ----------
    train_records = records[:train_size]
    test_records = records[train_size:train_size + test_size]

    # ---------- Save ----------
    df_train = pd.DataFrame(train_records)
    df_test = pd.DataFrame(test_records)

    df_train.to_parquet(f"train_counting_dataset_{name}_{train_size}.parquet", index=False)
    df_test.to_parquet(f"test_counting_dataset_{name}_{test_size}.parquet", index=False)

    print("✅ Train and test datasets generated successfully")
    print(f"Train samples: {len(df_train)}, Test samples: {len(df_test)}")
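Note: a quick sanity check of the extraction helpers on synthetic strings (hypothetical inputs, not drawn from the datasets), assuming the list-returning signatures above:

blocks, rest = extract_think_sections("<think>add up the steps</think>The answer is 42.")
print(blocks[0])  # add up the steps
print(rest)       # The answer is 42.

blocks, rest = extract_think_sections("no tags here")
print(blocks)     # None -> the processing loop skips such samples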
load_check.py
ADDED
@@ -0,0 +1,25 @@
import pandas as pd

# Show all columns without omission
pd.set_option('display.max_columns', None)

# Show all rows (no effect on head(), but useful when printing a whole df)
pd.set_option('display.max_rows', None)

# Do not truncate column contents; show full strings
pd.set_option('display.max_colwidth', None)

# Set a large display width to prevent line wrapping
pd.set_option('display.width', 1000)
# Read the parquet file (the engine, e.g. pyarrow or fastparquet, is auto-detected)
df = pd.read_parquet("/workspace/0525_zyw/verl/counting/mk_data/v2_train_counting_dataset_OpenR1-Math-220k_90258.parquet", engine="auto")

print(df.shape)
# print(df.columns)
# # Inspect the first rows
# print(df.loc[0])  # or df.head(10) for the first 10 rows
# for i, row in df.iterrows():
#     print(f"Row {i}:\n{row['prompt']}\n---")
#     if i > 1:  # only print the first few rows
#         break
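Note: beyond df.shape, per-source counts and prompt-length statistics are a quick way to validate a file; a minimal sketch, assuming the data_source and prompt_token_len columns written by get_parquet.py:

print(df["data_source"].value_counts())   # records per source dataset
print(df["prompt_token_len"].describe())  # min/median/max prompt lengths
print(df.loc[0, "prompt"][:500])          # first 500 characters of one prompt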
long_prompts.json
ADDED
@@ -0,0 +1 @@
[]
merge_train.py
ADDED
@@ -0,0 +1,44 @@
import pandas as pd
import glob
import random

random_seed = 42
sample_size = 15000

# 1. Find all train parquet files
parquet_files = glob.glob("v2_train_counting_dataset_*.parquet")

selected_parquet_files = []
for parquet_file in parquet_files:
    if "v2_train_counting_dataset_OpenThoughts-114k-math_88120.parquet" not in parquet_file:
        selected_parquet_files.append(parquet_file)
print("Parquet files found:", selected_parquet_files)

# 2. Merge all data
all_data = []
for file in selected_parquet_files:
    print(file)
    df = pd.read_parquet(file)
    all_data.append(df)
df_all = pd.concat(all_data, ignore_index=True)
print("Total records after merging:", len(df_all))

# 3. Group by data_source and sample up to 15k per group
sampled_dfs = []
for name, group in df_all.groupby("data_source"):
    if len(group) > sample_size:
        sampled = group.sample(n=sample_size, random_state=random_seed)
    else:
        sampled = group
    sampled_dfs.append(sampled)
    print(f"{name}: {len(group)} original records, {len(sampled)} sampled")

# 4. Concatenate the sampled data
df_sampled = pd.concat(sampled_dfs, ignore_index=True)
print("Total records after sampling:", len(df_sampled))

shuffled_df = df_sampled.sample(frac=1, random_state=random_seed).reset_index(drop=True)

# 5. Save
shuffled_df.to_parquet("merged_sampled_4datasets_15k_each.parquet", index=False)
print("Saved to merged_sampled_4datasets_15k_each.parquet")
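Note: the merged file can be re-read to confirm the per-source caps; a minimal check, assuming the data_source column is present:

check = pd.read_parquet("merged_sampled_4datasets_15k_each.parquet")
print(len(check))                           # total records after sampling
print(check.groupby("data_source").size())  # should be at most 15000 per source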
v2_test_counting_dataset_Medical-R1-Distill-Data_1000.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:89bc5389f4e66b52170d2e0c9922171cb7f63a679e4520ab99209bafda362819
size 1079992
v2_test_counting_dataset_OpenR1-Math-220k_1000.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2454554feb69d07e233ef4a3f40f76d9bf537ed052e8bbc7416c420b4dc92b3d
size 952195
v2_test_counting_dataset_OpenThoughts-114k-Code_decontaminated_1000.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:beda857c76c38d5a06916b487bafb2cffb76cb6fc31b6fe89698a6f54f8bcf2d
size 2202832
v2_test_counting_dataset_OpenThoughts-114k-math_1000.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:90069eb9de435395355312e63597e014d3801b293e9bf5e2d9263ced6a67d19c
size 998292
v2_test_counting_dataset_reasoning-v1-20m_1000.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6b98f56c5005e65956ac406021f6e9a98744da8aaec748ddf4e7086970e3cdc2
size 1665071
v2_train_counting_dataset_Medical-R1-Distill-Data_21000.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c1c02b93013d57b2dd8c874a13a86e2804b9e25931903945dd93bf0606312fb
size 21995473
v2_train_counting_dataset_OpenR1-Math-220k_90258.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b940e63fcaed0d5eff15e40123e63b1dad99ccf9e37071c7d8a308e760e90db9
size 84122607
v2_train_counting_dataset_OpenThoughts-114k-Code_decontaminated_15372.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:18a52d5aa00a9140ca4c0c20a03f8ec7edac9ffc62eb02036ecef0c0c7e12eb1
size 33672180
v2_train_counting_dataset_OpenThoughts-114k-math_88120.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5a1b7889afc082e16c4e70524a67f99784b825e7a799cb9314ba1fae32688a2b
size 85576595
v2_train_counting_dataset_reasoning-v1-20m_100982.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7808bc50c3f2edf5d4d80aac04cdfb13820d1b5f65d85e17ae6f00d680ac4466
size 166027524