"""Merge per-dataset parquet files, sample up to `sample_size` rows per
data_source, shuffle the result, and save it to a single parquet file."""
import glob

import pandas as pd

random_seed = 42
sample_size = 15000

# 1. Find all parquet files
parquet_files = glob.glob("v2_train_counting_dataset_*.parquet")

# Exclude the OpenThoughts-114k-math file from the merge
selected_parquet_files = [
    f for f in parquet_files
    if "v2_train_counting_dataset_OpenThoughts-114k-math_88120.parquet" not in f
]
print("Found parquet files:", selected_parquet_files)

# 2. Merge all data
all_data = []
for file in selected_parquet_files:
    print(file)
    df = pd.read_parquet(file)
    all_data.append(df)
df_all = pd.concat(all_data, ignore_index=True)
print("Total rows after merging:", len(df_all))

# 3. Group by data_source and sample at most `sample_size` (15k) rows per group
sampled_dfs = []
for name, group in df_all.groupby("data_source"):
    if len(group) > sample_size:
        sampled = group.sample(n=sample_size, random_state=random_seed)
    else:
        sampled = group
    sampled_dfs.append(sampled)
    print(f"{name}: {len(group)} original rows, {len(sampled)} sampled")

# 4. Merge the sampled data
df_sampled = pd.concat(sampled_dfs, ignore_index=True)
print("Total rows after sampling:", len(df_sampled))

# Shuffle so rows from different sources are interleaved
shuffled_df = df_sampled.sample(frac=1, random_state=random_seed).reset_index(drop=True)

# 5. Save
shuffled_df.to_parquet("merged_sampled_4datasets_15k_each.parquet", index=False)
print("Saved to merged_sampled_4datasets_15k_each.parquet")