In [1]:
import pandas as pd
from matplotlib.figure import Figure

# Load the per-checkpoint evaluation results (one row per run / seed / step)
# and show the frame as the cell's output.
df = pd.read_csv(filepath_or_buffer="../src_data/all-filters-big-runs.csv")
df

Unnamed: 0,runname,seed,steps,agg_score,commonsense_qa/acc,commonsense_qa/acc_norm,hellaswag/acc,hellaswag/acc_norm,openbookqa/acc,openbookqa/acc_norm,...,siqa/acc,siqa/acc_norm,winogrande/acc,winogrande/acc_norm,sciq/acc,sciq/acc_norm,arc/acc,arc/acc_norm,mmlu/acc,mmlu/acc_norm
0,big-run-sampled-fineweb-c4-filters,6,0,0.330893,0.186,0.233,0.272,0.258,0.166,0.286,...,0.367,0.362,0.516,0.497,0.208,0.202,0.2195,0.2510,0.230294,0.250147
1,big-run-sampled-fineweb-c4-filters,6,1000,0.359303,0.250,0.263,0.293,0.285,0.140,0.276,...,0.376,0.401,0.497,0.479,0.594,0.524,0.2740,0.2985,0.241617,0.251920
2,big-run-sampled-fineweb-c4-filters,6,2000,0.375393,0.268,0.277,0.319,0.324,0.150,0.274,...,0.372,0.411,0.507,0.484,0.688,0.606,0.3015,0.3270,0.246577,0.259146
3,big-run-sampled-fineweb-c4-filters,6,3000,0.389655,0.303,0.305,0.324,0.358,0.152,0.280,...,0.383,0.389,0.520,0.506,0.741,0.647,0.3395,0.3405,0.255001,0.268740
4,big-run-sampled-fineweb-c4-filters,6,4000,0.401195,0.309,0.310,0.353,0.393,0.138,0.288,...,0.378,0.402,0.534,0.511,0.766,0.652,0.3395,0.3495,0.256203,0.269056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,big-run-sampled_full_filtered_no_dedup,6,163000,0.466255,0.426,0.372,0.469,0.555,0.242,0.354,...,0.389,0.394,0.563,0.544,0.869,0.808,0.4460,0.4435,0.297125,0.317543
668,big-run-sampled_full_filtered_no_dedup,6,164000,0.469743,0.431,0.376,0.467,0.556,0.232,0.356,...,0.391,0.397,0.568,0.552,0.861,0.800,0.4450,0.4515,0.302706,0.318447
669,big-run-sampled_full_filtered_no_dedup,6,165000,0.469847,0.426,0.375,0.472,0.549,0.234,0.364,...,0.389,0.401,0.562,0.548,0.867,0.795,0.4435,0.4475,0.297586,0.319279
670,big-run-sampled_full_filtered_no_dedup,6,166000,0.467651,0.423,0.365,0.470,0.555,0.226,0.356,...,0.392,0.399,0.564,0.545,0.872,0.812,0.4365,0.4475,0.297256,0.319704


In [2]:
# Distinct run names, in order of first appearance in the CSV.
df["runname"].unique().tolist()

['big-run-sampled-fineweb-c4-filters',
 'big-run-sampled_full_ind_minhash',
 'big-run-fineweb-v1-all-dumps',
 'big-run-sampled_full_filtered_no_dedup']

In [3]:
# Maps each raw run name found in the CSV to the human-readable label that is
# written into the exported plot data. Commented-out pairs are runs that are
# currently not labelled.
runs_mapping = dict(
    [
        # ("big-run-refinedweb", "RefinedWeb"),
        # ("big-run-c4", "C4"),
        ("big-run-sampled_full_filtered_no_dedup", "FineWeb: base filtering only"),
        ("big-run-sampled_full_ind_minhash", "FineWeb: independent MinHash (id mh)"),
        ("big-run-sampled-fineweb-c4-filters", "FineWeb: id mh + C4 filters"),
        ("big-run-fineweb-v1-all-dumps", "FineWeb: id mh + C4 + custom filters"),
    ]
)

In [6]:
from matplotlib import pyplot as plt
import os
import json

# Columns that get exported, one JSON file per entry: the aggregate score
# first, then the normalised-accuracy column of each selected benchmark.
# Note: the CSV also has sciq/acc_norm, which is not part of this list.
metrics = ["agg_score"]
metrics.extend(
    [
        "commonsense_qa/acc_norm",
        "hellaswag/acc_norm",
        "openbookqa/acc_norm",
        "piqa/acc_norm",
        "siqa/acc_norm",
        "winogrande/acc_norm",
        "arc/acc_norm",
        "mmlu/acc_norm",
    ]
)

def normalize_runname(runname):
    """Return ``runname`` with every "/" replaced by "_".

    Despite the parameter name, this notebook applies it to metric names
    (e.g. "arc/acc_norm") so they can be used as JSON file names.
    """
    return "_".join(runname.split("/"))

# Average every exported metric over all rows sharing the same (run, step)
# pair, then turn the group keys back into ordinary columns.
grouped = df.groupby(["runname", "steps"])[metrics].mean().reset_index()

# Export one JSON file per metric (plus an index.json manifest) into the
# `file_id` folder. Each metric file holds, for every run, the x/y series and
# the display label taken from `runs_mapping`.
file_id="../assets/data/plots/all_filtering_steps"

# Create the output folder once, before any file is written. Doing it up front
# (instead of on every loop iteration) also guarantees the folder exists for
# index.json even if `metrics` is empty.
os.makedirs(file_id, exist_ok=True)

files = {}
for metric in metrics:
    datas = {}
    for name, group in grouped.groupby("runname"):
        # Keep only this metric, ordered by training step, indexed by step.
        group = group[["steps", metric]].sort_values(by="steps")
        group = group.set_index("steps")
        # No smoothing is applied here; uncomment the rolling mean to smooth.
        rolling_avg = group
        # rolling_avg = group.rolling(window=5).mean()
        datas[name] = {
            # NOTE(review): presumably 2048 * 1024 tokens per step, scaled by
            # 1e-9 to billions of tokens - confirm against the training config.
            "x": (rolling_avg.index * 2048 * 1024 * 1e-9).tolist(),
            "y": rolling_avg[metric].tolist(),
            # Raises KeyError for any run that has no entry in runs_mapping.
            "label": runs_mapping[name],
        }
    # Order the runs by their final value of this metric, best (highest) first.
    # The negated key (rather than reverse=True) keeps tied runs in their
    # original relative order.
    datas = dict(sorted(datas.items(), key=lambda x: -x[1]["y"][-1]))
    with open(f"{file_id}/{normalize_runname(metric)}.json", "w") as f:
        json.dump({
            "data": datas,
            "layout": {
                "title": {
                    "text": "The different FineWeb processing steps"
                },
            }
        }, f)
    files[metric] = {"file": f"{normalize_runname(metric)}.json"}

# Write the manifest that lists every per-metric file together with the
# viewer settings (default metric and slider range).
# NOTE(review): the slider presumably controls client-side smoothing - confirm.
with open(f"{file_id}/index.json", "w") as f:
    json.dump({
        "files": files,
        "settings": {
            "defaultMetric": "agg_score",
            "slider":{"min":0,"max":30,"default":5}
        }
    }, f)
    