"""Data processing used for analyzing and presenting the results""" |

import json
import os

import pandas as pd


def preprocess(rows, task_name: str, print_fn=print):
    """Flatten the raw result rows into one flat dict per successful run."""
    results = []
    skipped = 0
    for row in rows:
        run_info = row["run_info"]
        train_info = row["train_info"]
        meta_info = row["meta_info"]
        if run_info["peft_config"]:
            peft_type = run_info["peft_config"]["peft_type"]
        else:
            peft_type = "full-finetuning"
        if train_info["status"] != "success":
            skipped += 1
            continue

        # the last entry of the metrics list contains the final scores of the run
        train_metrics = train_info["metrics"][-1]

        dct = {
            "task_name": task_name,
            "experiment_name": run_info["experiment_name"],
            "model_id": run_info["train_config"]["model_id"],
            "train_config": run_info["train_config"],
            "peft_type": peft_type,
            "peft_config": run_info["peft_config"],
            "cuda_memory_reserved_avg": train_info["cuda_memory_reserved_avg"],
            "cuda_memory_max": train_info["cuda_memory_max"],
            "cuda_memory_reserved_99th": train_info["cuda_memory_reserved_99th"],
            "total_time": run_info["total_time"],
            "train_time": train_info["train_time"],
            "file_size": train_info["file_size"],
            "test_accuracy": train_metrics["test accuracy"],
            "train_loss": train_metrics["train loss"],
            "train_samples": train_metrics["train samples"],
            "train_total_tokens": train_metrics["train total tokens"],
            "peft_version": meta_info["package_info"]["peft-version"],
            "peft_branch": run_info["peft_branch"],
            "transformers_version": meta_info["package_info"]["transformers-version"],
            "datasets_version": meta_info["package_info"]["datasets-version"],
            "torch_version": meta_info["package_info"]["torch-version"],
            "bitsandbytes_version": meta_info["package_info"]["bitsandbytes-version"],
            "package_info": meta_info["package_info"],
            "system_info": meta_info["system_info"],
            "created_at": run_info["created_at"],
        }
        results.append(dct)

    if skipped:
        print_fn(f"Skipped {skipped} of {len(rows)} entries because the train status != success")

    return results
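

# A minimal sketch of the raw row layout that preprocess() expects. Only the
# keys mirror what preprocess() actually accesses; all concrete values below
# are made up for illustration and are not real benchmark results.
def _example_row():
    return {
        "run_info": {
            "experiment_name": "lora-example",  # hypothetical name
            "peft_branch": "main",
            "peft_config": {"peft_type": "LORA"},  # falsy value means full-finetuning
            "train_config": {"model_id": "some-org/some-model"},  # hypothetical id
            "total_time": 130.0,
            "created_at": "2024-01-01T00:00:00+00:00",
        },
        "train_info": {
            "status": "success",  # anything else makes preprocess() skip the row
            "train_time": 120.0,
            "file_size": 1_000_000,
            "cuda_memory_reserved_avg": 1_000_000_000,
            "cuda_memory_max": 2_000_000_000,
            "cuda_memory_reserved_99th": 1_500_000_000,
            # preprocess() only reads the last entry of this list
            "metrics": [
                {
                    "test accuracy": 0.5,
                    "train loss": 1.0,
                    "train samples": 1000,
                    "train total tokens": 100_000,
                }
            ],
        },
        "meta_info": {
            "package_info": {
                "peft-version": "0.0.0",
                "transformers-version": "0.0.0",
                "datasets-version": "0.0.0",
                "torch-version": "0.0.0",
                "bitsandbytes-version": "0.0.0",
            },
            "system_info": {},
        },
    }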


def load_jsons(path):
    """Load all result JSON files from the given directory into a list of dicts."""
    results = []
    for fn in os.listdir(path):
        if fn.endswith(".json"):
            with open(os.path.join(path, fn)) as f:
                row = json.load(f)
                results.append(row)
    return results
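

# Usage sketch ("results/" is a hypothetical directory, not one shipped with
# this module): rows = load_jsons("results/"). Note that os.listdir() yields
# the files in arbitrary order; load_df() below sorts by "created_at" anyway.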


def load_df(path, task_name, print_fn=print):
    """Load the result JSONs under `path` into a cleaned-up, deduplicated DataFrame."""
    jsons = load_jsons(path)
    preprocessed = preprocess(jsons, task_name=task_name, print_fn=print_fn)
    dtype_dict = {
        "task_name": "string",
        "experiment_name": "string",
        "model_id": "string",
        "train_config": "string",
        "peft_type": "string",
        "peft_config": "string",
        "cuda_memory_reserved_avg": int,
        "cuda_memory_max": int,
        "cuda_memory_reserved_99th": int,
        "total_time": float,
        "train_time": float,
        "file_size": int,
        "test_accuracy": float,
        "train_loss": float,
        "train_samples": int,
        "train_total_tokens": int,
        "peft_version": "string",
        "peft_branch": "string",
        "transformers_version": "string",
        "datasets_version": "string",
        "torch_version": "string",
        "bitsandbytes_version": "string",
        "package_info": "string",
        "system_info": "string",
        "created_at": "string",
    }
    df = pd.DataFrame(preprocessed)
    df = df.astype(dtype_dict)
    df["created_at"] = pd.to_datetime(df["created_at"])

    # round the run times to full seconds for nicer display
    df["train_time"] = df["train_time"].round().astype(int)
    df["total_time"] = df["total_time"].round().astype(int)

    # move the most relevant columns to the front
    important_columns = [
        "experiment_name",
        "peft_type",
        "total_time",
        "train_time",
        "test_accuracy",
        "train_loss",
        "cuda_memory_max",
        "cuda_memory_reserved_99th",
        "cuda_memory_reserved_avg",
        "file_size",
        "created_at",
        "task_name",
    ]
    other_columns = [col for col in df.columns if col not in important_columns]
    df = df[important_columns + other_columns]

    # drop duplicates (same experiment, model, PEFT type, and creation time),
    # keeping the most recent entry, and report how many rows were removed
    size_before_drop_dups = len(df)
    columns = ["experiment_name", "model_id", "peft_type", "created_at"]
    df = df.sort_values("created_at").drop_duplicates(columns, keep="last")
    if len(df) < size_before_drop_dups:
        print_fn(f"Dropped {size_before_drop_dups - len(df)} duplicate entries")
    return df
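

if __name__ == "__main__":
    # Usage sketch: "results/" and the task name are placeholders (nothing in
    # this module creates them); point load_df at a directory of result JSONs.
    df = load_df("results/", task_name="example-task")
    print(df.head())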