# Copyright 2025-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data processing used for analyzing and presenting the results"""
import json
import os
import pandas as pd
def preprocess(rows, task_name: str, print_fn=print):
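    """Flatten the raw result rows into dicts of the most relevant fields, skipping runs that did not succeed."""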
results = []
skipped = 0
for row in rows:
run_info = row["run_info"]
train_info = row["train_info"]
meta_info = row["meta_info"]
if run_info["peft_config"]:
peft_type = run_info["peft_config"]["peft_type"]
else:
peft_type = "full-finetuning"
if train_info["status"] != "success":
skipped += 1
continue
        # the last metrics entry corresponds to the final state of the training run
        train_metrics = train_info["metrics"][-1]
        # extract the fields that make the most sense for comparison
dct = {
"task_name": task_name,
"experiment_name": run_info["experiment_name"],
"model_id": run_info["train_config"]["model_id"],
"train_config": run_info["train_config"],
"peft_type": peft_type,
"peft_config": run_info["peft_config"],
"cuda_memory_reserved_avg": train_info["cuda_memory_reserved_avg"],
"cuda_memory_max": train_info["cuda_memory_max"],
"cuda_memory_reserved_99th": train_info["cuda_memory_reserved_99th"],
"total_time": run_info["total_time"],
"train_time": train_info["train_time"],
"file_size": train_info["file_size"],
"test_accuracy": train_metrics["test accuracy"],
"train_loss": train_metrics["train loss"],
"train_samples": train_metrics["train samples"],
"train_total_tokens": train_metrics["train total tokens"],
"peft_version": meta_info["package_info"]["peft-version"],
"peft_branch": run_info["peft_branch"],
"transformers_version": meta_info["package_info"]["transformers-version"],
"datasets_version": meta_info["package_info"]["datasets-version"],
"torch_version": meta_info["package_info"]["torch-version"],
"bitsandbytes_version": meta_info["package_info"]["bitsandbytes-version"],
"package_info": meta_info["package_info"],
"system_info": meta_info["system_info"],
"created_at": run_info["created_at"],
}
results.append(dct)
    if skipped:
        print_fn(f"Skipped {skipped} of {len(rows)} entries because their train status was not 'success'")
    return results


def load_jsons(path):
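    """Load all JSON result files from the given directory into a list of dicts."""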
results = []
for fn in os.listdir(path):
if fn.endswith(".json"):
with open(os.path.join(path, fn)) as f:
row = json.load(f)
results.append(row)
    return results


def load_df(path, task_name, print_fn=print):
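    """Load the JSON results from the given directory and return them as a cleaned-up pandas DataFrame."""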
jsons = load_jsons(path)
preprocessed = preprocess(jsons, task_name=task_name, print_fn=print_fn)
dtype_dict = {
"task_name": "string",
"experiment_name": "string",
"model_id": "string",
"train_config": "string",
"peft_type": "string",
"peft_config": "string",
"cuda_memory_reserved_avg": int,
"cuda_memory_max": int,
"cuda_memory_reserved_99th": int,
"total_time": float,
"train_time": float,
"file_size": int,
"test_accuracy": float,
"train_loss": float,
"train_samples": int,
"train_total_tokens": int,
"peft_version": "string",
"peft_branch": "string",
"transformers_version": "string",
"datasets_version": "string",
"torch_version": "string",
"bitsandbytes_version": "string",
"package_info": "string",
"system_info": "string",
"created_at": "string",
}
df = pd.DataFrame(preprocessed)
df = df.astype(dtype_dict)
df["created_at"] = pd.to_datetime(df["created_at"])
    # round the recorded times to the nearest second
    df["train_time"] = df["train_time"].round().astype(int)
    df["total_time"] = df["total_time"].round().astype(int)
    # reorder columns for better viewing; the pinned_columns arg in Gradio does not seem to work correctly
important_columns = [
"experiment_name",
"peft_type",
"total_time",
"train_time",
"test_accuracy",
"train_loss",
"cuda_memory_max",
"cuda_memory_reserved_99th",
"cuda_memory_reserved_avg",
"file_size",
"created_at",
"task_name",
]
other_columns = [col for col in df if col not in important_columns]
df = df[important_columns + other_columns]
    # we want to keep only the most recent run for each experiment
    size_before_drop_dups = len(df)
    dedup_columns = ["experiment_name", "model_id", "peft_type"]
    df = df.sort_values("created_at").drop_duplicates(dedup_columns, keep="last")
    if len(df) < size_before_drop_dups:
        print_fn(f"Dropped {size_before_drop_dups - len(df)} older duplicate entries")
    return df
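

if __name__ == "__main__":
    # Minimal usage sketch, not part of the deployed app: the directory path and
    # task name below are placeholder assumptions, adjust them to your local setup.
    df = load_df("results/", task_name="example-task")
    print(df.head())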