Trying to compute the Thai FineTasks score of Llama 3.2 1B
Hi,

I am trying to compute the Thai FineTasks score of Llama 3.2 1B. As shown on the web leaderboard, the score is 0.14. But when I normalized the scores and computed the final result, I got 0.296401, which is quite different.
Steps
I followed the approach described in https://github.com/huggingface/lighteval/issues/509. Here is my code:
```python
import glob
import json
from collections import defaultdict

import pandas as pd
from tqdm import tqdm


def read_json(path: str):
    with open(path, "r") as f:
        r = json.load(f)
    # Flatten the "results" section into "<task>/<metric>" columns
    result = r["results"]
    result_tr = {}
    for task, item in result.items():
        if "lighteval|" in task:
            for k, v in item.items():
                new_key = f"{task}/{k}"
                result_tr[new_key] = v
    return result_tr


def get_dataframe_result(
    paths: str | list[str], separate: bool = False
) -> pd.DataFrame:
    if isinstance(paths, list):
        if separate:
            return pd.concat([pd.DataFrame([read_json(p)]) for p in paths], axis=1)
        else:
            return pd.concat([pd.DataFrame([read_json(p)]) for p in paths], axis=0)
    if isinstance(paths, str):
        return pd.DataFrame([read_json(paths)])
```
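For reference, this is what the flattening does on a fabricated results file; the task and metric keys below are only illustrative, not necessarily the exact ones lighteval writes.

```python
import json
import tempfile

# Fabricated example of a lighteval-style results file (keys are illustrative).
fake_results = {
    "results": {
        "lighteval|meta_mmlu_tha_mcf|5": {"acc_norm": 0.30, "acc_norm_stderr": 0.01},
        "all": {"acc_norm": 0.30},  # keys without "lighteval|" are skipped
    }
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(fake_results, f)
    tmp_path = f.name

print(read_json(tmp_path))
# {'lighteval|meta_mmlu_tha_mcf|5/acc_norm': 0.3,
#  'lighteval|meta_mmlu_tha_mcf|5/acc_norm_stderr': 0.01}
```

The actual loading of my Llama 3.2 results then looks like this: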
```python
# Load the per-task result files for Llama 3.2 (one JSON per task, evaluated separately)
llama3_2 = glob.glob("../results/results/meta-llama/Llama-3.2-3B/*.json")
llama3_2_df = get_dataframe_result(paths=llama3_2, separate=True).reset_index(drop=True)
llama3_2_df["model"] = "llama3_2"

# Random-baseline scores attached to https://github.com/huggingface/lighteval/issues/509
random_baseline = pd.read_csv("../lighteval_baseline.csv", index_col=None)

# `df` is the combined results dataframe (one row per model); see the notes below.
# Keep only the baseline columns whose task also appears in `df`.
filter_baseline_col = []
for col in random_baseline.columns:
    task = col.split("/")[0]
    for c in df.columns:
        if task in c:
            filter_baseline_col.append(col)

filter_baseline_col = sorted(
    set(
        [
            c
            for c in filter_baseline_col
            if "lighteval|" in c
            and ("/acc_" in c or "/f1" in c)
            and ("stderr" not in c)
        ]
    )
)
random_baseline = random_baseline.loc[:, filter_baseline_col]
random_baseline.columns = [
    c.replace("norm", "") for c in random_baseline.columns
]

# Metric columns of `df` used for rescaling
metric_cols_1 = [
    col
    for col in df.columns
    if "lighteval|" in col
    and ("/acc_" in col or "/f1" in col)
    and ("stderr" not in col)
]

# Metric columns (containing 'lighteval|') used for category grouping
metric_cols_2 = [
    col
    for col in df.columns
    if "lighteval|" in col
    and ":_average" not in col  # exclude existing averages
    and ("/acc_" in col or "/f1" in col)
    and ("stderr" not in col)
]

# Group the metric columns by category (RC / NLU / GK / ...)
category_groups = defaultdict(list)
for col in metric_cols_2:
    task_name = col.split("|")[1].split("_")[0]
    if task_name == "meta":
        task_name = "meta_mmlu"
    if task_name == "community":
        task_name = "community_hellaswag"
    category = get_task_category(task_name)
    if category:
        category_groups[category].append(col)
```
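`get_task_category` is a small helper that is not shown above; for completeness, here is a minimal sketch of what it could look like. The prefix-to-category mapping below is my own guess (RC = reading comprehension, NLU = natural language understanding, GK = general knowledge, RES = reasoning) and may not match the leaderboard's exact grouping.

```python
# Hypothetical sketch of get_task_category: a lookup from the task prefix
# extracted above to a FineTasks-style category. The mapping is an assumption.
TASK_CATEGORIES = {
    "belebele": "RC",
    "xquad": "RC",
    "thaiqa": "RC",
    "xnli2.0": "NLU",
    "community_hellaswag": "RES",
    "meta_mmlu": "GK",
    "m3exams": "GK",
}


def get_task_category(task_name: str) -> str | None:
    # Returns None for tasks that are not part of the grouping.
    return TASK_CATEGORIES.get(task_name)
```

The rescaling and aggregation then proceed as follows: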
```python
# Suffix the baseline columns so they can be looked up per metric column
random_baseline.columns = [
    f"{col}baseline" for col in random_baseline.columns
]

result_df_list = []
for model in tqdm(df["model"].unique()):
    model_df = df.loc[df["model"].eq(model)].copy().reset_index(drop=True)

    for col in metric_cols_1:
        baseline_col = f"{col}baseline"
        # Skip if the baseline is 0 or 1
        if (random_baseline[baseline_col] == 0).all() or (
            random_baseline[baseline_col] == 1
        ).all():
            continue
        # Rescale: (score - baseline) / (1 - baseline)
        model_df[col] = (model_df[col] - random_baseline[baseline_col]) / (
            1 - random_baseline[baseline_col]
        )

    # Mean of the rescaled metrics within each category
    for category, cols in category_groups.items():
        model_df[f"category_{category}"] = model_df[cols].mean(axis=1)

    # Macro aggregate: mean over the category means
    category_cols = [col for col in model_df.columns if col.startswith("category_")]
    model_df["agg_score_macro"] = model_df[category_cols].mean(axis=1)

    result_df_list.append(model_df)

final_df = pd.concat(result_df_list, axis=0, ignore_index=True).sort_values(
    by="agg_score_macro", ascending=False
)
cols = ["model", "category_RC", "category_NLU", "category_GK", "agg_score_macro"]
final_df.loc[:, cols]
```
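To make the rescaling step concrete, here is the formula on made-up numbers (purely illustrative, not my actual scores): for a 4-choice MCF task the random baseline is 0.25, so a raw accuracy of 0.40 rescales to (0.40 - 0.25) / (1 - 0.25) = 0.20, and the macro score is the mean over the per-category means of such rescaled values.

```python
# Illustrative numbers only: rescale raw scores against a random baseline,
# then macro-average over categories (this mirrors the loop above).
def rescale(score: float, baseline: float) -> float:
    return (score - baseline) / (1 - baseline)


gk = [rescale(0.40, 0.25), rescale(0.31, 0.25)]  # e.g. two 4-choice GK tasks
rc = [rescale(0.55, 0.25)]                        # e.g. one 4-choice RC task

category_means = [sum(gk) / len(gk), sum(rc) / len(rc)]
agg_score_macro = sum(category_means) / len(category_means)
print(round(agg_score_macro, 4))  # 0.27
```

Note that a raw score below the random baseline rescales to a negative value, which pulls the macro average down.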
Note that:
- `df` is a dataframe containing the results for my models (one model per row), which includes Llama 3.2 and others.
- I had to evaluate Llama 3.2 on each dataset separately because of a CUDA error like the one in https://github.com/huggingface/lighteval/issues/561. Here are the commands:
```bash
export CUDA_VISIBLE_DEVICES="0,1"
echo "Running lighteval for model: meta-llama/Llama-3.2-3B"
lighteval accelerate \
"pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True" \
"lighteval|meta_mmlu_tha_mcf|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py" \
--dataset-loading-processes 8 \
--cache-dir "./le_cache" \
--no-use-chat-template \
--override-batch-size 8

lighteval accelerate \
"pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True" \
"lighteval|m3exams_tha_mcf|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py" \
--dataset-loading-processes 8 \
--cache-dir "./le_cache" \
--no-use-chat-template \
--override-batch-size 8

lighteval accelerate \
"pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True" \
"lighteval|belebele_tha_Thai_mcf|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py" \
--dataset-loading-processes 8 \
--cache-dir "./le_cache" \
--no-use-chat-template \
--override-batch-size 8

lighteval accelerate \
"pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True" \
"lighteval|thaiqa_tha|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py" \
--dataset-loading-processes 8 \
--cache-dir "./le_cache" \
--no-use-chat-template \
--override-batch-size 2

lighteval accelerate \
"pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True" \
"lighteval|xquad_tha|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py" \
--dataset-loading-processes 8 \
--cache-dir "./le_cache" \
--no-use-chat-template \
--override-batch-size 8

lighteval accelerate \
"pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True" \
"lighteval|community_hellaswag_tha_mcf|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py" \
--dataset-loading-processes 8 \
--cache-dir "./le_cache" \
--no-use-chat-template \
--override-batch-size 8

lighteval accelerate \
"pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True" \
"lighteval|xnli2.0_tha_mcf|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py" \
--dataset-loading-processes 8 \
--cache-dir "./le_cache" \
--no-use-chat-template \
--override-batch-size 8
```
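Since these seven invocations differ only in the task spec (and the smaller batch size for `thaiqa`), a small driver script can run them one after another. This is just a convenience sketch that reuses the same flags as the commands above.

```python
# Convenience sketch: run the per-task evaluations sequentially with the same flags.
import subprocess

MODEL_ARGS = "pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True"
TASKS = [
    ("lighteval|meta_mmlu_tha_mcf|5|1", 8),
    ("lighteval|m3exams_tha_mcf|5|1", 8),
    ("lighteval|belebele_tha_Thai_mcf|5|1", 8),
    ("lighteval|thaiqa_tha|5|1", 2),  # smaller batch size for thaiqa
    ("lighteval|xquad_tha|5|1", 8),
    ("lighteval|community_hellaswag_tha_mcf|5|1", 8),
    ("lighteval|xnli2.0_tha_mcf|5|1", 8),
]

for task, batch_size in TASKS:
    subprocess.run(
        [
            "lighteval", "accelerate",
            MODEL_ARGS,
            task,
            "--custom-tasks", "src/lighteval/tasks/multilingual/tasks.py",
            "--dataset-loading-processes", "8",
            "--cache-dir", "./le_cache",
            "--no-use-chat-template",
            "--override-batch-size", str(batch_size),
        ],
        check=True,
    )
```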
- As I stated in https://github.com/huggingface/lighteval/issues/509, the rescaling function from that issue is:
```python
def rescale_scores(
    df: pd.DataFrame, baseline_runs: list[str], metric_columns: list[str]
) -> pd.DataFrame:
    """
    Rescales scores relative to a baseline performance.
    """
    df = df.copy()
    # Calculate mean baseline performance
    baseline_mask = df["runname"].isin(baseline_runs)
    baseline = df[baseline_mask].groupby("steps")[metric_columns].mean()
    # Reindex to match all steps and interpolate missing values
    baseline = baseline.reindex(df["steps"].unique()).interpolate()
    # Merge baseline scores with main df
    df_with_baseline = df.merge(
        baseline.reset_index(), on=["steps"], how="left", suffixes=("", "_baseline")
    ).fillna(0)
    # Rescale each metric column
    for col in metric_columns:
        baseline_col = f"{col}_baseline"
        # Skip if baseline is 0 or 1
        if (df_with_baseline[baseline_col] == 0).all() or (
            df_with_baseline[baseline_col] == 1
        ).all():
            continue
        # Rescale: (score - baseline) / (1 - baseline)
        df[col] = (df[col] - df_with_baseline[baseline_col]) / (
            1 - df_with_baseline[baseline_col]
        )
    return df
```
This function cannot be used directly with the output of the `lighteval accelerate` command, since there are no `runname` and `steps` columns. So I omitted these lines:
```python
# Calculate mean baseline performance
baseline_mask = df["runname"].isin(baseline_runs)
baseline = df[baseline_mask].groupby("steps")[metric_columns].mean()
# Reindex to match all steps and interpolate missing values
baseline = baseline.reindex(df["steps"].unique()).interpolate()
# Merge baseline scores with main df
df_with_baseline = df.merge(
    baseline.reset_index(), on=["steps"], how="left", suffixes=("", "_baseline")
).fillna(0)
```
and kept only this part:
```python
# Rescale each metric column
for col in metric_columns:
    baseline_col = f"{col}_baseline"
    # Skip if baseline is 0 or 1
    if (df_with_baseline[baseline_col] == 0).all() or (
        df_with_baseline[baseline_col] == 1
    ).all():
        continue
    # Rescale: (score - baseline) / (1 - baseline)
    df[col] = (df[col] - df_with_baseline[baseline_col]) / (
        1 - df_with_baseline[baseline_col]
    )
return df
```
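In other words, the trimmed-down rescaling I apply in the code above is roughly equivalent to a helper like this (a sketch with names of my own; `baseline` is the one-row random-baseline frame whose columns match `metric_columns`):

```python
import pandas as pd


# Sketch: rescale metric columns against a single row of random-baseline scores.
def rescale_against_baseline(
    df: pd.DataFrame, baseline: pd.DataFrame, metric_columns: list[str]
) -> pd.DataFrame:
    df = df.copy()
    for col in metric_columns:
        b = baseline[col].iloc[0]
        # Skip degenerate baselines (0 or 1)
        if b in (0, 1):
            continue
        # Rescale: (score - baseline) / (1 - baseline)
        df[col] = (df[col] - b) / (1 - b)
    return df
```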
- I know that the random baseline can be generated via this command:
```bash
lighteval baseline \
"lighteval|meta_mmlu_tha_mcf|5|1, \
lighteval|m3exams_tha_mcf|5|1, \
lighteval|belebele_tha_Thai_mcf|5|1, \
lighteval|thaiqa_tha|5|1, \
lighteval|xquad_tha|5|1, \
lighteval|community_hellaswag_tha_mcf|5|1, \
lighteval|xnli2.0_tha_mcf|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py"
```
but in the code above I used the baseline CSV attached to https://github.com/huggingface/lighteval/issues/509.
Result
After running the code above, I got an `agg_score_macro` of 0.296401.
I really want to compare this result with the leaderboard, but the documentation on how to reproduce these metrics does not seem clear right now, so this is as far as I could get.
Can someone help me with this, please?