Trying to compute the Thai FineTasks score of Llama 3.2 1B
Hi,

I am trying to compute the Thai FineTasks score of Llama 3.2 1B. As shown on the web leaderboard, the score is 0.14. But when I normalized the scores and computed the final result, I got 0.296401, which is quite different.
Steps
I followed the approach described in https://github.com/huggingface/lighteval/issues/509. Here is my code:
```python
import glob
import json
from collections import defaultdict

import pandas as pd
from tqdm import tqdm


def read_json(path: str):
    with open(path, "r") as f:
        r = json.load(f)
    # Flatten the "results" section into "<task>/<metric>" columns
    result = r["results"]
    result_tr = {}
    for task, item in result.items():
        if "lighteval|" in task:
            for k, v in item.items():
                new_key = f"{task}/{k}"
                result_tr[new_key] = v
    return result_tr


def get_dataframe_result(
    paths: str | list[str], separate: bool = False
) -> pd.DataFrame:
    if isinstance(paths, list):
        if separate:
            return pd.concat([pd.DataFrame([read_json(p)]) for p in paths], axis=1)
        else:
            return pd.concat([pd.DataFrame([read_json(p)]) for p in paths], axis=0)
    if isinstance(paths, str):
        return pd.DataFrame([read_json(paths)])
```
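For reference, this is what the flattening does on a fabricated results file; the task and metric keys below are only illustrative, not necessarily the exact ones lighteval writes.

```python
import json
import tempfile

# Fabricated example of a lighteval-style results file (keys are illustrative).
fake_results = {
    "results": {
        "lighteval|meta_mmlu_tha_mcf|5": {"acc_norm": 0.30, "acc_norm_stderr": 0.01},
        "all": {"acc_norm": 0.30},  # keys without "lighteval|" are skipped
    }
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(fake_results, f)
    tmp_path = f.name

print(read_json(tmp_path))
# {'lighteval|meta_mmlu_tha_mcf|5/acc_norm': 0.3,
#  'lighteval|meta_mmlu_tha_mcf|5/acc_norm_stderr': 0.01}
```

The actual loading of my Llama 3.2 results then looks like this: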
```python
# Load the per-task result files for Llama 3.2 (one JSON per task, evaluated separately)
llama3_2 = glob.glob("../results/results/meta-llama/Llama-3.2-3B/*.json")
llama3_2_df = get_dataframe_result(paths=llama3_2, separate=True).reset_index(drop=True)
llama3_2_df["model"] = "llama3_2"

# Random-baseline scores attached to https://github.com/huggingface/lighteval/issues/509
random_baseline = pd.read_csv("../lighteval_baseline.csv", index_col=None)

# `df` is the combined results dataframe (one row per model); see the notes below.
# Keep only the baseline columns whose task also appears in `df`.
filter_baseline_col = []
for col in random_baseline.columns:
    task = col.split("/")[0]
    for c in df.columns:
        if task in c:
            filter_baseline_col.append(col)

filter_baseline_col = sorted(
    set(
        [
            c
            for c in filter_baseline_col
            if "lighteval|" in c
            and ("/acc_" in c or "/f1" in c)
            and ("stderr" not in c)
        ]
    )
)
random_baseline = random_baseline.loc[:, filter_baseline_col]
random_baseline.columns = [
    c.replace("norm", "") for c in random_baseline.columns
]

# Metric columns of `df` used for rescaling
metric_cols_1 = [
    col
    for col in df.columns
    if "lighteval|" in col
    and ("/acc_" in col or "/f1" in col)
    and ("stderr" not in col)
]

# Metric columns (containing 'lighteval|') used for category grouping
metric_cols_2 = [
    col
    for col in df.columns
    if "lighteval|" in col
    and ":_average" not in col  # exclude existing averages
    and ("/acc_" in col or "/f1" in col)
    and ("stderr" not in col)
]

# Group the metric columns by category (RC / NLU / GK / ...)
category_groups = defaultdict(list)
for col in metric_cols_2:
    task_name = col.split("|")[1].split("_")[0]
    if task_name == "meta":
        task_name = "meta_mmlu"
    if task_name == "community":
        task_name = "community_hellaswag"
    category = get_task_category(task_name)
    if category:
        category_groups[category].append(col)
```
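`get_task_category` is a small helper that is not shown above; for completeness, here is a minimal sketch of what it could look like. The prefix-to-category mapping below is my own guess (RC = reading comprehension, NLU = natural language understanding, GK = general knowledge, RES = reasoning) and may not match the leaderboard's exact grouping.

```python
# Hypothetical sketch of get_task_category: a lookup from the task prefix
# extracted above to a FineTasks-style category. The mapping is an assumption.
TASK_CATEGORIES = {
    "belebele": "RC",
    "xquad": "RC",
    "thaiqa": "RC",
    "xnli2.0": "NLU",
    "community_hellaswag": "RES",
    "meta_mmlu": "GK",
    "m3exams": "GK",
}


def get_task_category(task_name: str) -> str | None:
    # Returns None for tasks that are not part of the grouping.
    return TASK_CATEGORIES.get(task_name)
```

The rescaling and aggregation then proceed as follows: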
```python
# Suffix the baseline columns so they can be looked up per metric column
random_baseline.columns = [
    f"{col}baseline" for col in random_baseline.columns
]

result_df_list = []
for model in tqdm(df["model"].unique()):
    model_df = df.loc[df["model"].eq(model)].copy().reset_index(drop=True)

    for col in metric_cols_1:
        baseline_col = f"{col}baseline"
        # Skip if the baseline is 0 or 1
        if (random_baseline[baseline_col] == 0).all() or (
            random_baseline[baseline_col] == 1
        ).all():
            continue
        # Rescale: (score - baseline) / (1 - baseline)
        model_df[col] = (model_df[col] - random_baseline[baseline_col]) / (
            1 - random_baseline[baseline_col]
        )

    # Mean of the rescaled metrics within each category
    for category, cols in category_groups.items():
        model_df[f"category_{category}"] = model_df[cols].mean(axis=1)

    # Macro aggregate: mean over the category means
    category_cols = [col for col in model_df.columns if col.startswith("category_")]
    model_df["agg_score_macro"] = model_df[category_cols].mean(axis=1)

    result_df_list.append(model_df)

final_df = pd.concat(result_df_list, axis=0, ignore_index=True).sort_values(
    by="agg_score_macro", ascending=False
)
cols = ["model", "category_RC", "category_NLU", "category_GK", "agg_score_macro"]
final_df.loc[:, cols]
```
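To make the rescaling step concrete, here is the formula on made-up numbers (purely illustrative, not my actual scores): for a 4-choice MCF task the random baseline is 0.25, so a raw accuracy of 0.40 rescales to (0.40 - 0.25) / (1 - 0.25) = 0.20, and the macro score is the mean over the per-category means of such rescaled values.

```python
# Illustrative numbers only: rescale raw scores against a random baseline,
# then macro-average over categories (this mirrors the loop above).
def rescale(score: float, baseline: float) -> float:
    return (score - baseline) / (1 - baseline)


gk = [rescale(0.40, 0.25), rescale(0.31, 0.25)]  # e.g. two 4-choice GK tasks
rc = [rescale(0.55, 0.25)]                        # e.g. one 4-choice RC task

category_means = [sum(gk) / len(gk), sum(rc) / len(rc)]
agg_score_macro = sum(category_means) / len(category_means)
print(round(agg_score_macro, 4))  # 0.27
```

Note that a raw score below the random baseline rescales to a negative value, which pulls the macro average down.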
Note that:
- `df` is a dataframe containing the results for my models (one model per row), which includes Llama 3.2 and others.
- I had to evaluate Llama 3.2 on each dataset separately because of a CUDA error like the one in https://github.com/huggingface/lighteval/issues/561. Here are the commands:
```bash
export CUDA_VISIBLE_DEVICES="0,1"
echo "Running lighteval for model: meta-llama/Llama-3.2-3B"
lighteval accelerate \
"pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True" \
"lighteval|meta_mmlu_tha_mcf|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py" \
--dataset-loading-processes 8 \
--cache-dir "./le_cache" \
--no-use-chat-template \
--override-batch-size 8

lighteval accelerate \
"pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True" \
"lighteval|m3exams_tha_mcf|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py" \
--dataset-loading-processes 8 \
--cache-dir "./le_cache" \
--no-use-chat-template \
--override-batch-size 8

lighteval accelerate \
"pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True" \
"lighteval|belebele_tha_Thai_mcf|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py" \
--dataset-loading-processes 8 \
--cache-dir "./le_cache" \
--no-use-chat-template \
--override-batch-size 8

lighteval accelerate \
"pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True" \
"lighteval|thaiqa_tha|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py" \
--dataset-loading-processes 8 \
--cache-dir "./le_cache" \
--no-use-chat-template \
--override-batch-size 2

lighteval accelerate \
"pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True" \
"lighteval|xquad_tha|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py" \
--dataset-loading-processes 8 \
--cache-dir "./le_cache" \
--no-use-chat-template \
--override-batch-size 8

lighteval accelerate \
"pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True" \
"lighteval|community_hellaswag_tha_mcf|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py" \
--dataset-loading-processes 8 \
--cache-dir "./le_cache" \
--no-use-chat-template \
--override-batch-size 8

lighteval accelerate \
"pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True" \
"lighteval|xnli2.0_tha_mcf|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py" \
--dataset-loading-processes 8 \
--cache-dir "./le_cache" \
--no-use-chat-template \
--override-batch-size 8
```
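Since these seven invocations differ only in the task spec (and the smaller batch size for `thaiqa`), a small driver script can run them one after another. This is just a convenience sketch that reuses the same flags as the commands above.

```python
# Convenience sketch: run the per-task evaluations sequentially with the same flags.
import subprocess

MODEL_ARGS = "pretrained=meta-llama/Llama-3.2-3B,dtype=bfloat16,model_parallel=True"
TASKS = [
    ("lighteval|meta_mmlu_tha_mcf|5|1", 8),
    ("lighteval|m3exams_tha_mcf|5|1", 8),
    ("lighteval|belebele_tha_Thai_mcf|5|1", 8),
    ("lighteval|thaiqa_tha|5|1", 2),  # smaller batch size for thaiqa
    ("lighteval|xquad_tha|5|1", 8),
    ("lighteval|community_hellaswag_tha_mcf|5|1", 8),
    ("lighteval|xnli2.0_tha_mcf|5|1", 8),
]

for task, batch_size in TASKS:
    subprocess.run(
        [
            "lighteval", "accelerate",
            MODEL_ARGS,
            task,
            "--custom-tasks", "src/lighteval/tasks/multilingual/tasks.py",
            "--dataset-loading-processes", "8",
            "--cache-dir", "./le_cache",
            "--no-use-chat-template",
            "--override-batch-size", str(batch_size),
        ],
        check=True,
    )
```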
- As I stated in https://github.com/huggingface/lighteval/issues/509, the rescaling function from that issue is:
```python
def rescale_scores(
    df: pd.DataFrame, baseline_runs: list[str], metric_columns: list[str]
) -> pd.DataFrame:
    """
    Rescales scores relative to a baseline performance.
    """
    df = df.copy()
    # Calculate mean baseline performance
    baseline_mask = df["runname"].isin(baseline_runs)
    baseline = df[baseline_mask].groupby("steps")[metric_columns].mean()
    # Reindex to match all steps and interpolate missing values
    baseline = baseline.reindex(df["steps"].unique()).interpolate()
    # Merge baseline scores with main df
    df_with_baseline = df.merge(
        baseline.reset_index(), on=["steps"], how="left", suffixes=("", "_baseline")
    ).fillna(0)
    # Rescale each metric column
    for col in metric_columns:
        baseline_col = f"{col}_baseline"
        # Skip if baseline is 0 or 1
        if (df_with_baseline[baseline_col] == 0).all() or (
            df_with_baseline[baseline_col] == 1
        ).all():
            continue
        # Rescale: (score - baseline) / (1 - baseline)
        df[col] = (df[col] - df_with_baseline[baseline_col]) / (
            1 - df_with_baseline[baseline_col]
        )
    return df
```
This function cannot be used directly with the output of the `lighteval accelerate` command, since there are no `runname` and `steps` columns. So I omitted these lines:
```python
# Calculate mean baseline performance
baseline_mask = df["runname"].isin(baseline_runs)
baseline = df[baseline_mask].groupby("steps")[metric_columns].mean()
# Reindex to match all steps and interpolate missing values
baseline = baseline.reindex(df["steps"].unique()).interpolate()
# Merge baseline scores with main df
df_with_baseline = df.merge(
    baseline.reset_index(), on=["steps"], how="left", suffixes=("", "_baseline")
).fillna(0)
```
and kept only this part:
```python
# Rescale each metric column
for col in metric_columns:
    baseline_col = f"{col}_baseline"
    # Skip if baseline is 0 or 1
    if (df_with_baseline[baseline_col] == 0).all() or (
        df_with_baseline[baseline_col] == 1
    ).all():
        continue
    # Rescale: (score - baseline) / (1 - baseline)
    df[col] = (df[col] - df_with_baseline[baseline_col]) / (
        1 - df_with_baseline[baseline_col]
    )
return df
```
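In other words, the trimmed-down rescaling I apply in the code above is roughly equivalent to a helper like this (a sketch with names of my own; `baseline` is the one-row random-baseline frame whose columns match `metric_columns`):

```python
import pandas as pd


# Sketch: rescale metric columns against a single row of random-baseline scores.
def rescale_against_baseline(
    df: pd.DataFrame, baseline: pd.DataFrame, metric_columns: list[str]
) -> pd.DataFrame:
    df = df.copy()
    for col in metric_columns:
        b = baseline[col].iloc[0]
        # Skip degenerate baselines (0 or 1)
        if b in (0, 1):
            continue
        # Rescale: (score - baseline) / (1 - baseline)
        df[col] = (df[col] - b) / (1 - b)
    return df
```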
- I know that the random baseline can be generated via this command:
```bash
lighteval baseline \
"lighteval|meta_mmlu_tha_mcf|5|1, \
lighteval|m3exams_tha_mcf|5|1, \
lighteval|belebele_tha_Thai_mcf|5|1, \
lighteval|thaiqa_tha|5|1, \
lighteval|xquad_tha|5|1, \
lighteval|community_hellaswag_tha_mcf|5|1, \
lighteval|xnli2.0_tha_mcf|5|1" \
--custom-tasks "src/lighteval/tasks/multilingual/tasks.py"
```
but in the code above I used the baseline CSV attached to https://github.com/huggingface/lighteval/issues/509.
Result
After running the code above, I got an `agg_score_macro` of 0.296401.
I really want to compare this result with the leaderboard, but the documentation on how to reproduce these metrics does not seem clear right now, so this is as far as I could get.
Can someone help me with this, please?