"""Collect AudioBench score JSONs for each evaluated model, merge them with
archived results, and write the organized per-task CSV files."""

import os
import re
import json

import pandas as pd

# Provides displayname2datasetname and the per-task dataset lists
# (e.g. speech_translation_datasets, asr_english_datasets, ...).
from app.content import *

# Accumulates one [model, dataset, metric, value] row per score file.
data_to_df = []


# Root directory containing one sub-directory of score files per evaluated model.
log_dir = "path/to/audiobench/log"

all_evaluated_models = os.listdir(log_dir)
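# Each model has its own sub-directory of per-dataset score files.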
for model_name in all_evaluated_models:
    if "geyu_whisper" in model_name:
        continue

    if "activation_checkpointing" in model_name:
        continue

    model_dir = os.path.join(log_dir, model_name)

    if not os.path.isdir(model_dir):
        continue

    for log_file in os.listdir(model_dir):
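        # Only *_score.json files hold evaluation results; skip everything else.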
        if not log_file.endswith("score.json"):
            continue

        match = re.match(r"^(.*?)_(llama3_70b_judge|wer|bleu)_score\.json$", log_file)
        if match is None:
            # Skip score files for metrics not handled here (e.g. meteor).
            continue
        ds_name = match.group(1)
        metrics = match.group(2)

        eval_path = os.path.join(model_dir, log_file)

        with open(eval_path, "r") as f:
            eval_data = json.load(f)

        if metrics == "llama3_70b_judge":
            value = eval_data[metrics]["judge_score"]
        else:
            # WER and BLEU scores are stored directly under the metric key.
            value = eval_data[metrics]

        data_to_df.append([model_name, ds_name, metrics, value])


# Assemble the long-format results table: one row per (model, dataset, metric).
eval_result_df = pd.DataFrame(data_to_df, columns=["model", "dataset", "metrics", "value"])
# Map the local checkpoint name to the released model name.
eval_result_df["model"] = eval_result_df["model"].replace("MERaLiON_AudioLLM_v1_hf", "MERaLiON-AudioLLM-Whisper-SEA-LION")

# Previously generated results are read from the archive and merged with the new runs.
archive_results_dir = "results_organized_archive"
output_results_dir = "results_organized"


def merge_results(display_datasets, metrics, result_sub_path):
    """Combine newly computed scores with archived results for one metric/task group.

    Returns (new_result, combined_result); when no archive file exists, the new
    results are returned for both.
    """
    raw_ds_names = [displayname2datasetname[dis_name] for dis_name in display_datasets]

    new_result = eval_result_df[eval_result_df["dataset"].isin(raw_ds_names) & (eval_result_df["metrics"] == metrics)]
    new_result = new_result.drop(columns=["metrics"])
    new_result = new_result.pivot(index="model", columns="dataset", values="value").reset_index()
    new_result = new_result.rename(columns={"model": "Model"})
    # Keep only models that have a score for every dataset in this group.
    new_result = new_result.dropna(axis=0, how="any")

    archive_result_path = os.path.join(archive_results_dir, result_sub_path)
    if os.path.exists(archive_result_path):
        archive_result = pd.read_csv(archive_result_path)
        archive_columns = [col for col in archive_result.columns if col in raw_ds_names]
        archive_result = archive_result[["Model"] + archive_columns]
        # New scores take precedence over archived ones for the same model.
        combined_result = pd.concat([archive_result, new_result], ignore_index=True)
        combined_result = combined_result.drop_duplicates(subset=["Model"], keep="last", ignore_index=True)

        return new_result, combined_result

    return new_result, new_result


# Maps each output CSV (metric/task.csv) to the display names of the datasets it covers.
result_file_mapper = {
    "bleu/st.csv": speech_translation_datasets,
    "llama3_70b_judge/accent_recognition.csv": ar_datasets,
    "llama3_70b_judge/audio_captioning.csv": ac_datasets,
    "llama3_70b_judge/audio_scene_question_answering.csv": asqa_datasets,
    "llama3_70b_judge/emotion_recognition.csv": er_datasets,
    "llama3_70b_judge/gender_recognition.csv": gr_datasets,
    "llama3_70b_judge/music_understanding.csv": music_datasets,
    "llama3_70b_judge/sds_singlish.csv": sds_datasets,
    "llama3_70b_judge/speech_instruction.csv": si_datasets,
    "llama3_70b_judge/sqa_english.csv": speech_qa_english_datasets,
    "llama3_70b_judge/sqa_singlish.csv": speech_qa_singlish_datasets,
    "llama3_70b_judge/under_development_llama3_70b_judge.csv": non_wer_development_datasets,
    "meteor/audio_captioning.csv": ac_datasets,
    "wer/asr_english.csv": asr_english_datasets,
    "wer/asr_singlish.csv": asr_singlish_datasets,
    "wer/asr_mandarin.csv": asr_mandarin_datasets,
    "wer/asr_malay.csv": asr_malay_datasets,
    "wer/asr_tamil.csv": asr_tamil_datasets,
    "wer/asr_indonesian.csv": asr_indonesian_datasets,
    "wer/asr_thai.csv": asr_thai_datasets,
    "wer/asr_vietnamese.csv": asr_vietnamese_datasets,
    "wer/asr_private.csv": asr_private_datasets,
    "wer/under_development_wer.csv": wer_development_datasets,
}


# Write one combined CSV per task group under results_organized/.
for sub_path, display_ds in result_file_mapper.items():
    metrics = sub_path.split("/")[0]
    _, combined_result = merge_results(display_ds, metrics, sub_path)

    output_path = os.path.join(output_results_dir, sub_path)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    combined_result.to_csv(output_path, index=False)
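
# Note: the relative paths assume the script is run from the project root, where
# results_organized_archive/ lives and the app package is importable.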