# AudioBench-Leaderboard / process_log.py
import os
import re
import json

import pandas as pd

# Wildcard import supplies displayname2datasetname and the per-task dataset
# lists (asr_english_datasets, speech_translation_datasets, ...) used below.
from app.content import *
data_to_df = []
log_dir = "path/to/audiobench/log"
all_evaluated_models = os.listdir(log_dir)
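# Assumed log layout, inferred from the filename pattern parsed below:
#   <log_dir>/<model_name>/<dataset>_<metric>_score.json
# where <metric> is one of llama3_70b_judge, wer, or bleu.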
for model_name in all_evaluated_models:
    # Skip internal/experimental runs that should not appear on the leaderboard.
    if "geyu_whisper" in model_name:
        continue
    if "activation_checkpointing" in model_name:
        continue
    model_dir = os.path.join(log_dir, model_name)
    if not os.path.isdir(model_dir):
        continue
    for log_file in os.listdir(model_dir):
        if not log_file.endswith("score.json"):
            continue
        match = re.match(r"^(.*?)_(llama3_70b_judge|wer|bleu)_score\.json$", log_file)
        if match is None:
            # Score files for other metrics (e.g. meteor) are not parsed here;
            # those numbers are carried over from the archive CSVs instead.
            continue
        ds_name = match.group(1)
        metrics = match.group(2)
        eval_path = os.path.join(model_dir, log_file)
        with open(eval_path, "r") as f:
            eval_data = json.load(f)
        if metrics == "llama3_70b_judge":
            # Judge results nest the numeric score under "judge_score".
            value = eval_data[metrics]["judge_score"]
        else:
            # wer and bleu scores sit directly under the metric key.
            value = eval_data[metrics]
        data_to_df.append([model_name, ds_name, metrics, value])
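# Each collected row is [model, dataset, metric, value]. Dataset names are the
# raw log names; merge_results below selects them via displayname2datasetname.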
eval_result_df = pd.DataFrame(data_to_df, columns=["model", "dataset", "metrics", "value"])
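# Normalize the raw checkpoint directory name to its leaderboard display name.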
eval_result_df["model"] = eval_result_df["model"].replace("MERaLiON_AudioLLM_v1_hf", "MERaLiON-AudioLLM-Whisper-SEA-LION")
# Merge the freshly parsed results with the archived leaderboard CSVs and
# write the combined tables to results_organized/.
archive_results_dir = "results_organized_archive"
output_results_dir = "results_organized"
def merge_results(display_datasets, metrics, result_sub_path):
    """Pivot the new results for one task and merge them with the archived CSV.

    Returns (new_result, combined_result); the two are identical when no
    archive file exists for result_sub_path.
    """
    raw_ds_names = [displayname2datasetname[dis_name] for dis_name in display_datasets]
    new_result = eval_result_df[
        eval_result_df["dataset"].isin(raw_ds_names) & (eval_result_df["metrics"] == metrics)
    ]
    new_result = new_result.drop(columns=["metrics"])
    # One row per model, one column per dataset.
    new_result = new_result.pivot(index="model", columns="dataset", values="value").reset_index()
    new_result = new_result.rename(columns={"model": "Model"})
    # Keep only models evaluated on every dataset of this task.
    new_result = new_result.dropna(axis=0, how="any")
    archive_result_path = os.path.join(archive_results_dir, result_sub_path)
    if os.path.exists(archive_result_path):
        archive_result = pd.read_csv(archive_result_path)
        archive_columns = [col for col in archive_result.columns if col in raw_ds_names]
        archive_result = archive_result[["Model"] + archive_columns]
        combined_result = pd.concat([archive_result, new_result], ignore_index=True)
        # A freshly evaluated model overrides its archived entry.
        combined_result = combined_result.drop_duplicates(subset=["Model"], keep="last", ignore_index=True)
        return new_result, combined_result
    return new_result, new_result
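# Output CSV sub-path -> dataset list for each task. The first path component
# of every key doubles as the metric name passed to merge_results.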
result_file_mapper = {
    "bleu/st.csv": speech_translation_datasets,
    "llama3_70b_judge/accent_recognition.csv": ar_datasets,
    "llama3_70b_judge/audio_captioning.csv": ac_datasets,
    "llama3_70b_judge/audio_scene_question_answering.csv": asqa_datasets,
    "llama3_70b_judge/emotion_recognition.csv": er_datasets,
    "llama3_70b_judge/gender_recognition.csv": gr_datasets,
    "llama3_70b_judge/music_understanding.csv": music_datasets,
    "llama3_70b_judge/sds_singlish.csv": sds_datasets,
    "llama3_70b_judge/speech_instruction.csv": si_datasets,
    "llama3_70b_judge/sqa_english.csv": speech_qa_english_datasets,
    "llama3_70b_judge/sqa_singlish.csv": speech_qa_singlish_datasets,
    "llama3_70b_judge/under_development_llama3_70b_judge.csv": non_wer_development_datasets,
    "meteor/audio_captioning.csv": ac_datasets,
    "wer/asr_english.csv": asr_english_datasets,
    "wer/asr_singlish.csv": asr_singlish_datasets,
    "wer/asr_mandarin.csv": asr_mandarin_datasets,
    "wer/asr_malay.csv": asr_malay_datasets,
    "wer/asr_tamil.csv": asr_tamil_datasets,
    "wer/asr_indonesian.csv": asr_indonesian_datasets,
    "wer/asr_thai.csv": asr_thai_datasets,
    "wer/asr_vietnamese.csv": asr_vietnamese_datasets,
    "wer/asr_private.csv": asr_private_datasets,
    "wer/under_development_wer.csv": wer_development_datasets,
}
for sub_path, display_ds in result_file_mapper.items():
    # The metric name is encoded as the first component of the output sub-path.
    metrics = sub_path.split("/")[0]
    new_result, combined_result = merge_results(display_ds, metrics, sub_path)
    output_path = os.path.join(output_results_dir, sub_path)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    combined_result.to_csv(output_path, index=False)
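# Assumed invocation (this is a standalone maintenance script):
#   python process_log.py
# after pointing log_dir at the actual AudioBench log directory.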