"""Aggregate AudioBench per-model score JSONs into organized CSV result tables.

Reads ``<log_dir>/<model>/<dataset>_<metric>_score.json`` files, collects them
into a long-format DataFrame, then (per result file) pivots to wide format and
merges with any archived results before writing CSVs under ``results_organized``.
"""

import os
import re
import sys
import json
import random

import pandas as pd
import numpy as np

from app.content import *  # provides displayname2datasetname and the *_datasets lists

data_to_df = []
log_dir = "path/to/audiobench/log"
all_evaluated_models = os.listdir(log_dir)

# Matches "<dataset>_<metric>_score.json" for the metrics we know how to parse.
# FIX: raw string + escaped "." (previously "." matched any character).
_SCORE_FILE_RE = re.compile(r"^(.*?)_(llama3_70b_judge|wer|bleu)_score\.json$")

for model_name in all_evaluated_models:
    # Skip internal / experimental runs that should not appear in results.
    if "geyu_whisper" in model_name:
        continue
    if "activation_checkpointing" in model_name:
        continue
    model_dir = os.path.join(log_dir, model_name)
    if not os.path.isdir(model_dir):
        continue
    for log_file in os.listdir(model_dir):
        if not log_file.endswith("score.json"):
            continue
        match = _SCORE_FILE_RE.match(log_file)
        if match is None:
            # FIX: previously crashed with AttributeError on any score file
            # with an unrecognized metric (e.g. "*_meteor_score.json").
            continue
        ds_name = match.group(1)
        metrics = match.group(2)
        eval_path = os.path.join(model_dir, log_file)
        with open(eval_path, "r") as f:
            eval_data = json.load(f)
        if metrics == "llama3_70b_judge":
            # Judge results nest the numeric score under "judge_score".
            value = eval_data[metrics]["judge_score"]
        else:
            # "wer" and "bleu" store the value directly under the metric key.
            value = eval_data[metrics]
        data_to_df.append([model_name, ds_name, metrics, value])

eval_result_df = pd.DataFrame(
    data_to_df, columns=["model", "dataset", "metrics", "value"]
)
# Normalize the HF checkpoint name to its public display name.
eval_result_df["model"] = eval_result_df["model"].replace(
    "MERaLiON_AudioLLM_v1_hf", "MERaLiON-AudioLLM-Whisper-SEA-LION"
)

# original results_organized
archive_results_dir = "results_organized_archive"
output_results_dir = "results_organized"


def merge_results(display_datasets, metrics, result_sub_path=None):
    """Pivot new results for *display_datasets*/*metrics* and merge with archive.

    Args:
        display_datasets: display names, mapped to raw dataset names via
            ``displayname2datasetname``.
        metrics: metric key to filter on (e.g. "wer", "bleu", "llama3_70b_judge").
        result_sub_path: relative CSV path under the archive dir; if None or the
            archive file is missing, no archive merge is performed.

    Returns:
        (new_result, combined_result) — ``combined_result`` equals
        ``new_result`` when there is nothing archived to merge; otherwise it is
        archive + new rows, deduplicated on "Model" keeping the newest row.
    """
    raw_ds_names = [displayname2datasetname[dis_name] for dis_name in display_datasets]
    new_result = eval_result_df[
        eval_result_df["dataset"].isin(raw_ds_names)
        & (eval_result_df["metrics"] == metrics)
    ]
    new_result = new_result.drop(columns=["metrics"])
    new_result = new_result.pivot(
        index="model", columns="dataset", values="value"
    ).reset_index()
    new_result = new_result.rename(columns={"model": "Model"})
    # Only keep models evaluated on every dataset in this group.
    new_result = new_result.dropna(axis=0, how="any")

    # FIX: guard result_sub_path — the declared default of None previously
    # raised TypeError inside os.path.join.
    if result_sub_path is not None:
        archive_result_path = os.path.join(archive_results_dir, result_sub_path)
        if os.path.exists(archive_result_path):
            archive_result = pd.read_csv(archive_result_path)
            # Keep only archived columns that belong to this dataset group.
            archive_columns = [
                col for col in archive_result.columns if col in raw_ds_names
            ]
            archive_result = archive_result[["Model"] + archive_columns]
            combined_result = pd.concat(
                [archive_result, new_result], ignore_index=True
            )
            # keep="last": fresh results override archived rows per model.
            combined_result = combined_result.drop_duplicates(
                subset=["Model"], keep="last", ignore_index=True
            )
            return new_result, combined_result
    return new_result, new_result


# Output CSV sub-path -> dataset display-name list; the metric is the first
# path component.
# NOTE(review): "meteor" scores are never collected by the parsing loop above,
# so meteor/audio_captioning.csv can only contain archived rows — confirm
# whether meteor parsing is intentionally omitted.
result_file_mapper = {
    "bleu/st.csv": speech_translation_datasets,
    "llama3_70b_judge/accent_recognition.csv": ar_datasets,
    "llama3_70b_judge/audio_captioning.csv": ac_datasets,
    "llama3_70b_judge/audio_scene_question_answering.csv": asqa_datasets,
    "llama3_70b_judge/emotion_recognition.csv": er_datasets,
    "llama3_70b_judge/gender_recognition.csv": gr_datasets,
    "llama3_70b_judge/music_understanding.csv": music_datasets,
    "llama3_70b_judge/sds_singlish.csv": sds_datasets,
    "llama3_70b_judge/speech_instruction.csv": si_datasets,
    "llama3_70b_judge/sqa_english.csv": speech_qa_english_datasets,
    "llama3_70b_judge/sqa_singlish.csv": speech_qa_singlish_datasets,
    "llama3_70b_judge/under_development_llama3_70b_judge.csv": non_wer_development_datasets,
    "meteor/audio_captioning.csv": ac_datasets,
    "wer/asr_english.csv": asr_english_datasets,
    "wer/asr_singlish.csv": asr_singlish_datasets,
    "wer/asr_mandarin.csv": asr_mandarin_datasets,
    "wer/asr_malay.csv": asr_malay_datasets,
    "wer/asr_tamil.csv": asr_tamil_datasets,
    "wer/asr_indonesian.csv": asr_indonesian_datasets,
    "wer/asr_thai.csv": asr_thai_datasets,
    "wer/asr_vietnamese.csv": asr_vietnamese_datasets,
    "wer/asr_private.csv": asr_private_datasets,
    "wer/under_development_wer.csv": wer_development_datasets,
}

for sub_path, display_ds in result_file_mapper.items():
    metrics = sub_path.split("/")[0]
    new_result, combined_result = merge_results(display_ds, metrics, sub_path)
    output_path = os.path.join(output_results_dir, sub_path)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    combined_result.to_csv(output_path, index=False)