"""Collect AudioBench score JSONs for each evaluated model, merge them with
archived results, and write the organized per-task CSV files."""

import os
import re
import json

import pandas as pd

# Provides displayname2datasetname and the per-task dataset lists
# (e.g. speech_translation_datasets, asr_english_datasets, ...).
from app.content import *

# Accumulates one [model, dataset, metric, value] row per score file.
data_to_df = []


# Root directory containing one sub-directory of score files per evaluated model.
log_dir = "path/to/audiobench/log"

all_evaluated_models = os.listdir(log_dir)
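# Each model has its own sub-directory of per-dataset score files.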
for model_name in all_evaluated_models:
    if "geyu_whisper" in model_name:
        continue

    if "activation_checkpointing" in model_name:
        continue

    model_dir = os.path.join(log_dir, model_name)

    if not os.path.isdir(model_dir):
        continue

    for log_file in os.listdir(model_dir):
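        # Only *_score.json files hold evaluation results; skip everything else.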
        if not log_file.endswith("score.json"):
            continue

        match = re.match(r"^(.*?)_(llama3_70b_judge|wer|bleu)_score\.json$", log_file)
        if match is None:
            # Skip score files for metrics not handled here (e.g. meteor).
            continue
        ds_name = match.group(1)
        metrics = match.group(2)

        eval_path = os.path.join(model_dir, log_file)

        with open(eval_path, "r") as f:
            eval_data = json.load(f)

        if metrics == "llama3_70b_judge":
            value = eval_data[metrics]["judge_score"]
        else:
            # WER and BLEU scores are stored directly under the metric key.
            value = eval_data[metrics]

        data_to_df.append([model_name, ds_name, metrics, value])


# Assemble the long-format results table: one row per (model, dataset, metric).
eval_result_df = pd.DataFrame(data_to_df, columns=["model", "dataset", "metrics", "value"])
# Map the local checkpoint name to the released model name.
eval_result_df["model"] = eval_result_df["model"].replace("MERaLiON_AudioLLM_v1_hf", "MERaLiON-AudioLLM-Whisper-SEA-LION")

# Previously generated results are read from the archive and merged with the new runs.
archive_results_dir = "results_organized_archive"
output_results_dir = "results_organized"


def merge_results(display_datasets, metrics, result_sub_path):
    """Combine newly computed scores with archived results for one metric/task group.

    Returns (new_result, combined_result); when no archive file exists, the new
    results are returned for both.
    """
    raw_ds_names = [displayname2datasetname[dis_name] for dis_name in display_datasets]

    new_result = eval_result_df[eval_result_df["dataset"].isin(raw_ds_names) & (eval_result_df["metrics"] == metrics)]
    new_result = new_result.drop(columns=["metrics"])
    new_result = new_result.pivot(index="model", columns="dataset", values="value").reset_index()
    new_result = new_result.rename(columns={"model": "Model"})
    # Keep only models that have a score for every dataset in this group.
    new_result = new_result.dropna(axis=0, how="any")

    archive_result_path = os.path.join(archive_results_dir, result_sub_path)
    if os.path.exists(archive_result_path):
        archive_result = pd.read_csv(archive_result_path)
        archive_columns = [col for col in archive_result.columns if col in raw_ds_names]
        archive_result = archive_result[["Model"] + archive_columns]
        # New scores take precedence over archived ones for the same model.
        combined_result = pd.concat([archive_result, new_result], ignore_index=True)
        combined_result = combined_result.drop_duplicates(subset=["Model"], keep="last", ignore_index=True)

        return new_result, combined_result

    return new_result, new_result


# Maps each output CSV (metric/task.csv) to the display names of the datasets it covers.
result_file_mapper = {
    "bleu/st.csv": speech_translation_datasets,
    "llama3_70b_judge/accent_recognition.csv": ar_datasets,
    "llama3_70b_judge/audio_captioning.csv": ac_datasets,
    "llama3_70b_judge/audio_scene_question_answering.csv": asqa_datasets,
    "llama3_70b_judge/emotion_recognition.csv": er_datasets,
    "llama3_70b_judge/gender_recognition.csv": gr_datasets,
    "llama3_70b_judge/music_understanding.csv": music_datasets,
    "llama3_70b_judge/sds_singlish.csv": sds_datasets,
    "llama3_70b_judge/speech_instruction.csv": si_datasets,
    "llama3_70b_judge/sqa_english.csv": speech_qa_english_datasets,
    "llama3_70b_judge/sqa_singlish.csv": speech_qa_singlish_datasets,
    "llama3_70b_judge/under_development_llama3_70b_judge.csv": non_wer_development_datasets,
    "meteor/audio_captioning.csv": ac_datasets,
    "wer/asr_english.csv": asr_english_datasets,
    "wer/asr_singlish.csv": asr_singlish_datasets,
    "wer/asr_mandarin.csv": asr_mandarin_datasets,
    "wer/asr_malay.csv": asr_malay_datasets,
    "wer/asr_tamil.csv": asr_tamil_datasets,
    "wer/asr_indonesian.csv": asr_indonesian_datasets,
    "wer/asr_thai.csv": asr_thai_datasets,
    "wer/asr_vietnamese.csv": asr_vietnamese_datasets,
    "wer/asr_private.csv": asr_private_datasets,
    "wer/under_development_wer.csv": wer_development_datasets,
}


# Write one combined CSV per task group under results_organized/.
for sub_path, display_ds in result_file_mapper.items():
    metrics = sub_path.split("/")[0]
    _, combined_result = merge_results(display_ds, metrics, sub_path)

    output_path = os.path.join(output_results_dir, sub_path)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    combined_result.to_csv(output_path, index=False)
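
# Note: the relative paths assume the script is run from the project root, where
# results_organized_archive/ lives and the app package is importable.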