He Yingxu
committed on
Commit
·
2330259
1
Parent(s):
b46797f
breakdown asr sea
Browse files- app.py +11 -3
- app/content.py +25 -10
- app/pages.py +85 -5
- process_log.py +116 -0
- results_organized/bleu/st.csv +2 -2
- results_organized/llama3_70b_judge/accent_recognition.csv +9 -9
- results_organized/llama3_70b_judge/sqa_english.csv +2 -0
- results_organized/wer/asr_english.csv +3 -3
- results_organized/wer/asr_indonesian.csv +13 -0
- results_organized/wer/asr_malay.csv +13 -0
- results_organized/wer/asr_mandarin.csv +18 -18
- results_organized/wer/asr_private.csv +13 -12
- results_organized/wer/asr_sea.csv +0 -12
- results_organized/wer/asr_singlish.csv +3 -3
- results_organized/wer/asr_tamil.csv +13 -0
- results_organized/wer/asr_thai.csv +13 -0
- results_organized/wer/asr_vietnamese.csv +13 -0
- results_organized_archive/bleu/st.csv +12 -0
- results_organized_archive/llama3_70b_judge/accent_recognition.csv +12 -0
- results_organized_archive/llama3_70b_judge/audio_captioning.csv +12 -0
- results_organized_archive/llama3_70b_judge/audio_scene_question_answering.csv +12 -0
- results_organized_archive/llama3_70b_judge/emotion_recognition.csv +12 -0
- results_organized_archive/llama3_70b_judge/gender_recognition.csv +12 -0
- results_organized_archive/llama3_70b_judge/music_understanding.csv +12 -0
- results_organized_archive/llama3_70b_judge/sds_singlish.csv +12 -0
- results_organized_archive/llama3_70b_judge/speech_instruction.csv +12 -0
- results_organized_archive/llama3_70b_judge/sqa_english.csv +12 -0
- results_organized_archive/llama3_70b_judge/sqa_singlish.csv +12 -0
- results_organized_archive/llama3_70b_judge/under_development_llama3_70b_judge.csv +12 -0
- results_organized_archive/meteor/audio_captioning.csv +12 -0
- results_organized_archive/wer/asr_english.csv +12 -0
- results_organized_archive/wer/asr_mandarin.csv +12 -0
- results_organized_archive/wer/asr_singlish.csv +12 -0
- results_organized_archive/wer/under_development_wer.csv +14 -0
app.py
CHANGED
@@ -19,7 +19,11 @@ pages = {
|
|
19 |
'ASR-English' : asr_english,
|
20 |
'ASR-Mandarin' : asr_mandarin,
|
21 |
'ASR-Singlish' : asr_singlish,
|
22 |
-
'ASR-
|
|
|
|
|
|
|
|
|
23 |
'ASR-Private' : asr_private,
|
24 |
'Speech Translation' : speech_translation,
|
25 |
'SQA-English' : speech_question_answering_english,
|
@@ -47,9 +51,13 @@ menu_items = [
|
|
47 |
sac.MenuItem(label='Automatic Speech Recognition', icon='mic',
|
48 |
children = [
|
49 |
sac.MenuItem(label='ASR-English', icon='mic'),
|
50 |
-
sac.MenuItem(label='ASR-Mandarin', icon='mic'),
|
51 |
sac.MenuItem(label='ASR-Singlish', icon='mic'),
|
52 |
-
sac.MenuItem(label='ASR-
|
|
|
|
|
|
|
|
|
|
|
53 |
sac.MenuItem(label='ASR-Private', icon='mic'),
|
54 |
]
|
55 |
),
|
|
|
19 |
'ASR-English' : asr_english,
|
20 |
'ASR-Mandarin' : asr_mandarin,
|
21 |
'ASR-Singlish' : asr_singlish,
|
22 |
+
'ASR-Malay' : asr_malay,
|
23 |
+
'ASR-Tamil' : asr_tamil,
|
24 |
+
'ASR-Indonesian' : asr_indonesian,
|
25 |
+
'ASR-Thai' : asr_thai,
|
26 |
+
'ASR-Vietnamese' : asr_vietnamese,
|
27 |
'ASR-Private' : asr_private,
|
28 |
'Speech Translation' : speech_translation,
|
29 |
'SQA-English' : speech_question_answering_english,
|
|
|
51 |
sac.MenuItem(label='Automatic Speech Recognition', icon='mic',
|
52 |
children = [
|
53 |
sac.MenuItem(label='ASR-English', icon='mic'),
|
|
|
54 |
sac.MenuItem(label='ASR-Singlish', icon='mic'),
|
55 |
+
sac.MenuItem(label='ASR-Mandarin', icon='mic'),
|
56 |
+
sac.MenuItem(label='ASR-Malay', icon='mic'),
|
57 |
+
sac.MenuItem(label='ASR-Tamil', icon='mic'),
|
58 |
+
sac.MenuItem(label='ASR-Indonesian', icon='mic'),
|
59 |
+
sac.MenuItem(label='ASR-Thai', icon='mic'),
|
60 |
+
sac.MenuItem(label='ASR-Vietnamese', icon='mic'),
|
61 |
sac.MenuItem(label='ASR-Private', icon='mic'),
|
62 |
]
|
63 |
),
|
app/content.py
CHANGED
@@ -23,23 +23,41 @@ asr_singlish_datasets = [
|
|
23 |
|
24 |
asr_mandarin_datasets = [
|
25 |
'AISHELL-ASR-ZH',
|
26 |
-
'CommonVoice-ZH'
|
|
|
27 |
]
|
28 |
|
29 |
|
30 |
-
|
31 |
-
'
|
|
|
|
|
|
|
|
|
32 |
'CommonVoice-17-Tamil',
|
33 |
-
|
34 |
-
'
|
|
|
|
|
|
|
|
|
|
|
35 |
'GigaSpeech-2-Indonesain',
|
|
|
|
|
|
|
|
|
36 |
'GigaSpeech-2-Thai',
|
37 |
-
'GigaSpeech-2-Vietnamese',
|
38 |
-
'Fleurs-Tamil',
|
39 |
'Lotus-Thai'
|
40 |
]
|
41 |
|
42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
asr_private_datasets = [
|
44 |
'CNA',
|
45 |
'IDPC',
|
@@ -52,9 +70,6 @@ asr_private_datasets = [
|
|
52 |
'Mediacorp-Short',
|
53 |
'YouTube ASR: English Singapore Content',
|
54 |
'YouTube ASR: English with Strong Emotion',
|
55 |
-
'YouTube ASR: Malay with English Prompt',
|
56 |
-
'YouTube ASR: Chinese with English Prompt',
|
57 |
-
'YouTube ASR: Tamil with English Prompt'
|
58 |
]
|
59 |
|
60 |
|
|
|
23 |
|
24 |
asr_mandarin_datasets = [
|
25 |
'AISHELL-ASR-ZH',
|
26 |
+
'CommonVoice-ZH',
|
27 |
+
'YouTube ASR: Chinese with English Prompt',
|
28 |
]
|
29 |
|
30 |
|
31 |
+
asr_malay_datasets = [
|
32 |
+
'YouTube ASR: Malay with English Prompt'
|
33 |
+
]
|
34 |
+
|
35 |
+
|
36 |
+
asr_tamil_datasets = [
|
37 |
'CommonVoice-17-Tamil',
|
38 |
+
'Fleurs-Tamil',
|
39 |
+
'YouTube ASR: Tamil with English Prompt'
|
40 |
+
]
|
41 |
+
|
42 |
+
|
43 |
+
asr_indonesian_datasets = [
|
44 |
+
'CommonVoice-17-Indonesian',
|
45 |
'GigaSpeech-2-Indonesain',
|
46 |
+
]
|
47 |
+
|
48 |
+
|
49 |
+
asr_thai_datasets = [
|
50 |
'GigaSpeech-2-Thai',
|
|
|
|
|
51 |
'Lotus-Thai'
|
52 |
]
|
53 |
|
54 |
|
55 |
+
asr_vietnamese_datasets = [
|
56 |
+
'CommonVoice-17-Vietnamese',
|
57 |
+
'GigaSpeech-2-Vietnamese'
|
58 |
+
]
|
59 |
+
|
60 |
+
|
61 |
asr_private_datasets = [
|
62 |
'CNA',
|
63 |
'IDPC',
|
|
|
70 |
'Mediacorp-Short',
|
71 |
'YouTube ASR: English Singapore Content',
|
72 |
'YouTube ASR: English with Strong Emotion',
|
|
|
|
|
|
|
73 |
]
|
74 |
|
75 |
|
app/pages.py
CHANGED
@@ -180,12 +180,12 @@ def asr_mandarin():
|
|
180 |
draw('su', 'asr_mandarin', filter_1, 'wer')
|
181 |
|
182 |
|
183 |
-
def
|
184 |
-
st.title("Task: Automatic Speech Recognition -
|
185 |
|
186 |
sum = ['Overall']
|
187 |
|
188 |
-
filters_levelone = sum +
|
189 |
|
190 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
191 |
|
@@ -194,10 +194,90 @@ def asr_sea():
|
|
194 |
|
195 |
if filter_1:
|
196 |
if filter_1 in sum:
|
197 |
-
sum_table_mulit_metrix('
|
198 |
else:
|
199 |
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
|
200 |
-
draw('su', '
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
|
202 |
|
203 |
def asr_private():
|
|
|
180 |
draw('su', 'asr_mandarin', filter_1, 'wer')
|
181 |
|
182 |
|
183 |
+
def asr_malay():
|
184 |
+
st.title("Task: Automatic Speech Recognition - Malay")
|
185 |
|
186 |
sum = ['Overall']
|
187 |
|
188 |
+
filters_levelone = sum + asr_malay_datasets
|
189 |
|
190 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
191 |
|
|
|
194 |
|
195 |
if filter_1:
|
196 |
if filter_1 in sum:
|
197 |
+
sum_table_mulit_metrix('asr_malay', ['wer'])
|
198 |
else:
|
199 |
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
|
200 |
+
draw('su', 'asr_malay', filter_1, 'wer')
|
201 |
+
|
202 |
+
|
203 |
+
def asr_tamil():
    """Render the "Automatic Speech Recognition - Tamil" page.

    A dataset selector is shown in the first of five columns. Choosing
    'Overall' calls ``sum_table_mulit_metrix`` for ``'asr_tamil'`` with the
    ``'wer'`` metric; choosing a dataset passes its display information and
    the WER metric info to ``dataset_contents`` and then calls ``draw``.
    """
    st.title("Task: Automatic Speech Recognition - Tamil")

    overall_choices = ['Overall']
    dataset_choices = overall_choices + asr_tamil_datasets

    # Five columns are created, but only the first one hosts a widget.
    selector_column = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])[0]
    with selector_column:
        filter_1 = st.selectbox('Dataset', dataset_choices)

    if not filter_1:
        return

    if filter_1 in overall_choices:
        sum_table_mulit_metrix('asr_tamil', ['wer'])
        return

    # NOTE(review): nesting reconstructed from a diff view without
    # indentation; draw() is assumed to belong to the per-dataset branch.
    dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
    draw('su', 'asr_tamil', filter_1, 'wer')
|
221 |
+
|
222 |
+
|
223 |
+
def asr_indonesian():
    """Render the "Automatic Speech Recognition - Indonesian" page.

    A dataset selector is shown in the first of five columns. Choosing
    'Overall' calls ``sum_table_mulit_metrix`` for ``'asr_indonesian'`` with
    the ``'wer'`` metric; choosing a dataset passes its display information
    and the WER metric info to ``dataset_contents`` and then calls ``draw``.
    """
    st.title("Task: Automatic Speech Recognition - Indonesian")

    overall_choices = ['Overall']
    dataset_choices = overall_choices + asr_indonesian_datasets

    # Five columns are created, but only the first one hosts a widget.
    selector_column = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])[0]
    with selector_column:
        filter_1 = st.selectbox('Dataset', dataset_choices)

    if not filter_1:
        return

    if filter_1 in overall_choices:
        sum_table_mulit_metrix('asr_indonesian', ['wer'])
        return

    # NOTE(review): nesting reconstructed from a diff view without
    # indentation; draw() is assumed to belong to the per-dataset branch.
    dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
    draw('su', 'asr_indonesian', filter_1, 'wer')
|
241 |
+
|
242 |
+
|
243 |
+
def asr_thai():
    """Render the "Automatic Speech Recognition - Thai" page.

    A dataset selector is shown in the first of five columns. Choosing
    'Overall' calls ``sum_table_mulit_metrix`` for ``'asr_thai'`` with the
    ``'wer'`` metric; choosing a dataset passes its display information and
    the WER metric info to ``dataset_contents`` and then calls ``draw``.
    """
    st.title("Task: Automatic Speech Recognition - Thai")

    overall_choices = ['Overall']
    dataset_choices = overall_choices + asr_thai_datasets

    # Five columns are created, but only the first one hosts a widget.
    selector_column = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])[0]
    with selector_column:
        filter_1 = st.selectbox('Dataset', dataset_choices)

    if not filter_1:
        return

    if filter_1 in overall_choices:
        sum_table_mulit_metrix('asr_thai', ['wer'])
        return

    # NOTE(review): nesting reconstructed from a diff view without
    # indentation; draw() is assumed to belong to the per-dataset branch.
    dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
    draw('su', 'asr_thai', filter_1, 'wer')
|
261 |
+
|
262 |
+
|
263 |
+
def asr_vietnamese():
    """Render the "Automatic Speech Recognition - Vietnamese" page.

    A dataset selector is shown in the first of five columns. Choosing
    'Overall' calls ``sum_table_mulit_metrix`` for ``'asr_vietnamese'`` with
    the ``'wer'`` metric; choosing a dataset passes its display information
    and the WER metric info to ``dataset_contents`` and then calls ``draw``.
    """
    st.title("Task: Automatic Speech Recognition - Vietnamese")

    overall_choices = ['Overall']
    dataset_choices = overall_choices + asr_vietnamese_datasets

    # Five columns are created, but only the first one hosts a widget.
    selector_column = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])[0]
    with selector_column:
        filter_1 = st.selectbox('Dataset', dataset_choices)

    if not filter_1:
        return

    if filter_1 in overall_choices:
        sum_table_mulit_metrix('asr_vietnamese', ['wer'])
        return

    # NOTE(review): nesting reconstructed from a diff view without
    # indentation; draw() is assumed to belong to the per-dataset branch.
    dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
    draw('su', 'asr_vietnamese', filter_1, 'wer')
|
281 |
|
282 |
|
283 |
def asr_private():
|
process_log.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import sys
|
4 |
+
import json
|
5 |
+
import random
|
6 |
+
|
7 |
+
import pandas as pd
|
8 |
+
import numpy as np
|
9 |
+
|
10 |
+
from app.content import *
|
11 |
+
|
12 |
+
# Rows of [model, dataset, metric, value] collected from the score files.
data_to_df = []


log_dir = "path/to/audiobench/log"

# Score files are named "<dataset>_<metric>_score.json". Only the three
# metrics listed here are parsed; the dot is escaped so it matches literally.
_score_file_pattern = re.compile(r"^(.*?)_(llama3_70b_judge|wer|bleu)_score\.json$")

all_evaluated_models = os.listdir(log_dir)
for model_name in all_evaluated_models:
    # Skip model directories whose names contain these markers.
    if "geyu_whisper" in model_name:
        continue

    if "activation_checkpointing" in model_name:
        continue

    model_dir = os.path.join(log_dir, model_name)

    if not os.path.isdir(model_dir):
        continue

    for log_file in os.listdir(model_dir):
        match = _score_file_pattern.match(log_file)
        if match is None:
            # Not a score file, or a score file for a metric not handled
            # here. The original code called .group() on None in the latter
            # case and crashed with AttributeError.
            continue

        ds_name, metrics = match.groups()

        eval_path = os.path.join(model_dir, log_file)

        with open(eval_path, "r", encoding="utf-8") as f:
            eval_data = json.load(f)

        if metrics == "llama3_70b_judge":
            # Judge results are nested one level deeper than the others.
            value = eval_data[metrics]["judge_score"]
        else:
            # "wer" and "bleu" store the score directly under the metric key.
            value = eval_data[metrics]

        data_to_df.append([model_name, ds_name, metrics, value])


eval_result_df = pd.DataFrame(data_to_df, columns=["model", "dataset", "metrics", "value"])
# Rename the log-directory model name to the name used in the result tables.
eval_result_df["model"] = eval_result_df["model"].replace("MERaLiON_AudioLLM_v1_hf", "MERaLiON-AudioLLM-Whisper-SEA-LION")

# original results_organized
archive_results_dir = "results_organized_archive"
output_results_dir = "results_organized"
|
59 |
+
|
60 |
+
|
61 |
+
def merge_results(display_datasets, metrics, result_sub_path=None):
    """Build the result table for one group of datasets and one metric.

    Args:
        display_datasets: Display names of the datasets; each is translated
            through ``displayname2datasetname``.
        metrics: Metric name matched against the "metrics" column of
            ``eval_result_df``.
        result_sub_path: Path of the CSV relative to ``archive_results_dir``.
            When ``None`` (the default) no archive file is consulted.

    Returns:
        A tuple ``(new_result, combined_result)``. ``new_result`` has one row
        per model and one column per dataset, keeping only models with a
        value for every dataset. ``combined_result`` is the archived table
        followed by ``new_result`` with duplicate models resolved in favour
        of the new rows; when there is no archive file it is ``new_result``.

    Raises:
        KeyError: If a display name is missing from
            ``displayname2datasetname``.
    """
    raw_ds_names = [displayname2datasetname[dis_name] for dis_name in display_datasets]

    wanted_rows = eval_result_df["dataset"].isin(raw_ds_names) & (eval_result_df["metrics"] == metrics)
    new_result = eval_result_df[wanted_rows]
    new_result = new_result.drop(columns=["metrics"])
    new_result = new_result.pivot(index="model", columns="dataset", values="value").reset_index()
    new_result = new_result.rename(columns={"model": "Model"})
    # Keep only models that have a score for every selected dataset.
    new_result = new_result.dropna(axis=0, how="any")

    # The default None previously crashed inside os.path.join; treat it as
    # "no archive available".
    if result_sub_path is None:
        return new_result, new_result

    archive_result_path = os.path.join(archive_results_dir, result_sub_path)
    if not os.path.exists(archive_result_path):
        return new_result, new_result

    archive_result = pd.read_csv(archive_result_path)
    archive_columns = [col for col in archive_result.columns if col in raw_ds_names]
    archive_result = archive_result[["Model"] + archive_columns]

    combined_result = pd.concat([archive_result, new_result], ignore_index=True)
    # New rows come last, so keep="last" lets them override archived ones.
    combined_result = combined_result.drop_duplicates(subset=["Model"], keep="last", ignore_index=True)

    return new_result, combined_result
|
81 |
+
|
82 |
+
|
83 |
+
# Maps each output CSV (relative path "<metric>/<file>.csv") to the list of
# dataset display names whose results belong in that file.
result_file_mapper = {
    "bleu/st.csv": speech_translation_datasets,
    "llama3_70b_judge/accent_recognition.csv": ar_datasets,
    "llama3_70b_judge/audio_captioning.csv": ac_datasets,
    "llama3_70b_judge/audio_scene_question_answering.csv": asqa_datasets,
    "llama3_70b_judge/emotion_recognition.csv": er_datasets,
    "llama3_70b_judge/gender_recognition.csv": gr_datasets,
    "llama3_70b_judge/music_understanding.csv": music_datasets,
    "llama3_70b_judge/sds_singlish.csv": sds_datasets,
    "llama3_70b_judge/speech_instruction.csv": si_datasets,
    "llama3_70b_judge/sqa_english.csv": speech_qa_english_datasets,
    "llama3_70b_judge/sqa_singlish.csv": speech_qa_singlish_datasets,
    "llama3_70b_judge/under_development_llama3_70b_judge.csv": non_wer_development_datasets,
    "meteor/audio_captioning.csv": ac_datasets,
    "wer/asr_english.csv": asr_english_datasets,
    "wer/asr_singlish.csv": asr_singlish_datasets,
    "wer/asr_mandarin.csv": asr_mandarin_datasets,
    "wer/asr_malay.csv": asr_malay_datasets,
    "wer/asr_tamil.csv": asr_tamil_datasets,
    "wer/asr_indonesian.csv": asr_indonesian_datasets,
    "wer/asr_thai.csv": asr_thai_datasets,
    "wer/asr_vietnamese.csv": asr_vietnamese_datasets,
    "wer/asr_private.csv": asr_private_datasets,
    "wer/under_development_wer.csv": wer_development_datasets,
}


for sub_path, display_ds in result_file_mapper.items():
    # The leading directory of each key doubles as the metric name.
    metrics = sub_path.partition("/")[0]
    output_path = os.path.join(output_results_dir, sub_path)

    new_result, combined_result = merge_results(display_ds, metrics, sub_path)

    # Create the metric sub-directory on first use, then write the table.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    combined_result.to_csv(output_path, index=False)
|
results_organized/bleu/st.csv
CHANGED
@@ -9,8 +9,8 @@ WavLLM_fairseq,13.841886973016162,31.96381187282953,0.0033159224040994,5.9335222
|
|
9 |
MERaLiON-AudioLLM-Whisper-SEA-LION,37.058238343330466,43.96331874536172,13.808713343771569,43.37364836260576,19.55610418584389,4.758175879451736
|
10 |
MERaLiON-AudioLLM-v2-2b,30.658188021678257,40.02820084309168,5.601731502002274,37.77329494766737,16.777825775562142,1.9423083468131173
|
11 |
MERaLiON-AudioLLM-v2-9b,36.242124109428445,43.747307981166834,10.885517678613343,47.85937752036512,22.133726547487697,3.4786390367027833
|
12 |
-
Qwen2.5-Omni-3B,
|
13 |
-
Qwen2.5-Omni-7B,
|
14 |
SALMONN_7B,14.193483776951359,33.255550227097565,0.0005121531999434492,27.88515689237341,5.175547389931541,0.40577007761551664
|
15 |
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,27.59161630015759,28.71368811388653,7.474730798912167,46.80524126004861,15.209998552437538,2.8327095799289337
|
16 |
cascade_whisper_large_v3_llama_3_8b_instruct,10.753313930099422,6.089840198985321,1.0029597453865848,46.79744652156276,14.156349261775734,2.4177196689141547
|
|
|
9 |
MERaLiON-AudioLLM-Whisper-SEA-LION,37.058238343330466,43.96331874536172,13.808713343771569,43.37364836260576,19.55610418584389,4.758175879451736
|
10 |
MERaLiON-AudioLLM-v2-2b,30.658188021678257,40.02820084309168,5.601731502002274,37.77329494766737,16.777825775562142,1.9423083468131173
|
11 |
MERaLiON-AudioLLM-v2-9b,36.242124109428445,43.747307981166834,10.885517678613343,47.85937752036512,22.133726547487697,3.4786390367027833
|
12 |
+
Qwen2.5-Omni-3B,22.677415597466005,41.3904266408345,0.11385307865833577,44.70177381121325,21.564197151391852,0.2121024080246949
|
13 |
+
Qwen2.5-Omni-7B,22.381473837803917,40.43638195419091,0.7240804450352291,43.844763499607055,16.686179809018903,0.05656546920236443
|
14 |
SALMONN_7B,14.193483776951359,33.255550227097565,0.0005121531999434492,27.88515689237341,5.175547389931541,0.40577007761551664
|
15 |
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,27.59161630015759,28.71368811388653,7.474730798912167,46.80524126004861,15.209998552437538,2.8327095799289337
|
16 |
cascade_whisper_large_v3_llama_3_8b_instruct,10.753313930099422,6.089840198985321,1.0029597453865848,46.79744652156276,14.156349261775734,2.4177196689141547
|
results_organized/llama3_70b_judge/accent_recognition.csv
CHANGED
@@ -6,12 +6,12 @@ whisper_large_v3,,,
|
|
6 |
old_models,,,
|
7 |
gemini-1.5-flash,,,
|
8 |
WavLLM_fairseq,39.96717275338531,2.6833333333333336,0.2333333333333333
|
9 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,47.066064833812064
|
10 |
-
MERaLiON-AudioLLM-v2-2b,66.59827656955272
|
11 |
-
MERaLiON-AudioLLM-v2-9b,40.78785391875257
|
12 |
-
Qwen2.5-Omni-3B,0.9027492819039803
|
13 |
-
Qwen2.5-Omni-7B,1.661879359868691
|
14 |
-
SALMONN_7B,31.69881001231022
|
15 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,28.00574476815757
|
16 |
-
cascade_whisper_large_v3_llama_3_8b_instruct,40.29544521953221
|
17 |
-
phi_4_multimodal_instruct,2.6261797291752154
|
|
|
6 |
old_models,,,
|
7 |
gemini-1.5-flash,,,
|
8 |
WavLLM_fairseq,39.96717275338531,2.6833333333333336,0.2333333333333333
|
9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,47.066064833812064,6.333333333333334,78.0
|
10 |
+
MERaLiON-AudioLLM-v2-2b,66.59827656955272,59.73333333333334,53.833333333333336
|
11 |
+
MERaLiON-AudioLLM-v2-9b,40.78785391875257,30.325000000000006,54.333333333333336
|
12 |
+
Qwen2.5-Omni-3B,0.9027492819039803,0.1,0.4333333333333333
|
13 |
+
Qwen2.5-Omni-7B,1.661879359868691,0.06666666666666667,0.03333333333333333
|
14 |
+
SALMONN_7B,31.69881001231022,2.833333333333333,0.2
|
15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,28.00574476815757,38.983333333333334,10.8
|
16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,40.29544521953221,13.733333333333334,10.166666666666666
|
17 |
+
phi_4_multimodal_instruct,2.6261797291752154,6.133333333333333,0.5333333333333333
|
results_organized/llama3_70b_judge/sqa_english.csv
CHANGED
@@ -9,6 +9,8 @@ WavLLM_fairseq,83.92156862745098,58.54651162790698,77.64903756307233,66.31439894
|
|
9 |
MERaLiON-AudioLLM-Whisper-SEA-LION,86.7156862745098,59.59302325581396,74.20669033825453,57.11140466754734,51.54208050182959,53.1
|
10 |
MERaLiON-AudioLLM-v2-2b,83.18627450980392,69.47674418604652,81.4614090824145,66.00616468516073,61.16048092002091,50.99999999999999
|
11 |
MERaLiON-AudioLLM-v2-9b,89.55882352941177,75.02906976744187,89.20949355260699,84.58828709819463,83.32462101411396,56.699999999999996
|
|
|
|
|
12 |
SALMONN_7B,80.88235294117646,59.38953488372093,65.64754251541768,50.81461911052399,56.56037637219028,50.6
|
13 |
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,51.51960784313726,70.93023255813954,57.16314707531303,89.52003522677234,85.15420805018296,52.6
|
14 |
cascade_whisper_large_v3_llama_3_8b_instruct,86.96078431372548,69.68023255813954,87.43412446271725,84.98458828709819,86.1996863565081,55.900000000000006
|
|
|
9 |
MERaLiON-AudioLLM-Whisper-SEA-LION,86.7156862745098,59.59302325581396,74.20669033825453,57.11140466754734,51.54208050182959,53.1
|
10 |
MERaLiON-AudioLLM-v2-2b,83.18627450980392,69.47674418604652,81.4614090824145,66.00616468516073,61.16048092002091,50.99999999999999
|
11 |
MERaLiON-AudioLLM-v2-9b,89.55882352941177,75.02906976744187,89.20949355260699,84.58828709819463,83.32462101411396,56.699999999999996
|
12 |
+
Qwen2.5-Omni-3B,73.87254901960785,61.07558139534884,59.8504952345356,81.41787758696609,69.99477260846837,60.699999999999996
|
13 |
+
Qwen2.5-Omni-7B,77.30392156862746,61.71511627906977,62.86675387777986,81.72611184500221,70.77888133821223,56.10000000000001
|
14 |
SALMONN_7B,80.88235294117646,59.38953488372093,65.64754251541768,50.81461911052399,56.56037637219028,50.6
|
15 |
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,51.51960784313726,70.93023255813954,57.16314707531303,89.52003522677234,85.15420805018296,52.6
|
16 |
cascade_whisper_large_v3_llama_3_8b_instruct,86.96078431372548,69.68023255813954,87.43412446271725,84.98458828709819,86.1996863565081,55.900000000000006
|
results_organized/wer/asr_english.csv
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
Model,librispeech_test_clean,librispeech_test_other,common_voice_15_en_test,peoples_speech_test,gigaspeech_test,earnings21_test,earnings22_test,tedlium3_test,tedlium3_long_form_test
|
2 |
Qwen-Audio-Chat,0.0202587995623797,0.043467569561352,0.1127242112839891,0.3141914474672335,0.1301891002258773,0.2655529121410546,0.3664994875132684,0.0405237571413363,0.2911540507002305
|
3 |
Qwen2-Audio-7B-Instruct,0.0351416606934017,0.0604157603041594,0.114388725008194,0.2165498391593041,0.1172381289030281,0.1887221931940723,0.2354255566133092,0.06114048472375,0.0873958517993263
|
4 |
-
whisper_large_v3,0.0187874900969555,0.0366012824635405,0.1000186374123559,0.1460242061533738,0.0945902243481269,0.1186395926671187,0.158878997371161,0.0376494801461977,0.032086509484134
|
5 |
old_models,,,,,,,,,
|
6 |
gemini-1.5-flash,,,,,,,,,
|
7 |
WavLLM_fairseq,0.0210321801788206,0.0479883481188643,0.1453332562130063,0.3792176325635977,0.154917784145464,0.6447482518259942,0.6671766188447099,0.0662148255917107,0.4536784258110264
|
@@ -9,10 +8,11 @@ MERaLiON-AudioLLM-Whisper-SEA-LION,0.023937073225940318,0.0422569845082944,0.077
|
|
9 |
MERaLiON-AudioLLM-v2-2b,0.027124910401026145,0.050958064577146425,0.09270505973611995,0.20627055897299626,0.09237908290276242,0.21886082422652334,0.23935918375209228,0.03456229374401192,0.13837971990781775
|
10 |
MERaLiON-AudioLLM-v2-9b,0.02497453502848304,0.046607524542720415,0.08676036786395974,0.20476530792451958,0.09023061553464748,0.1084090226901313,0.15062142184399924,0.03513005216280473,0.043573834426520124
|
11 |
MERaLiON-AudioLLM-v2-9b-asr,0.020956728411363035,0.04040327614579984,0.0761563229028091,0.1957668115250735,0.08768103407213536,0.09210848128425476,0.1277414998676963,0.0313686526383024,0.03495834071973054
|
12 |
-
Qwen2.5-Omni-3B,0.
|
13 |
-
Qwen2.5-Omni-7B,0.
|
14 |
SALMONN_7B,0.09638963292715132,0.11776722719276675,0.315955552984878,0.24158949229136512,0.11024871580815716,0.27733154717568453,0.37956460424973665,0.039352755402576205,0.14139336996986349
|
15 |
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.03299128532085864,0.05381428868670437,0.10610471655066483,0.20285898669536326,0.09994259054523941,0.14091838890062366,0.17187922953626794,0.04939498243497392,0.08636766530756958
|
16 |
cascade_whisper_large_v3_llama_3_8b_instruct,0.018032972422378994,0.035504189759207064,0.09879113887442882,0.14542012514049835,0.09501640807342393,0.10872308256717546,0.1459710229559586,0.038146268762641496,0.04935295160432548
|
17 |
hy_whisper_local_cs,0.029086656354925113,0.05591389713810127,0.1066766923091754,0.17879147486544342,0.10212866235970408,0.14925070316060968,0.17014458107377883,0.04666264504453355,0.06973940790639957
|
18 |
phi_4_multimodal_instruct,0.016844607084920964,0.03851173700039722,0.08109202383018103,0.2147161396912585,0.0988294989332872,0.1306461295594268,0.22572024408764688,0.028636315247862035,0.05062932104236838
|
|
|
|
1 |
Model,librispeech_test_clean,librispeech_test_other,common_voice_15_en_test,peoples_speech_test,gigaspeech_test,earnings21_test,earnings22_test,tedlium3_test,tedlium3_long_form_test
|
2 |
Qwen-Audio-Chat,0.0202587995623797,0.043467569561352,0.1127242112839891,0.3141914474672335,0.1301891002258773,0.2655529121410546,0.3664994875132684,0.0405237571413363,0.2911540507002305
|
3 |
Qwen2-Audio-7B-Instruct,0.0351416606934017,0.0604157603041594,0.114388725008194,0.2165498391593041,0.1172381289030281,0.1887221931940723,0.2354255566133092,0.06114048472375,0.0873958517993263
|
|
|
4 |
old_models,,,,,,,,,
|
5 |
gemini-1.5-flash,,,,,,,,,
|
6 |
WavLLM_fairseq,0.0210321801788206,0.0479883481188643,0.1453332562130063,0.3792176325635977,0.154917784145464,0.6447482518259942,0.6671766188447099,0.0662148255917107,0.4536784258110264
|
|
|
8 |
MERaLiON-AudioLLM-v2-2b,0.027124910401026145,0.050958064577146425,0.09270505973611995,0.20627055897299626,0.09237908290276242,0.21886082422652334,0.23935918375209228,0.03456229374401192,0.13837971990781775
|
9 |
MERaLiON-AudioLLM-v2-9b,0.02497453502848304,0.046607524542720415,0.08676036786395974,0.20476530792451958,0.09023061553464748,0.1084090226901313,0.15062142184399924,0.03513005216280473,0.043573834426520124
|
10 |
MERaLiON-AudioLLM-v2-9b-asr,0.020956728411363035,0.04040327614579984,0.0761563229028091,0.1957668115250735,0.08768103407213536,0.09210848128425476,0.1277414998676963,0.0313686526383024,0.03495834071973054
|
11 |
+
Qwen2.5-Omni-3B,0.021107631946278342,0.04492405470331209,0.0940546654584482,0.2615060102759792,0.11446542772550759,0.14654089448699847,0.19688006593894564,0.04804655619034101,0.07147668853040241
|
12 |
+
Qwen2.5-Omni-7B,0.04404496925340476,0.06877636332683905,0.08028226039678409,0.3124105638254503,0.13967544855837088,0.18939756089426465,0.24105023789319796,0.049146588126752065,0.08381492643148378
|
13 |
SALMONN_7B,0.09638963292715132,0.11776722719276675,0.315955552984878,0.24158949229136512,0.11024871580815716,0.27733154717568453,0.37956460424973665,0.039352755402576205,0.14139336996986349
|
14 |
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.03299128532085864,0.05381428868670437,0.10610471655066483,0.20285898669536326,0.09994259054523941,0.14091838890062366,0.17187922953626794,0.04939498243497392,0.08636766530756958
|
15 |
cascade_whisper_large_v3_llama_3_8b_instruct,0.018032972422378994,0.035504189759207064,0.09879113887442882,0.14542012514049835,0.09501640807342393,0.10872308256717546,0.1459710229559586,0.038146268762641496,0.04935295160432548
|
16 |
hy_whisper_local_cs,0.029086656354925113,0.05591389713810127,0.1066766923091754,0.17879147486544342,0.10212866235970408,0.14925070316060968,0.17014458107377883,0.04666264504453355,0.06973940790639957
|
17 |
phi_4_multimodal_instruct,0.016844607084920964,0.03851173700039722,0.08109202383018103,0.2147161396912585,0.0988294989332872,0.1306461295594268,0.22572024408764688,0.028636315247862035,0.05062932104236838
|
18 |
+
whisper_large_v3,0.02240917493492285,0.03928726805001229,0.09987082345229144,0.15004646142216516,0.09818771638225171,0.13208580227012484,0.16471049822226413,0.04130442496717646,0.04538202446374756
|
results_organized/wer/asr_indonesian.csv
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,commonvoice_17_id_asr,gigaspeech2_id_test
|
2 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.25954549636581103,0.337184855698226
|
3 |
+
MERaLiON-AudioLLM-v2-2b,0.08547244456711749,0.17842684134623737
|
4 |
+
MERaLiON-AudioLLM-v2-9b,0.11334989419449812,0.1722759890883186
|
5 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.07921611923820039,0.16282383194620612
|
6 |
+
Qwen2.5-Omni-3B,0.13579906155120067,0.2746338157871875
|
7 |
+
Qwen2.5-Omni-7B,0.10994571717729322,0.22737303007662502
|
8 |
+
SALMONN_7B,1.1888858220627472,2.1181172136986777
|
9 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.09977918851780293,0.2191718937327333
|
10 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.07815806421933941,0.1926224523482703
|
11 |
+
hy_whisper_local_cs,0.10267733922163952,0.21382030476256667
|
12 |
+
phi_4_multimodal_instruct,1.327169012788665,5.803850364012302
|
13 |
+
whisper_large_v3,0.07512190633912963,0.1961496359876983
|
results_organized/wer/asr_malay.csv
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,ytb_asr_batch3_malay
|
2 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.28989513404414025
|
3 |
+
MERaLiON-AudioLLM-v2-2b,0.2798911851169321
|
4 |
+
MERaLiON-AudioLLM-v2-9b,0.20907375718485366
|
5 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.19463823439076827
|
6 |
+
Qwen2.5-Omni-3B,2.943749725768944
|
7 |
+
Qwen2.5-Omni-7B,1.4606642973103419
|
8 |
+
SALMONN_7B,1.0858672282918695
|
9 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.3143784827344127
|
10 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.3119213724715897
|
11 |
+
hy_whisper_local_cs,0.2421569917950068
|
12 |
+
phi_4_multimodal_instruct,3.762932736606555
|
13 |
+
whisper_large_v3,0.259620025448642
|
results_organized/wer/asr_mandarin.csv
CHANGED
@@ -1,18 +1,18 @@
|
|
1 |
-
Model,aishell_asr_zh_test,commonvoice_zh_asr
|
2 |
-
Qwen-Audio-Chat,0.9469917443725128
|
3 |
-
Qwen2-Audio-7B-Instruct,0.0926035912969452
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
MERaLiON-AudioLLM-
|
9 |
-
MERaLiON-AudioLLM-v2-
|
10 |
-
MERaLiON-AudioLLM-v2-9b,0.
|
11 |
-
|
12 |
-
Qwen2.5-Omni-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
1 |
+
Model,aishell_asr_zh_test,commonvoice_zh_asr,ytb_asr_batch3_chinese
|
2 |
+
Qwen-Audio-Chat,0.9469917443725128,,
|
3 |
+
Qwen2-Audio-7B-Instruct,0.0926035912969452,,
|
4 |
+
old_models,,,
|
5 |
+
gemini-1.5-flash,,,
|
6 |
+
WavLLM_fairseq,0.7054601967888183,,
|
7 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.12846706657955692,0.3269799259362027,0.418102808691044
|
8 |
+
MERaLiON-AudioLLM-v2-2b,0.05010789728969927,0.13139387212789344,0.25613142554319024
|
9 |
+
MERaLiON-AudioLLM-v2-9b,0.05789827958266516,0.14684695260557293,0.19133015368309486
|
10 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.043317297222387204,0.1183419954537208,0.1494223635400106
|
11 |
+
Qwen2.5-Omni-3B,0.02807309298964582,0.11308069111981474,0.25013248542660305
|
12 |
+
Qwen2.5-Omni-7B,0.02438082793846885,0.07647567313746625,0.20640169581346052
|
13 |
+
SALMONN_7B,0.9314703727900854,1.0013340021130595,0.8858293587705353
|
14 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.20889509215814378,0.31938144990021666,0.3469210386857446
|
15 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.12450753301261111,0.1962263748225777,0.2698675145733969
|
16 |
+
hy_whisper_local_cs,0.15675793391538476,0.287290695068461,0.27520932697403283
|
17 |
+
phi_4_multimodal_instruct,0.12232978955079092,0.154221316286565,0.44008479067302597
|
18 |
+
whisper_large_v3,0.1233691671121142,0.19822204198371451,0.2663275039745628
|
results_organized/wer/asr_private.csv
CHANGED
@@ -1,12 +1,13 @@
|
|
1 |
-
Model,cna_test,idpc_short_test,idpc_test,mediacorp_short_test,mediacorp_test,parliament_test,ukusnews_test,ytb_asr_batch1,ytb_asr_batch2
|
2 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,0.14503898323187012,0.16498433693003828,0.20359281437125748,0.12828873397796267,0.12250898399215943,0.058780395496262655,0.1128757799205899,0.10724437274333563,0.13268461455292463
|
3 |
-
MERaLiON-AudioLLM-v2-2b,0.13494606429563175,0.15106160807518274,0.17741659538066723,0.1208680008994828,0.12250898399215943,0.18544800832623712,0.17383248251087163,0.09933164323576861,0.15990917937074278
|
4 |
-
MERaLiON-AudioLLM-v2-9b,0.13334401367083198,0.15663069961712495,0.16030795551753635,0.11693276366089499,0.10454099967330938,0.06024694862333239,0.06972962752883342,0.09848659445340709,0.1110174072872743
|
5 |
-
MERaLiON-AudioLLM-v2-9b-asr,0.12709601623411299,0.14009745910198398,0.16612489307100087,0.11783224645828648,0.10372427311336165,0.05284322073989971,0.055965210814898844,0.09230237381885227,0.09936209319926478
|
6 |
-
Qwen2.5-Omni-3B,0.
|
7 |
-
Qwen2.5-Omni-7B,0.
|
8 |
-
SALMONN_7B,0.1492577165438428,0.2398190045248869,0.5414884516680923,0.19901056892286936,0.3636883371447239,0.20430031223389156,0.191869918699187,0.2207497887378044,0.3495513028435506
|
9 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.15171419416853574,0.19735468151757746,0.17040205303678357,0.1541488644029683,0.15754655341391702,0.09007474690131517,0.12278313480809226,0.12475992932319274,0.12552708400908205
|
10 |
-
cascade_whisper_large_v3_llama_3_8b_instruct,0.13815016554523124,0.15344926428434932,0.16184773310521813,0.11434675061839443,0.15125775890231952,0.06537988456807645,0.08943089430894309,0.10816624414227549,0.08387933830684398
|
11 |
-
hy_whisper_local_cs,0.14674783723165652,0.18308388444135051,0.17570573139435414,0.12885091072633237,0.1256125449199608,0.07257072570725707,0.16948383437322745,0.1284858262272413,0.14315061087685155
|
12 |
-
phi_4_multimodal_instruct,0.19080422941364947,0.5388096066829099,0.26073567151411464,0.1217674836968743,0.19813786344331918,0.2778645094143249,0.07521270561542824,0.16939386955519706,0.23232781922369986
|
|
|
|
1 |
+
Model,cna_test,idpc_short_test,idpc_test,mediacorp_short_test,mediacorp_test,parliament_test,ukusnews_test,ytb_asr_batch1,ytb_asr_batch2
|
2 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.14503898323187012,0.16498433693003828,0.20359281437125748,0.12828873397796267,0.12250898399215943,0.058780395496262655,0.1128757799205899,0.10724437274333563,0.13268461455292463
|
3 |
+
MERaLiON-AudioLLM-v2-2b,0.13494606429563175,0.15106160807518274,0.17741659538066723,0.1208680008994828,0.12250898399215943,0.18544800832623712,0.17383248251087163,0.09933164323576861,0.15990917937074278
|
4 |
+
MERaLiON-AudioLLM-v2-9b,0.13334401367083198,0.15663069961712495,0.16030795551753635,0.11693276366089499,0.10454099967330938,0.06024694862333239,0.06972962752883342,0.09848659445340709,0.1110174072872743
|
5 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.12709601623411299,0.14009745910198398,0.16612489307100087,0.11783224645828648,0.10372427311336165,0.05284322073989971,0.055965210814898844,0.09230237381885227,0.09936209319926478
|
6 |
+
Qwen2.5-Omni-3B,0.17398269785325216,0.2111033762617473,0.19863130881094954,0.1476276141218799,0.1515844495262986,0.10048254328697133,0.09075439591605218,0.1622877775217024,0.24454535625473023
|
7 |
+
Qwen2.5-Omni-7B,0.18322118978959734,0.41385311521058127,0.22035928143712574,0.14144366988981336,0.23546226723293043,0.11022802535717664,0.176214785403668,0.17361911346700468,0.35052438101416367
|
8 |
+
SALMONN_7B,0.1492577165438428,0.2398190045248869,0.5414884516680923,0.19901056892286936,0.3636883371447239,0.20430031223389156,0.191869918699187,0.2207497887378044,0.3495513028435506
|
9 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.15171419416853574,0.19735468151757746,0.17040205303678357,0.1541488644029683,0.15754655341391702,0.09007474690131517,0.12278313480809226,0.12475992932319274,0.12552708400908205
|
10 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.13815016554523124,0.15344926428434932,0.16184773310521813,0.11434675061839443,0.15125775890231952,0.06537988456807645,0.08943089430894309,0.10816624414227549,0.08387933830684398
|
11 |
+
hy_whisper_local_cs,0.14674783723165652,0.18308388444135051,0.17570573139435414,0.12885091072633237,0.1256125449199608,0.07257072570725707,0.16948383437322745,0.1284858262272413,0.14315061087685155
|
12 |
+
phi_4_multimodal_instruct,0.19080422941364947,0.5388096066829099,0.26073567151411464,0.1217674836968743,0.19813786344331918,0.2778645094143249,0.07521270561542824,0.16939386955519706,0.23232781922369986
|
13 |
+
whisper_large_v3,0.1376695503577913,0.2201531500174034,0.1787852865697177,0.12671463908252756,0.12904279647174127,0.08962531933011637,0.12263187748156551,0.13336406237996465,0.1289869175045951
|
results_organized/wer/asr_sea.csv
DELETED
@@ -1,12 +0,0 @@
|
|
1 |
-
Model,commonvoice_17_id_asr,commonvoice_17_ta_asr,commonvoice_17_vi_asr,fleurs_tamil_ta_30_asr,gigaspeech2_id_test,gigaspeech2_th_test,gigaspeech2_vi_test,lotus_thai_th_30_asr
|
2 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,0.25954549636581103,0.5284951114826634,0.9221892864704637,0.4624736472241743,0.337184855698226,0.9866395307075302,0.9818897503814326,0.8520208370756243
|
3 |
-
MERaLiON-AudioLLM-v2-2b,0.08547244456711749,0.13853008043879414,0.14196485284776625,0.1432185523541813,0.17842684134623737,0.19968394588770502,0.16825573283269715,0.014873360876594216
|
4 |
-
MERaLiON-AudioLLM-v2-9b,0.11334989419449812,0.15591770571023683,0.15646834639000634,0.16085734364019677,0.1722759890883186,0.20004788698671136,0.11314793912959634,0.018681516076881625
|
5 |
-
MERaLiON-AudioLLM-v2-9b-asr,0.07921611923820039,0.12871226564172622,0.1423883125132331,0.1383345045678145,0.16282383194620612,0.18238237758889023,0.09499798648962901,0.010670019759295851
|
6 |
-
Qwen2.5-Omni-3B,0.13731714049130556,1.0276387288835422,0.2463476603853483,1.3477160927617708,0.3110002953799107,0.4670274152998923,0.19581530154444754,0.4822705227231902
|
7 |
-
Qwen2.5-Omni-7B,0.18235348238108381,1.0684188526512177,0.22041075587550285,1.2090302178496135,0.26146334682814104,0.2936956781994493,0.22408385278119664,0.0984012933357284
|
8 |
-
SALMONN_7B,1.1888858220627472,1.4272941368377052,1.496294727927165,1.507519325368939,2.1181172136986777,1.2470441757452413,1.5460526688938172,1.1351535836177475
|
9 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.09977918851780293,0.23805397249380653,0.1567859411391065,0.2724525650035137,0.2191718937327333,0.276058900993655,0.17136958408249153,0.06815160768816239
|
10 |
-
cascade_whisper_large_v3_llama_3_8b_instruct,0.07815806421933941,0.24404355317218387,0.11676900275248782,0.28397751229796203,0.1926224523482703,0.20872022028013887,0.15538061017872032,0.031794503323154304
|
11 |
-
hy_whisper_local_cs,0.10267733922163952,0.31793713743921215,0.1681134871903451,0.33113141250878425,0.21382030476256667,0.26486292350053875,0.1781020821398794,0.076019400035926
|
12 |
-
phi_4_multimodal_instruct,1.327169012788665,1.1784589191228196,1.1070294304467498,1.7016514406184118,5.803850364012302,1.7344522925894887,2.5042567310800923,1.2856834920064666
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
results_organized/wer/asr_singlish.csv
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
Model,imda_part1_asr_test,imda_part2_asr_test,imda_part3_30s_asr_test,imda_part4_30s_asr_test,imda_part5_30s_asr_test,imda_part6_30s_asr_test
|
2 |
Qwen-Audio-Chat,0.1055031331529027,0.4547926304683061,0.6412550574306894,1.173131813552289,0.3016882870525747,0.3139424086306303
|
3 |
Qwen2-Audio-7B-Instruct,0.0719771779679613,0.1905689473257041,0.3507616694273223,0.5613424034000176,0.2785600677065853,0.2245352799625317
|
4 |
-
whisper_large_v3,0.0684417136030039,0.3171008846684522,0.2702636652456078,0.4618189591218298,0.2143555471246589,0.1698509342851144
|
5 |
old_models,,,,,,
|
6 |
gemini-1.5-flash,,,,,,
|
7 |
WavLLM_fairseq,0.1007729256577182,0.4463923382842302,0.7540934640345399,1.143645714142011,0.3979658840524726,0.4254106170965293
|
@@ -9,10 +8,11 @@ MERaLiON-AudioLLM-Whisper-SEA-LION,0.04303513520103382,0.0473581689797906,0.2129
|
|
9 |
MERaLiON-AudioLLM-v2-2b,0.049057615877892376,0.05819332846359873,0.26414044043772233,0.3595795244502006,0.20202536078562985,0.1493725673864242
|
10 |
MERaLiON-AudioLLM-v2-9b,0.051959134908443665,0.14532099667234802,0.22654574089662477,0.2948987161915779,0.16760298259181977,0.12655243140231592
|
11 |
MERaLiON-AudioLLM-v2-9b-asr,0.04362031550971643,0.054094635175716256,0.19622831075026476,0.24570911239925058,0.1403598371539887,0.0989680065892537
|
12 |
-
Qwen2.5-Omni-3B,0.
|
13 |
-
Qwen2.5-Omni-7B,0.
|
14 |
SALMONN_7B,0.09275107892619414,0.45783621459297136,0.681280039101746,0.7865181254636674,0.37533379054734356,0.25522053004731987
|
15 |
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.07053860970911661,0.3298433568703839,0.2810437993863198,0.4594298934979693,0.21829536997854984,0.17514817745764627
|
16 |
cascade_whisper_large_v3_llama_3_8b_instruct,0.06922195401458074,0.31912994075156237,0.2770250088250468,0.4581096203900464,0.21391778902978215,0.1722411537654032
|
17 |
hy_whisper_local_cs,0.06692999780557385,0.2735167600032465,0.25580416542210876,0.3612895924757007,0.186411988735025,0.14417222500363377
|
18 |
phi_4_multimodal_instruct,0.057615877892375586,0.3451018586153721,0.4381839411301491,1.4697028756805695,0.23859275364433613,0.1439784234241509
|
|
|
|
1 |
Model,imda_part1_asr_test,imda_part2_asr_test,imda_part3_30s_asr_test,imda_part4_30s_asr_test,imda_part5_30s_asr_test,imda_part6_30s_asr_test
|
2 |
Qwen-Audio-Chat,0.1055031331529027,0.4547926304683061,0.6412550574306894,1.173131813552289,0.3016882870525747,0.3139424086306303
|
3 |
Qwen2-Audio-7B-Instruct,0.0719771779679613,0.1905689473257041,0.3507616694273223,0.5613424034000176,0.2785600677065853,0.2245352799625317
|
|
|
4 |
old_models,,,,,,
|
5 |
gemini-1.5-flash,,,,,,
|
6 |
WavLLM_fairseq,0.1007729256577182,0.4463923382842302,0.7540934640345399,1.143645714142011,0.3979658840524726,0.4254106170965293
|
|
|
8 |
MERaLiON-AudioLLM-v2-2b,0.049057615877892376,0.05819332846359873,0.26414044043772233,0.3595795244502006,0.20202536078562985,0.1493725673864242
|
9 |
MERaLiON-AudioLLM-v2-9b,0.051959134908443665,0.14532099667234802,0.22654574089662477,0.2948987161915779,0.16760298259181977,0.12655243140231592
|
10 |
MERaLiON-AudioLLM-v2-9b-asr,0.04362031550971643,0.054094635175716256,0.19622831075026476,0.24570911239925058,0.1403598371539887,0.0989680065892537
|
11 |
+
Qwen2.5-Omni-3B,0.05298320044863824,0.0947975002029056,0.47520840687539034,1.2504495215581737,0.27988793392771155,0.18302944168994978
|
12 |
+
Qwen2.5-Omni-7B,0.05291005291005291,0.09410762113464816,0.5354359573139272,1.3034993524374756,0.37375786140578715,0.27471373891697215
|
13 |
SALMONN_7B,0.09275107892619414,0.45783621459297136,0.681280039101746,0.7865181254636674,0.37533379054734356,0.25522053004731987
|
14 |
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.07053860970911661,0.3298433568703839,0.2810437993863198,0.4594298934979693,0.21829536997854984,0.17514817745764627
|
15 |
cascade_whisper_large_v3_llama_3_8b_instruct,0.06922195401458074,0.31912994075156237,0.2770250088250468,0.4581096203900464,0.21391778902978215,0.1722411537654032
|
16 |
hy_whisper_local_cs,0.06692999780557385,0.2735167600032465,0.25580416542210876,0.3612895924757007,0.186411988735025,0.14417222500363377
|
17 |
phi_4_multimodal_instruct,0.057615877892375586,0.3451018586153721,0.4381839411301491,1.4697028756805695,0.23859275364433613,0.1439784234241509
|
18 |
+
whisper_large_v3,0.06853924365445102,0.3183183183183183,0.31976538952399053,0.5026468332306454,0.23660825028089477,0.19798446357337812
|
results_organized/wer/asr_tamil.csv
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,commonvoice_17_ta_asr,fleurs_tamil_ta_30_asr,ytb_asr_batch3_tamil
|
2 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.5284951114826634,0.4624736472241743,0.6929759165018962
|
3 |
+
MERaLiON-AudioLLM-v2-2b,0.13853008043879414,0.1432185523541813,0.7504943113675407
|
4 |
+
MERaLiON-AudioLLM-v2-9b,0.15591770571023683,0.16085734364019677,0.6644679264853651
|
5 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.12871226564172622,0.1383345045678145,0.5467894071504975
|
6 |
+
Qwen2.5-Omni-3B,0.8307319012713203,1.653935347856641,1.4607630222683219
|
7 |
+
Qwen2.5-Omni-7B,0.8465494917777076,0.8666549543218552,1.3615441962983372
|
8 |
+
SALMONN_7B,1.4272941368377052,1.507519325368939,0.985267900554277
|
9 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.23805397249380653,0.2724525650035137,0.9665002755178114
|
10 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.24404355317218387,0.28397751229796203,0.8976532365239376
|
11 |
+
hy_whisper_local_cs,0.31793713743921215,0.33113141250878425,0.8339924151567211
|
12 |
+
phi_4_multimodal_instruct,1.1784589191228196,1.7016514406184118,2.7500567242552916
|
13 |
+
whisper_large_v3,0.2713203584572879,0.276317638791286,0.8413665683446242
|
results_organized/wer/asr_thai.csv
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,gigaspeech2_th_test,lotus_thai_th_30_asr
|
2 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.9866395307075302,0.8520208370756243
|
3 |
+
MERaLiON-AudioLLM-v2-2b,0.19968394588770502,0.014873360876594216
|
4 |
+
MERaLiON-AudioLLM-v2-9b,0.20004788698671136,0.018681516076881625
|
5 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.18238237758889023,0.010670019759295851
|
6 |
+
Qwen2.5-Omni-3B,0.3000742248294026,0.026225974492545358
|
7 |
+
Qwen2.5-Omni-7B,0.23150963725607565,0.021483743488413868
|
8 |
+
SALMONN_7B,1.2470441757452413,1.1351535836177475
|
9 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.276058900993655,0.06815160768816239
|
10 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.20872022028013887,0.031794503323154304
|
11 |
+
hy_whisper_local_cs,0.26486292350053875,0.076019400035926
|
12 |
+
phi_4_multimodal_instruct,1.7344522925894887,1.2856834920064666
|
13 |
+
whisper_large_v3,0.22202801388722615,0.03933896173881803
|
results_organized/wer/asr_vietnamese.csv
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,commonvoice_17_vi_asr,gigaspeech2_vi_test
|
2 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.9221892864704637,0.9818897503814326
|
3 |
+
MERaLiON-AudioLLM-v2-2b,0.14196485284776625,0.16825573283269715
|
4 |
+
MERaLiON-AudioLLM-v2-9b,0.15646834639000634,0.11314793912959634
|
5 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.1423883125132331,0.09499798648962901
|
6 |
+
Qwen2.5-Omni-3B,0.19648528477662502,0.17708681916408126
|
7 |
+
Qwen2.5-Omni-7B,0.18367562989625238,0.22730546937479085
|
8 |
+
SALMONN_7B,1.496294727927165,1.5460526688938172
|
9 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.1567859411391065,0.17136958408249153
|
10 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.11676900275248782,0.15538061017872032
|
11 |
+
hy_whisper_local_cs,0.1681134871903451,0.1781020821398794
|
12 |
+
phi_4_multimodal_instruct,1.1070294304467498,2.5042567310800923
|
13 |
+
whisper_large_v3,0.12873173830192675,0.17700741312128138
|
results_organized_archive/bleu/st.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,covost2_en_id_test,covost2_en_zh_test,covost2_en_ta_test,covost2_id_en_test,covost2_zh_en_test,covost2_ta_en_test
|
2 |
+
Qwen-Audio-Chat,4.102230932924371,15.330641138043728,0.03451483807236294,0.45648619714728844,9.898238298955656,0.01699144301093184
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,37.60224687716629,43.941098854450516,14.407399367512914,44.43289180618449,18.76473995941838,5.023057608950299
|
4 |
+
hy_whisper_local_cs,1.0869208512565696,0.10573269629215352,0.008950516549431693,22.267131378964944,7.31707791416422,2.8610263518826757
|
5 |
+
Qwen2-Audio-7B-Instruct,16.325186897428104,25.765420247070075,0.03245972071872916,6.326113431899141,16.466557744958333,0.04425838146050298
|
6 |
+
whisper_large_v3,1.600581653970121,0.16408986541757878,0.02107778621423822,46.01512198258627,14.673689493155793,2.451098639578599
|
7 |
+
old_models,,,,,,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,27.620150160643625,35.274306071307024,8.433062902024755,46.80524126004861,15.209998552437538,2.8327095799289337
|
9 |
+
gemini-1.5-flash,,,,,,
|
10 |
+
WavLLM_fairseq,13.841886973016162,31.96381187282953,0.0033159224040994286,5.933522277713613,2.368659001743569,0.1695522548322915
|
11 |
+
SALMONN_7B,14.102682915273142,33.88941292215531,0.00046745670226766583,26.89649039333571,5.296039450108202,0.3649023706010388
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,10.930203684508578,5.987143868370054,1.0368044741318085,46.79924664837527,14.154700735606419,2.4245628096245917
|
results_organized_archive/llama3_70b_judge/accent_recognition.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,voxceleb_accent_test,imda_ar_sentence,imda_ar_dialogue
|
2 |
+
Qwen-Audio-Chat,48.05088223225277,3.933333333333333,0.6666666666666667
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,47.01682396389003,7.816666666666666,77.83333333333333
|
4 |
+
hy_whisper_local_cs,,,
|
5 |
+
Qwen2-Audio-7B-Instruct,29.187525646286417,2.55,0.9666666666666667
|
6 |
+
whisper_large_v3,,,
|
7 |
+
old_models,,,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,24.640951990151827,26.016666666666666,7.633333333333334
|
9 |
+
gemini-1.5-flash,,,
|
10 |
+
WavLLM_fairseq,39.96717275338531,2.6833333333333336,0.23333333333333336
|
11 |
+
SALMONN_7B,34.222404595814524,2.5166666666666666,0.06666666666666667
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,39.32704144439885,12.416666666666666,9.666666666666666
|
results_organized_archive/llama3_70b_judge/audio_captioning.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,audiocaps_test,wavcaps_test
|
2 |
+
Qwen-Audio-Chat,47.04090909090909,32.9364161849711
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,38.00454545454545,33.97687861271676
|
4 |
+
hy_whisper_local_cs,,
|
5 |
+
Qwen2-Audio-7B-Instruct,40.77727272727273,33.78034682080925
|
6 |
+
whisper_large_v3,,
|
7 |
+
old_models,,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,3.0954545454545457,6.3468208092485545
|
9 |
+
gemini-1.5-flash,,
|
10 |
+
WavLLM_fairseq,5.5,6.901734104046243
|
11 |
+
SALMONN_7B,37.445454545454545,23.76878612716763
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,2.4727272727272727,3.445086705202312
|
results_organized_archive/llama3_70b_judge/audio_scene_question_answering.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,clotho_aqa_test,audiocaps_qa_test,wavcaps_qa_test
|
2 |
+
Qwen-Audio-Chat,61.934856587263,50.22364217252396,42.69736842105263
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,63.15021876519203,49.77635782747604,46.31578947368421
|
4 |
+
hy_whisper_local_cs,,,
|
5 |
+
Qwen2-Audio-7B-Instruct,50.919591292758774,45.75079872204473,44.473684210526315
|
6 |
+
whisper_large_v3,,,
|
7 |
+
old_models,,,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,24.647544968400585,18.466453674121407,18.88157894736842
|
9 |
+
gemini-1.5-flash,,,
|
10 |
+
WavLLM_fairseq,43.01199466903598,29.840255591054312,26.25
|
11 |
+
SALMONN_7B,57.75401069518716,50.287539936102235,47.30263157894737
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,29.47134606841404,17.380191693290733,16.710526315789473
|
results_organized_archive/llama3_70b_judge/emotion_recognition.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,iemocap_emotion_test,meld_sentiment_test,meld_emotion_test
|
2 |
+
Qwen-Audio-Chat,29.382470119521916,44.90421455938697,50.72796934865901
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,48.505976095617534,46.206896551724135,36.36015325670498
|
4 |
+
hy_whisper_local_cs,,,
|
5 |
+
Qwen2-Audio-7B-Instruct,53.98406374501992,53.9463601532567,41.60919540229885
|
6 |
+
whisper_large_v3,,,
|
7 |
+
old_models,,,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,44.322709163346616,56.59003831417625,47.356321839080465
|
9 |
+
gemini-1.5-flash,,,
|
10 |
+
WavLLM_fairseq,59.76095617529881,51.072796934865906,41.57088122605364
|
11 |
+
SALMONN_7B,23.804780876494025,41.7624521072797,30.536398467432953
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,46.713147410358566,45.593869731800766,36.81992337164751
|
results_organized_archive/llama3_70b_judge/gender_recognition.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,voxceleb_gender_test,iemocap_gender_test,imda_gr_sentence,imda_gr_dialogue
|
2 |
+
Qwen-Audio-Chat,70.5990972507181,50.0996015936255,57.550000000000004,37.2
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,99.75379565038982,93.48605577689243,66.13333333333333,93.76666666666667
|
4 |
+
hy_whisper_local_cs,,,,
|
5 |
+
Qwen2-Audio-7B-Instruct,99.1177677472302,92.80876494023903,68.38333333333333,61.56666666666667
|
6 |
+
whisper_large_v3,,,,
|
7 |
+
old_models,,,,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,34.94050061551087,15.737051792828685,26.35,19.6
|
9 |
+
gemini-1.5-flash,,,,
|
10 |
+
WavLLM_fairseq,69.61427985227739,51.932270916334666,49.06666666666666,46.766666666666666
|
11 |
+
SALMONN_7B,88.79770209273697,81.31474103585658,59.766666666666666,42.733333333333334
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,42.921624948707425,44.22310756972111,36.016666666666666,25.433333333333337
|
results_organized_archive/llama3_70b_judge/music_understanding.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,muchomusic_test
|
2 |
+
Qwen-Audio-Chat,59.0564448188711
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,57.7927548441449
|
4 |
+
hy_whisper_local_cs,
|
5 |
+
Qwen2-Audio-7B-Instruct,71.60909856781802
|
6 |
+
whisper_large_v3,
|
7 |
+
old_models,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,51.727042965459134
|
9 |
+
gemini-1.5-flash,
|
10 |
+
WavLLM_fairseq,44.3133951137321
|
11 |
+
SALMONN_7B,50.88458298230834
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,56.44481887110362
|
results_organized_archive/llama3_70b_judge/sds_singlish.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,imda_part3_30s_ds_human_test,imda_part4_30s_ds_human_test,imda_part5_30s_ds_human_test,imda_part6_30s_ds_human_test
|
2 |
+
Qwen-Audio-Chat,16.4,16.0,28.2,40.4
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,48.4,46.4,57.0,62.599999999999994
|
4 |
+
hy_whisper_local_cs,,,,
|
5 |
+
Qwen2-Audio-7B-Instruct,33.8,24.8,40.4,46.2
|
6 |
+
whisper_large_v3,,,,
|
7 |
+
old_models,,,,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,45.4,44.0,58.0,65.4
|
9 |
+
gemini-1.5-flash,,,,
|
10 |
+
WavLLM_fairseq,31.6,31.6,45.199999999999996,49.400000000000006
|
11 |
+
SALMONN_7B,9.0,7.0,17.2,24.2
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,37.400000000000006,36.0,49.0,57.199999999999996
|
results_organized_archive/llama3_70b_judge/speech_instruction.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,openhermes_audio_test,alpaca_audio_test
|
2 |
+
Qwen-Audio-Chat,10.600000000000001,9.8
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,65.6,74.80000000000001
|
4 |
+
hy_whisper_local_cs,,
|
5 |
+
Qwen2-Audio-7B-Instruct,44.800000000000004,52.599999999999994
|
6 |
+
whisper_large_v3,,
|
7 |
+
old_models,,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,72.2,73.8
|
9 |
+
gemini-1.5-flash,,
|
10 |
+
WavLLM_fairseq,19.2,21.6
|
11 |
+
SALMONN_7B,15.8,17.2
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,63.0,70.8
|
results_organized_archive/llama3_70b_judge/sqa_english.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,slue_p2_sqa5_test,public_sg_speech_qa_test,spoken_squad_test,cn_college_listen_mcq_test,dream_tts_mcq_test
|
2 |
+
Qwen-Audio-Chat,79.36274509803921,63.16860465116279,64.8327415436367,63.232056362835756,59.749085206481965
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,86.76470588235293,59.7093023255814,73.66473556344609,88.50726552179657,84.31782540512285
|
4 |
+
hy_whisper_local_cs,,,,,
|
5 |
+
Qwen2-Audio-7B-Instruct,80.04901960784315,58.31395348837209,64.86264249672958,74.7247908410392,66.49242028227914
|
6 |
+
whisper_large_v3,,,,,
|
7 |
+
old_models,,,,,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,88.57843137254902,73.11046511627907,88.61894972902262,91.85380889476001,89.33612127548353
|
9 |
+
gemini-1.5-flash,,,,89.25583443416997,
|
10 |
+
WavLLM_fairseq,83.92156862745098,58.54651162790698,77.64903756307233,66.31439894319684,66.5446941975954
|
11 |
+
SALMONN_7B,83.48039215686273,59.24418604651163,66.39506634273968,50.99075297225891,56.455828541557764
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,82.99019607843137,64.94186046511628,83.81984675761541,85.2928225451343,86.4610559330894
|
results_organized_archive/llama3_70b_judge/sqa_singlish.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,imda_part3_30s_sqa_human_test,imda_part4_30s_sqa_human_test,imda_part5_30s_sqa_human_test,imda_part6_30s_sqa_human_test
|
2 |
+
Qwen-Audio-Chat,32.2,37.8,47.800000000000004,51.4
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,51.4,53.2,64.80000000000001,67.2
|
4 |
+
hy_whisper_local_cs,,,,
|
5 |
+
Qwen2-Audio-7B-Instruct,42.0,39.6,51.6,53.6
|
6 |
+
whisper_large_v3,,,,
|
7 |
+
old_models,,,,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,56.0,66.0,74.0,71.6
|
9 |
+
gemini-1.5-flash,,,,
|
10 |
+
WavLLM_fairseq,45.199999999999996,46.6,50.8,62.199999999999996
|
11 |
+
SALMONN_7B,40.599999999999994,36.6,44.6,46.8
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,49.0,53.8,57.800000000000004,64.0
|
results_organized_archive/llama3_70b_judge/under_development_llama3_70b_judge.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,ytb_sqa_batch1,ytb_sds_batch1,ytb_pqa_batch1
|
2 |
+
Qwen-Audio-Chat,60.827586206896555,43.878954607977995,37.16117216117216
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,64.51231527093596,53.97524071526823,40.97069597069597
|
4 |
+
hy_whisper_local_cs,60.137931034482754,60.9353507565337,45.78754578754578
|
5 |
+
Qwen2-Audio-7B-Instruct,60.453201970443345,51.5818431911967,36.97802197802198
|
6 |
+
whisper_large_v3,,,
|
7 |
+
old_models,,,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,70.18719211822659,64.12654745529574,55.01831501831502
|
9 |
+
gemini-1.5-flash,78.06896551724138,65.9697386519945,49.908424908424905
|
10 |
+
WavLLM_fairseq,60.70935960591133,55.625859697386524,40.95238095238095
|
11 |
+
SALMONN_7B,55.665024630541865,31.279229711141674,32.124542124542124
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,67.3103448275862,59.44979367262724,52.252747252747255
|
results_organized_archive/meteor/audio_captioning.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,audiocaps_test,wavcaps_test
|
2 |
+
Qwen-Audio-Chat,0.27553015076950976,0.2355106805560457
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.24920047034353812,0.3175511907248581
|
4 |
+
hy_whisper_local_cs,,
|
5 |
+
Qwen2-Audio-7B-Instruct,0.19891712076314283,0.21342294856199182
|
6 |
+
whisper_large_v3,,
|
7 |
+
old_models,,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.05796819723943051,0.120421856260385
|
9 |
+
gemini-1.5-flash,,
|
10 |
+
WavLLM_fairseq,0.041732965094428545,0.06399522524688675
|
11 |
+
SALMONN_7B,0.20994052484339956,0.17175112770658157
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.07953048457785493,0.1388630786594543
|
results_organized_archive/wer/asr_english.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,librispeech_test_clean,librispeech_test_other,common_voice_15_en_test,peoples_speech_test,gigaspeech_test,earnings21_test,earnings22_test,tedlium3_test,tedlium3_long_form_test
|
2 |
+
Qwen-Audio-Chat,0.020258799562379748,0.043467569561352074,0.11272421128398918,0.31419144746723354,0.13018910022587737,0.2655529121410546,0.3664994875132684,0.04052375714133636,0.2911540507002305
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.024333195005092994,0.04212457676811621,0.07789795695400416,0.21632867288683053,0.14468436081215577,0.1384587164122689,0.16563713100701868,0.08094105957914907,0.10501684098564085
|
4 |
+
hy_whisper_local_cs,0.02554042328441544,0.053417065466169825,0.1066766923091754,0.1991585778678581,0.0948233719154953,0.10871196540338629,0.1463228189913085,0.0467690997480572,0.05275660343910654
|
5 |
+
Qwen2-Audio-7B-Instruct,0.035141660693401744,0.060415760304159495,0.11438872500819404,0.2165498391593041,0.11723812890302816,0.18872219319407232,0.23542555661330924,0.06114048472375004,0.08739585179932637
|
6 |
+
whisper_large_v3,0.01878749009695552,0.03660128246354058,0.10001863741235596,0.14602420615337386,0.09459022434812692,0.11863959266711877,0.15887899737116104,0.037649480146197796,0.03208650948413402
|
7 |
+
old_models,,,,,,,,,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.032349945297468596,0.05307658841999735,0.10600831614192711,0.20140159998943682,0.09948381629977261,0.11416493424197618,0.1448629161356777,0.04900464852205386,0.04396383619925545
|
9 |
+
gemini-1.5-flash,,,,,,,,,
|
10 |
+
WavLLM_fairseq,0.02103218017882069,0.04798834811886432,0.14533325621300636,0.3792176325635977,0.15491778414546403,0.6447482518259942,0.6671766188447099,0.06621482559171073,0.4536784258110264
|
11 |
+
SALMONN_7B,0.10270871845172973,0.09671439650443565,0.3062255383962828,0.23699946689025367,0.10765150204693537,0.2577708974886327,0.3597423676988383,0.0459884319222171,0.14231519234178336
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.018334779492209605,0.03714982881570734,0.09876543209876543,0.14540692118393275,0.09515429104337297,0.11773910240019567,0.15611126487402763,0.038146268762641496,0.04754476156709803
|
results_organized_archive/wer/asr_mandarin.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,aishell_asr_zh_test
|
2 |
+
Qwen-Audio-Chat,0.9469917443725129
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.12812060739244918
|
4 |
+
hy_whisper_local_cs,0.16361782582011838
|
5 |
+
Qwen2-Audio-7B-Instruct,0.09260359129694522
|
6 |
+
whisper_large_v3,0.12359684029221357
|
7 |
+
old_models,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.20886539565639167
|
9 |
+
gemini-1.5-flash,
|
10 |
+
WavLLM_fairseq,0.7054601967888183
|
11 |
+
SALMONN_7B,0.8259290055631446
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.12450753301261111
|
results_organized_archive/wer/asr_singlish.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,imda_part1_asr_test,imda_part2_asr_test,imda_part3_30s_asr_test,imda_part4_30s_asr_test,imda_part5_30s_asr_test,imda_part6_30s_asr_test
|
2 |
+
Qwen-Audio-Chat,0.10550313315290274,0.45479263046830615,0.6412550574306894,1.173131813552289,0.3016882870525747,0.31394240863063033
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.042815692585277836,0.04719584449314179,0.2139462894072284,0.3002929748896629,0.15368227517473845,0.10833508293092589
|
4 |
+
hy_whisper_local_cs,0.06319947333772219,0.2719340962584206,0.23856138159502538,0.33742408429629445,0.16663991478309087,0.12873269917149824
|
5 |
+
Qwen2-Audio-7B-Instruct,0.07197717796796138,0.1905689473257041,0.35076166942732234,0.5613424034000176,0.27856006770658537,0.2245352799625317
|
6 |
+
whisper_large_v3,0.06844171360300393,0.3171008846684522,0.27026366524560785,0.4618189591218298,0.2143555471246589,0.1698509342851144
|
7 |
+
old_models,,,,,,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.07041669714480775,0.32988393799204613,0.3035544573275043,0.4779640131272869,0.22881615619208825,0.1789273082575623
|
9 |
+
gemini-1.5-flash,,,,,,
|
10 |
+
WavLLM_fairseq,0.10077292565771828,0.4463923382842302,0.7540934640345399,1.143645714142011,0.39796588405247263,0.42541061709652933
|
11 |
+
SALMONN_7B,0.0925804013361617,0.42346400454508565,0.6569229098215983,0.7593582215292535,0.34868891450584405,0.24872817713464365
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.06922195401458074,0.31912994075156237,0.29992939962527493,0.4750971343786543,0.22004640235805695,0.17467982364056267
|
results_organized_archive/wer/under_development_wer.csv
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,cna_test,idpc_test,parliament_test,ukusnews_test,mediacorp_test,idpc_short_test,parliament_short_test,ukusnews_short_test,mediacorp_short_test,ytb_asr_batch1,ytb_asr_batch2,seame_dev_man,seame_dev_sge,ytb_asr_batch3_malay,ytb_asr_batch3_ms_ms_prompt,ytb_asr_batch3_chinese,ytb_asr_batch3_zh_zh_prompt
|
2 |
+
Qwen-Audio-Chat,0.19753284203780838,0.7710863986313088,0.26279685873781816,0.3158631121194933,0.4498529892192094,0.6008025988916491,0.09347360821020603,0.10399586086125925,0.2548909377108163,0.2297764461857571,0.4315277327278625,0.8783373786407767,1.05567969634822,2.8890790224211313,2.8990790224211313,,
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.15924383210509452,0.30008554319931563,0.058922319992430694,0.12554358101720553,0.170859196341065,0.24918784635964075,0.056935097083623425,0.10144869855926132,0.13301101866426804,0.11484981178458939,0.15162720294085846,0.388282092772384,0.35550521901496834,0.289500241,0.3031898556447721,0.29155272919978803,0.28269210386857446
|
4 |
+
hy_whisper_local_cs,0.15710776460536152,0.19863130881094954,0.058638471000094616,0.07199848742673473,0.13124795818360013,0.17638066118861073,0.06559913359634872,0.07828544137546764,0.1154711041151338,0.11546439271721595,0.22990593577684074,0.3134101941747573,0.33199669411368576,,,,
|
5 |
+
Qwen2-Audio-7B-Instruct,0.2067713339741536,0.19093242087254064,0.23270886555019396,0.13843826810361126,0.18694870957203527,0.21326199120963119,0.08416492612361723,0.1194380323171217,0.17180121430177647,0.16843358684796805,0.2080008649583739,0.5522518878101402,0.5486546879304539,0.9251458909218551,0.9981132903339037
|
6 |
+
whisper_large_v3,0.13841717398269784,0.19880239520958085,0.0753619074652285,0.07135564378899603,0.12054884024828487,0.1662526275558953,0.05543951935226013,0.06168908700151238,0.11715763436024286,0.12226319428439733,0.17210509244242622,0.7225930420711975,0.5377268970583734,0.237374402,0.237374402,0.21278219395866454,0.21278219395866454
|
7 |
+
whisper_large_v2,,,,,,,,,,,,,,,,0.2802967673555909,0.2802967673555909
|
8 |
+
old_models,,,,,,,,,,,,,,,,,
|
9 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.15171419416853574,0.16766467065868262,0.06282524363705176,0.07388920400831915,0.12455080039202875,0.16931014714313014,0.07325752301384698,0.06877338215394412,0.14571621317742298,0.1400092187139894,0.2192622950819672,0.7824973031283711,0.5840399155162387,,
|
10 |
+
gemini-1.5-flash,,,,,,,,,,0.1089344703080587,,0.9690871089536138,1.1100431601824359,,
|
11 |
+
WavLLM_fairseq,0.26946491509131687,0.7686911890504705,0.5216434856656259,0.5911892607298166,0.3595230316889905,0.36728454041658704,0.09512390087929656,0.2066783411605508,0.2621992354396222,0.41876008296842593,0.48091685587631094,1.2913969795037756,1.2204842511249197,,
|
12 |
+
SALMONN_7B,0.15395706504325538,0.4550898203592814,0.3010928186204939,0.18918510115333712,0.32089186540346293,0.26313777947639977,0.08676929424202573,0.09042426172092653,0.1751742747919946,0.21487285856956287,0.3238620391393664,1.2721817691477886,1.0189782362484312,,
|
13 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.13798996048275125,0.17741659538066723,0.07517267480367111,0.07642276422764227,0.13598497223129696,0.15803554366520162,0.05742502771975968,0.0700867627159118,0.11434675061839443,0.12579703464700007,0.23561466104443723,0.6848705501618123,0.507882090054792,,
|
14 |
+
Phi4-Multimodal-Instruct,,,,,,,,,,,,,,,,0.3390567037625861,0.21534711181770005
|