Upload organize_model_results.json with huggingface_hub
Browse files- organize_model_results.json +39 -0
organize_model_results.json
CHANGED
@@ -22,6 +22,7 @@
|
|
22 |
"whisper_large_v3": 0.8294532718704128,
|
23 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.4757667842702995,
|
24 |
"phi_4_multimodal_instruct": 1.3868687388941825,
|
|
|
25 |
"WavLLM_fairseq": 1.2058793232211378,
|
26 |
"SALMONN_7B": 0.7757204295537071,
|
27 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.4715562308464886
|
@@ -34,6 +35,7 @@
|
|
34 |
"Qwen2-Audio-7B-Instruct": 0.21342294856199182,
|
35 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.120421856260385,
|
36 |
"phi_4_multimodal_instruct": 0.24508284335582894,
|
|
|
37 |
"WavLLM_fairseq": 0.06399522524688675,
|
38 |
"SALMONN_7B": 0.17175112770658157,
|
39 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.1388630786594543
|
@@ -60,6 +62,7 @@
|
|
60 |
"whisper_large_v3": 14.673689493155793,
|
61 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.209998552437538,
|
62 |
"phi_4_multimodal_instruct": 22.678131781242936,
|
|
|
63 |
"WavLLM_fairseq": 2.368659001743569,
|
64 |
"SALMONN_7B": 5.296039450108202,
|
65 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 14.154700735606419
|
@@ -72,6 +75,7 @@
|
|
72 |
"Qwen2-Audio-7B-Instruct": 53.6,
|
73 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 71.6,
|
74 |
"phi_4_multimodal_instruct": 66.2,
|
|
|
75 |
"WavLLM_fairseq": 62.199999999999996,
|
76 |
"SALMONN_7B": 46.8,
|
77 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.0
|
@@ -111,6 +115,7 @@
|
|
111 |
"Qwen2-Audio-7B-Instruct": 71.60909856781802,
|
112 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 51.727042965459134,
|
113 |
"phi_4_multimodal_instruct": 54.422914911541696,
|
|
|
114 |
"WavLLM_fairseq": 44.3133951137321,
|
115 |
"SALMONN_7B": 50.88458298230834,
|
116 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 56.44481887110362
|
@@ -136,6 +141,7 @@
|
|
136 |
"whisper_large_v3": 0.3171008846684522,
|
137 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.32988393799204613,
|
138 |
"phi_4_multimodal_instruct": 0.3470091713334957,
|
|
|
139 |
"WavLLM_fairseq": 0.4463923382842302,
|
140 |
"SALMONN_7B": 0.42346400454508565,
|
141 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.31912994075156237
|
@@ -203,6 +209,7 @@
|
|
203 |
"whisper_large_v3": 0.1698509342851144,
|
204 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1789273082575623,
|
205 |
"phi_4_multimodal_instruct": 0.14552883606001388,
|
|
|
206 |
"WavLLM_fairseq": 0.42541061709652933,
|
207 |
"SALMONN_7B": 0.24872817713464365,
|
208 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.17467982364056267
|
@@ -215,6 +222,7 @@
|
|
215 |
"Qwen2-Audio-7B-Instruct": 44.800000000000004,
|
216 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 72.2,
|
217 |
"phi_4_multimodal_instruct": 30.8,
|
|
|
218 |
"WavLLM_fairseq": 19.2,
|
219 |
"SALMONN_7B": 15.8,
|
220 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 63.0
|
@@ -242,6 +250,7 @@
|
|
242 |
"Qwen2-Audio-7B-Instruct": 58.31395348837209,
|
243 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 73.11046511627907,
|
244 |
"phi_4_multimodal_instruct": 68.40116279069767,
|
|
|
245 |
"WavLLM_fairseq": 58.54651162790698,
|
246 |
"SALMONN_7B": 59.24418604651163,
|
247 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.94186046511628
|
@@ -300,6 +309,7 @@
|
|
300 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.5840399155162387,
|
301 |
"gemini-1.5-flash": 1.1100431601824359,
|
302 |
"phi_4_multimodal_instruct": 0.8529492791331231,
|
|
|
303 |
"WavLLM_fairseq": 1.2204842511249197,
|
304 |
"SALMONN_7B": 1.0189782362484312,
|
305 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.507882090054792
|
@@ -312,6 +322,7 @@
|
|
312 |
"Qwen2-Audio-7B-Instruct": 53.9463601532567,
|
313 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.59003831417625,
|
314 |
"phi_4_multimodal_instruct": 51.609195402298845,
|
|
|
315 |
"WavLLM_fairseq": 51.072796934865906,
|
316 |
"SALMONN_7B": 41.7624521072797,
|
317 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 45.593869731800766
|
@@ -324,6 +335,7 @@
|
|
324 |
"Qwen2-Audio-7B-Instruct": 39.6,
|
325 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 66.0,
|
326 |
"phi_4_multimodal_instruct": 43.8,
|
|
|
327 |
"WavLLM_fairseq": 46.6,
|
328 |
"SALMONN_7B": 36.6,
|
329 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 53.8
|
@@ -351,6 +363,7 @@
|
|
351 |
"Qwen2-Audio-7B-Instruct": 61.56666666666667,
|
352 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 19.6,
|
353 |
"phi_4_multimodal_instruct": 36.833333333333336,
|
|
|
354 |
"WavLLM_fairseq": 46.766666666666666,
|
355 |
"SALMONN_7B": 42.733333333333334,
|
356 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 25.433333333333337
|
@@ -376,6 +389,7 @@
|
|
376 |
"whisper_large_v3": 0.2143555471246589,
|
377 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.22881615619208825,
|
378 |
"phi_4_multimodal_instruct": 0.22801359968481416,
|
|
|
379 |
"WavLLM_fairseq": 0.39796588405247263,
|
380 |
"SALMONN_7B": 0.34868891450584405,
|
381 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.22004640235805695
|
@@ -400,6 +414,7 @@
|
|
400 |
"whisper_large_v3": 0.15887899737116104,
|
401 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1448629161356777,
|
402 |
"phi_4_multimodal_instruct": 0.24134627375003423,
|
|
|
403 |
"WavLLM_fairseq": 0.6671766188447099,
|
404 |
"SALMONN_7B": 0.3597423676988383,
|
405 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.15611126487402763
|
@@ -437,6 +452,7 @@
|
|
437 |
"whisper_large_v3": 46.01512198258627,
|
438 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 46.80524126004861,
|
439 |
"phi_4_multimodal_instruct": 0.36465303013961253,
|
|
|
440 |
"WavLLM_fairseq": 5.933522277713613,
|
441 |
"SALMONN_7B": 26.89649039333571,
|
442 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 46.79924664837527
|
@@ -474,6 +490,7 @@
|
|
474 |
"whisper_large_v3": 0.09459022434812692,
|
475 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.09948381629977261,
|
476 |
"phi_4_multimodal_instruct": 0.09672866386388193,
|
|
|
477 |
"WavLLM_fairseq": 0.15491778414546403,
|
478 |
"SALMONN_7B": 0.10765150204693537,
|
479 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.09515429104337297
|
@@ -498,6 +515,7 @@
|
|
498 |
"Qwen2-Audio-7B-Instruct": 33.8,
|
499 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 45.4,
|
500 |
"phi_4_multimodal_instruct": 41.2,
|
|
|
501 |
"WavLLM_fairseq": 31.6,
|
502 |
"SALMONN_7B": 9.0,
|
503 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 37.400000000000006
|
@@ -513,6 +531,7 @@
|
|
513 |
"Qwen2-Audio-7B-Instruct": 0.9666666666666667,
|
514 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 7.633333333333334,
|
515 |
"phi_4_multimodal_instruct": 0.5333333333333333,
|
|
|
516 |
"WavLLM_fairseq": 0.23333333333333336,
|
517 |
"SALMONN_7B": 0.06666666666666667,
|
518 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 9.666666666666666
|
@@ -525,6 +544,7 @@
|
|
525 |
"Qwen2-Audio-7B-Instruct": 92.80876494023903,
|
526 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.737051792828685,
|
527 |
"phi_4_multimodal_instruct": 59.46215139442231,
|
|
|
528 |
"WavLLM_fairseq": 51.932270916334666,
|
529 |
"SALMONN_7B": 81.31474103585658,
|
530 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 44.22310756972111
|
@@ -575,6 +595,7 @@
|
|
575 |
"Qwen2-Audio-7B-Instruct": 66.49242028227914,
|
576 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 89.33612127548353,
|
577 |
"phi_4_multimodal_instruct": 72.60846837428123,
|
|
|
578 |
"WavLLM_fairseq": 66.5446941975954,
|
579 |
"SALMONN_7B": 56.455828541557764,
|
580 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 86.4610559330894
|
@@ -587,6 +608,7 @@
|
|
587 |
"Qwen2-Audio-7B-Instruct": 40.4,
|
588 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 58.0,
|
589 |
"phi_4_multimodal_instruct": 52.199999999999996,
|
|
|
590 |
"WavLLM_fairseq": 45.199999999999996,
|
591 |
"SALMONN_7B": 17.2,
|
592 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
|
@@ -616,6 +638,7 @@
|
|
616 |
"Qwen2-Audio-7B-Instruct": 42.0,
|
617 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.0,
|
618 |
"phi_4_multimodal_instruct": 43.8,
|
|
|
619 |
"WavLLM_fairseq": 45.199999999999996,
|
620 |
"SALMONN_7B": 40.599999999999994,
|
621 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
|
@@ -631,6 +654,7 @@
|
|
631 |
"Qwen2-Audio-7B-Instruct": 24.8,
|
632 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 44.0,
|
633 |
"phi_4_multimodal_instruct": 37.0,
|
|
|
634 |
"WavLLM_fairseq": 31.6,
|
635 |
"SALMONN_7B": 7.0,
|
636 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.0
|
@@ -668,6 +692,7 @@
|
|
668 |
"Qwen2-Audio-7B-Instruct": 0.19891712076314283,
|
669 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.05796819723943051,
|
670 |
"phi_4_multimodal_instruct": 0.1757379026471828,
|
|
|
671 |
"WavLLM_fairseq": 0.041732965094428545,
|
672 |
"SALMONN_7B": 0.20994052484339956,
|
673 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.07953048457785493
|
@@ -683,6 +708,7 @@
|
|
683 |
"Qwen2-Audio-7B-Instruct": 2.55,
|
684 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.016666666666666,
|
685 |
"phi_4_multimodal_instruct": 3.5166666666666666,
|
|
|
686 |
"WavLLM_fairseq": 2.6833333333333336,
|
687 |
"SALMONN_7B": 2.5166666666666666,
|
688 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 12.416666666666666
|
@@ -708,6 +734,7 @@
|
|
708 |
"whisper_large_v3": 2.451098639578599,
|
709 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 2.8327095799289337,
|
710 |
"phi_4_multimodal_instruct": 0.053138495633157125,
|
|
|
711 |
"WavLLM_fairseq": 0.1695522548322915,
|
712 |
"SALMONN_7B": 0.3649023706010388,
|
713 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 2.4245628096245917
|
@@ -733,6 +760,7 @@
|
|
733 |
"Qwen2-Audio-7B-Instruct": 50.919591292758774,
|
734 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 24.647544968400585,
|
735 |
"phi_4_multimodal_instruct": 47.86582401555663,
|
|
|
736 |
"WavLLM_fairseq": 43.01199466903598,
|
737 |
"SALMONN_7B": 57.75401069518716,
|
738 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 29.47134606841404
|
@@ -763,6 +791,7 @@
|
|
763 |
"whisper_large_v3": 0.27026366524560785,
|
764 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.3035544573275043,
|
765 |
"phi_4_multimodal_instruct": 0.44227061666711925,
|
|
|
766 |
"WavLLM_fairseq": 0.7540934640345399,
|
767 |
"SALMONN_7B": 0.6569229098215983,
|
768 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.29992939962527493
|
@@ -813,6 +842,7 @@
|
|
813 |
"Qwen2-Audio-7B-Instruct": 45.75079872204473,
|
814 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.466453674121407,
|
815 |
"phi_4_multimodal_instruct": 38.466453674121404,
|
|
|
816 |
"WavLLM_fairseq": 29.840255591054312,
|
817 |
"SALMONN_7B": 50.287539936102235,
|
818 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 17.380191693290733
|
@@ -829,6 +859,7 @@
|
|
829 |
"whisper_large_v3": 0.06844171360300393,
|
830 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.07041669714480775,
|
831 |
"phi_4_multimodal_instruct": 0.05739643527661961,
|
|
|
832 |
"WavLLM_fairseq": 0.10077292565771828,
|
833 |
"SALMONN_7B": 0.0925804013361617,
|
834 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.06922195401458074
|
@@ -841,6 +872,7 @@
|
|
841 |
"Qwen2-Audio-7B-Instruct": 0.2165498391593041,
|
842 |
"whisper_large_v3": 0.14602420615337386,
|
843 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.20140159998943682,
|
|
|
844 |
"WavLLM_fairseq": 0.3792176325635977,
|
845 |
"SALMONN_7B": 0.23699946689025367,
|
846 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.14540692118393275
|
@@ -866,6 +898,7 @@
|
|
866 |
"Qwen2-Audio-7B-Instruct": 44.473684210526315,
|
867 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.88157894736842,
|
868 |
"phi_4_multimodal_instruct": 35.13157894736842,
|
|
|
869 |
"WavLLM_fairseq": 26.25,
|
870 |
"SALMONN_7B": 47.30263157894737,
|
871 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 16.710526315789473
|
@@ -1014,6 +1047,7 @@
|
|
1014 |
"Qwen2-Audio-7B-Instruct": 68.38333333333333,
|
1015 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.35,
|
1016 |
"phi_4_multimodal_instruct": 51.68333333333334,
|
|
|
1017 |
"WavLLM_fairseq": 49.06666666666666,
|
1018 |
"SALMONN_7B": 59.766666666666666,
|
1019 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.016666666666666
|
@@ -1026,6 +1060,7 @@
|
|
1026 |
"Qwen2-Audio-7B-Instruct": 80.04901960784315,
|
1027 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 88.57843137254902,
|
1028 |
"phi_4_multimodal_instruct": 88.33333333333334,
|
|
|
1029 |
"WavLLM_fairseq": 83.92156862745098,
|
1030 |
"SALMONN_7B": 83.48039215686273,
|
1031 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 82.99019607843137
|
@@ -1055,6 +1090,7 @@
|
|
1055 |
"Qwen2-Audio-7B-Instruct": 41.60919540229885,
|
1056 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 47.356321839080465,
|
1057 |
"phi_4_multimodal_instruct": 43.524904214559385,
|
|
|
1058 |
"WavLLM_fairseq": 41.57088122605364,
|
1059 |
"SALMONN_7B": 30.536398467432953,
|
1060 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.81992337164751
|
@@ -1083,6 +1119,7 @@
|
|
1083 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.7824973031283711,
|
1084 |
"gemini-1.5-flash": 0.9690871089536138,
|
1085 |
"phi_4_multimodal_instruct": 0.7126483279395901,
|
|
|
1086 |
"WavLLM_fairseq": 1.2913969795037756,
|
1087 |
"SALMONN_7B": 1.2721817691477886,
|
1088 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.6848705501618123
|
@@ -1107,6 +1144,7 @@
|
|
1107 |
"Qwen2-Audio-7B-Instruct": 51.6,
|
1108 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 74.0,
|
1109 |
"phi_4_multimodal_instruct": 49.0,
|
|
|
1110 |
"WavLLM_fairseq": 50.8,
|
1111 |
"SALMONN_7B": 44.6,
|
1112 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 57.800000000000004
|
@@ -1122,6 +1160,7 @@
|
|
1122 |
"Qwen2-Audio-7B-Instruct": 46.2,
|
1123 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 65.4,
|
1124 |
"phi_4_multimodal_instruct": 52.599999999999994,
|
|
|
1125 |
"WavLLM_fairseq": 49.400000000000006,
|
1126 |
"SALMONN_7B": 24.2,
|
1127 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 57.199999999999996
|
|
|
22 |
"whisper_large_v3": 0.8294532718704128,
|
23 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.4757667842702995,
|
24 |
"phi_4_multimodal_instruct": 1.3868687388941825,
|
25 |
+
"seallms_audio_7b": 1.8960881769720068,
|
26 |
"WavLLM_fairseq": 1.2058793232211378,
|
27 |
"SALMONN_7B": 0.7757204295537071,
|
28 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.4715562308464886
|
|
|
35 |
"Qwen2-Audio-7B-Instruct": 0.21342294856199182,
|
36 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.120421856260385,
|
37 |
"phi_4_multimodal_instruct": 0.24508284335582894,
|
38 |
+
"seallms_audio_7b": 0.1444387454989207,
|
39 |
"WavLLM_fairseq": 0.06399522524688675,
|
40 |
"SALMONN_7B": 0.17175112770658157,
|
41 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.1388630786594543
|
|
|
62 |
"whisper_large_v3": 14.673689493155793,
|
63 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.209998552437538,
|
64 |
"phi_4_multimodal_instruct": 22.678131781242936,
|
65 |
+
"seallms_audio_7b": 18.79451062979056,
|
66 |
"WavLLM_fairseq": 2.368659001743569,
|
67 |
"SALMONN_7B": 5.296039450108202,
|
68 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 14.154700735606419
|
|
|
75 |
"Qwen2-Audio-7B-Instruct": 53.6,
|
76 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 71.6,
|
77 |
"phi_4_multimodal_instruct": 66.2,
|
78 |
+
"seallms_audio_7b": 58.2,
|
79 |
"WavLLM_fairseq": 62.199999999999996,
|
80 |
"SALMONN_7B": 46.8,
|
81 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.0
|
|
|
115 |
"Qwen2-Audio-7B-Instruct": 71.60909856781802,
|
116 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 51.727042965459134,
|
117 |
"phi_4_multimodal_instruct": 54.422914911541696,
|
118 |
+
"seallms_audio_7b": 63.184498736310026,
|
119 |
"WavLLM_fairseq": 44.3133951137321,
|
120 |
"SALMONN_7B": 50.88458298230834,
|
121 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 56.44481887110362
|
|
|
141 |
"whisper_large_v3": 0.3171008846684522,
|
142 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.32988393799204613,
|
143 |
"phi_4_multimodal_instruct": 0.3470091713334957,
|
144 |
+
"seallms_audio_7b": 0.290236182128074,
|
145 |
"WavLLM_fairseq": 0.4463923382842302,
|
146 |
"SALMONN_7B": 0.42346400454508565,
|
147 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.31912994075156237
|
|
|
209 |
"whisper_large_v3": 0.1698509342851144,
|
210 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1789273082575623,
|
211 |
"phi_4_multimodal_instruct": 0.14552883606001388,
|
212 |
+
"seallms_audio_7b": 0.6259629515980555,
|
213 |
"WavLLM_fairseq": 0.42541061709652933,
|
214 |
"SALMONN_7B": 0.24872817713464365,
|
215 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.17467982364056267
|
|
|
222 |
"Qwen2-Audio-7B-Instruct": 44.800000000000004,
|
223 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 72.2,
|
224 |
"phi_4_multimodal_instruct": 30.8,
|
225 |
+
"seallms_audio_7b": 63.8,
|
226 |
"WavLLM_fairseq": 19.2,
|
227 |
"SALMONN_7B": 15.8,
|
228 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 63.0
|
|
|
250 |
"Qwen2-Audio-7B-Instruct": 58.31395348837209,
|
251 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 73.11046511627907,
|
252 |
"phi_4_multimodal_instruct": 68.40116279069767,
|
253 |
+
"seallms_audio_7b": 57.587209302325576,
|
254 |
"WavLLM_fairseq": 58.54651162790698,
|
255 |
"SALMONN_7B": 59.24418604651163,
|
256 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.94186046511628
|
|
|
309 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.5840399155162387,
|
310 |
"gemini-1.5-flash": 1.1100431601824359,
|
311 |
"phi_4_multimodal_instruct": 0.8529492791331231,
|
312 |
+
"seallms_audio_7b": 1.7106737273868193,
|
313 |
"WavLLM_fairseq": 1.2204842511249197,
|
314 |
"SALMONN_7B": 1.0189782362484312,
|
315 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.507882090054792
|
|
|
322 |
"Qwen2-Audio-7B-Instruct": 53.9463601532567,
|
323 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.59003831417625,
|
324 |
"phi_4_multimodal_instruct": 51.609195402298845,
|
325 |
+
"seallms_audio_7b": 52.1455938697318,
|
326 |
"WavLLM_fairseq": 51.072796934865906,
|
327 |
"SALMONN_7B": 41.7624521072797,
|
328 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 45.593869731800766
|
|
|
335 |
"Qwen2-Audio-7B-Instruct": 39.6,
|
336 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 66.0,
|
337 |
"phi_4_multimodal_instruct": 43.8,
|
338 |
+
"seallms_audio_7b": 45.0,
|
339 |
"WavLLM_fairseq": 46.6,
|
340 |
"SALMONN_7B": 36.6,
|
341 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 53.8
|
|
|
363 |
"Qwen2-Audio-7B-Instruct": 61.56666666666667,
|
364 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 19.6,
|
365 |
"phi_4_multimodal_instruct": 36.833333333333336,
|
366 |
+
"seallms_audio_7b": 30.5,
|
367 |
"WavLLM_fairseq": 46.766666666666666,
|
368 |
"SALMONN_7B": 42.733333333333334,
|
369 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 25.433333333333337
|
|
|
389 |
"whisper_large_v3": 0.2143555471246589,
|
390 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.22881615619208825,
|
391 |
"phi_4_multimodal_instruct": 0.22801359968481416,
|
392 |
+
"seallms_audio_7b": 0.5812260145043848,
|
393 |
"WavLLM_fairseq": 0.39796588405247263,
|
394 |
"SALMONN_7B": 0.34868891450584405,
|
395 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.22004640235805695
|
|
|
414 |
"whisper_large_v3": 0.15887899737116104,
|
415 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1448629161356777,
|
416 |
"phi_4_multimodal_instruct": 0.24134627375003423,
|
417 |
+
"seallms_audio_7b": 0.5738685499413504,
|
418 |
"WavLLM_fairseq": 0.6671766188447099,
|
419 |
"SALMONN_7B": 0.3597423676988383,
|
420 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.15611126487402763
|
|
|
452 |
"whisper_large_v3": 46.01512198258627,
|
453 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 46.80524126004861,
|
454 |
"phi_4_multimodal_instruct": 0.36465303013961253,
|
455 |
+
"seallms_audio_7b": 43.98074943006231,
|
456 |
"WavLLM_fairseq": 5.933522277713613,
|
457 |
"SALMONN_7B": 26.89649039333571,
|
458 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 46.79924664837527
|
|
|
490 |
"whisper_large_v3": 0.09459022434812692,
|
491 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.09948381629977261,
|
492 |
"phi_4_multimodal_instruct": 0.09672866386388193,
|
493 |
+
"seallms_audio_7b": 0.13672725996455154,
|
494 |
"WavLLM_fairseq": 0.15491778414546403,
|
495 |
"SALMONN_7B": 0.10765150204693537,
|
496 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.09515429104337297
|
|
|
515 |
"Qwen2-Audio-7B-Instruct": 33.8,
|
516 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 45.4,
|
517 |
"phi_4_multimodal_instruct": 41.2,
|
518 |
+
"seallms_audio_7b": 43.0,
|
519 |
"WavLLM_fairseq": 31.6,
|
520 |
"SALMONN_7B": 9.0,
|
521 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 37.400000000000006
|
|
|
531 |
"Qwen2-Audio-7B-Instruct": 0.9666666666666667,
|
532 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 7.633333333333334,
|
533 |
"phi_4_multimodal_instruct": 0.5333333333333333,
|
534 |
+
"seallms_audio_7b": 15.633333333333333,
|
535 |
"WavLLM_fairseq": 0.23333333333333336,
|
536 |
"SALMONN_7B": 0.06666666666666667,
|
537 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 9.666666666666666
|
|
|
544 |
"Qwen2-Audio-7B-Instruct": 92.80876494023903,
|
545 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.737051792828685,
|
546 |
"phi_4_multimodal_instruct": 59.46215139442231,
|
547 |
+
"seallms_audio_7b": 66.43426294820716,
|
548 |
"WavLLM_fairseq": 51.932270916334666,
|
549 |
"SALMONN_7B": 81.31474103585658,
|
550 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 44.22310756972111
|
|
|
595 |
"Qwen2-Audio-7B-Instruct": 66.49242028227914,
|
596 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 89.33612127548353,
|
597 |
"phi_4_multimodal_instruct": 72.60846837428123,
|
598 |
+
"seallms_audio_7b": 75.6926293779404,
|
599 |
"WavLLM_fairseq": 66.5446941975954,
|
600 |
"SALMONN_7B": 56.455828541557764,
|
601 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 86.4610559330894
|
|
|
608 |
"Qwen2-Audio-7B-Instruct": 40.4,
|
609 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 58.0,
|
610 |
"phi_4_multimodal_instruct": 52.199999999999996,
|
611 |
+
"seallms_audio_7b": 49.400000000000006,
|
612 |
"WavLLM_fairseq": 45.199999999999996,
|
613 |
"SALMONN_7B": 17.2,
|
614 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
|
|
|
638 |
"Qwen2-Audio-7B-Instruct": 42.0,
|
639 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.0,
|
640 |
"phi_4_multimodal_instruct": 43.8,
|
641 |
+
"seallms_audio_7b": 45.599999999999994,
|
642 |
"WavLLM_fairseq": 45.199999999999996,
|
643 |
"SALMONN_7B": 40.599999999999994,
|
644 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
|
|
|
654 |
"Qwen2-Audio-7B-Instruct": 24.8,
|
655 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 44.0,
|
656 |
"phi_4_multimodal_instruct": 37.0,
|
657 |
+
"seallms_audio_7b": 35.4,
|
658 |
"WavLLM_fairseq": 31.6,
|
659 |
"SALMONN_7B": 7.0,
|
660 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.0
|
|
|
692 |
"Qwen2-Audio-7B-Instruct": 0.19891712076314283,
|
693 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.05796819723943051,
|
694 |
"phi_4_multimodal_instruct": 0.1757379026471828,
|
695 |
+
"seallms_audio_7b": 0.30423899385222564,
|
696 |
"WavLLM_fairseq": 0.041732965094428545,
|
697 |
"SALMONN_7B": 0.20994052484339956,
|
698 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.07953048457785493
|
|
|
708 |
"Qwen2-Audio-7B-Instruct": 2.55,
|
709 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.016666666666666,
|
710 |
"phi_4_multimodal_instruct": 3.5166666666666666,
|
711 |
+
"seallms_audio_7b": 3.5833333333333335,
|
712 |
"WavLLM_fairseq": 2.6833333333333336,
|
713 |
"SALMONN_7B": 2.5166666666666666,
|
714 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 12.416666666666666
|
|
|
734 |
"whisper_large_v3": 2.451098639578599,
|
735 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 2.8327095799289337,
|
736 |
"phi_4_multimodal_instruct": 0.053138495633157125,
|
737 |
+
"seallms_audio_7b": 0.06475917031217593,
|
738 |
"WavLLM_fairseq": 0.1695522548322915,
|
739 |
"SALMONN_7B": 0.3649023706010388,
|
740 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 2.4245628096245917
|
|
|
760 |
"Qwen2-Audio-7B-Instruct": 50.919591292758774,
|
761 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 24.647544968400585,
|
762 |
"phi_4_multimodal_instruct": 47.86582401555663,
|
763 |
+
"seallms_audio_7b": 53.03840544482256,
|
764 |
"WavLLM_fairseq": 43.01199466903598,
|
765 |
"SALMONN_7B": 57.75401069518716,
|
766 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 29.47134606841404
|
|
|
791 |
"whisper_large_v3": 0.27026366524560785,
|
792 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.3035544573275043,
|
793 |
"phi_4_multimodal_instruct": 0.44227061666711925,
|
794 |
+
"seallms_audio_7b": 1.0837293290249002,
|
795 |
"WavLLM_fairseq": 0.7540934640345399,
|
796 |
"SALMONN_7B": 0.6569229098215983,
|
797 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.29992939962527493
|
|
|
842 |
"Qwen2-Audio-7B-Instruct": 45.75079872204473,
|
843 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.466453674121407,
|
844 |
"phi_4_multimodal_instruct": 38.466453674121404,
|
845 |
+
"seallms_audio_7b": 53.73801916932908,
|
846 |
"WavLLM_fairseq": 29.840255591054312,
|
847 |
"SALMONN_7B": 50.287539936102235,
|
848 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 17.380191693290733
|
|
|
859 |
"whisper_large_v3": 0.06844171360300393,
|
860 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.07041669714480775,
|
861 |
"phi_4_multimodal_instruct": 0.05739643527661961,
|
862 |
+
"seallms_audio_7b": 0.17813863896813206,
|
863 |
"WavLLM_fairseq": 0.10077292565771828,
|
864 |
"SALMONN_7B": 0.0925804013361617,
|
865 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.06922195401458074
|
|
|
872 |
"Qwen2-Audio-7B-Instruct": 0.2165498391593041,
|
873 |
"whisper_large_v3": 0.14602420615337386,
|
874 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.20140159998943682,
|
875 |
+
"seallms_audio_7b": 0.369768551146351,
|
876 |
"WavLLM_fairseq": 0.3792176325635977,
|
877 |
"SALMONN_7B": 0.23699946689025367,
|
878 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.14540692118393275
|
|
|
898 |
"Qwen2-Audio-7B-Instruct": 44.473684210526315,
|
899 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.88157894736842,
|
900 |
"phi_4_multimodal_instruct": 35.13157894736842,
|
901 |
+
"seallms_audio_7b": 42.10526315789473,
|
902 |
"WavLLM_fairseq": 26.25,
|
903 |
"SALMONN_7B": 47.30263157894737,
|
904 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 16.710526315789473
|
|
|
1047 |
"Qwen2-Audio-7B-Instruct": 68.38333333333333,
|
1048 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.35,
|
1049 |
"phi_4_multimodal_instruct": 51.68333333333334,
|
1050 |
+
"seallms_audio_7b": 50.083333333333336,
|
1051 |
"WavLLM_fairseq": 49.06666666666666,
|
1052 |
"SALMONN_7B": 59.766666666666666,
|
1053 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.016666666666666
|
|
|
1060 |
"Qwen2-Audio-7B-Instruct": 80.04901960784315,
|
1061 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 88.57843137254902,
|
1062 |
"phi_4_multimodal_instruct": 88.33333333333334,
|
1063 |
+
"seallms_audio_7b": 83.52941176470588,
|
1064 |
"WavLLM_fairseq": 83.92156862745098,
|
1065 |
"SALMONN_7B": 83.48039215686273,
|
1066 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 82.99019607843137
|
|
|
1090 |
"Qwen2-Audio-7B-Instruct": 41.60919540229885,
|
1091 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 47.356321839080465,
|
1092 |
"phi_4_multimodal_instruct": 43.524904214559385,
|
1093 |
+
"seallms_audio_7b": 51.11111111111111,
|
1094 |
"WavLLM_fairseq": 41.57088122605364,
|
1095 |
"SALMONN_7B": 30.536398467432953,
|
1096 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.81992337164751
|
|
|
1119 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.7824973031283711,
|
1120 |
"gemini-1.5-flash": 0.9690871089536138,
|
1121 |
"phi_4_multimodal_instruct": 0.7126483279395901,
|
1122 |
+
"seallms_audio_7b": 1.0639495685005393,
|
1123 |
"WavLLM_fairseq": 1.2913969795037756,
|
1124 |
"SALMONN_7B": 1.2721817691477886,
|
1125 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.6848705501618123
|
|
|
1144 |
"Qwen2-Audio-7B-Instruct": 51.6,
|
1145 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 74.0,
|
1146 |
"phi_4_multimodal_instruct": 49.0,
|
1147 |
+
"seallms_audio_7b": 54.2,
|
1148 |
"WavLLM_fairseq": 50.8,
|
1149 |
"SALMONN_7B": 44.6,
|
1150 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 57.800000000000004
|
|
|
1160 |
"Qwen2-Audio-7B-Instruct": 46.2,
|
1161 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 65.4,
|
1162 |
"phi_4_multimodal_instruct": 52.599999999999994,
|
1163 |
+
"seallms_audio_7b": 53.0,
|
1164 |
"WavLLM_fairseq": 49.400000000000006,
|
1165 |
"SALMONN_7B": 24.2,
|
1166 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 57.199999999999996
|