binwang commited on
Commit
840cba2
·
verified ·
1 Parent(s): c2ca8a9

Upload organize_model_results.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. organize_model_results.json +39 -0
organize_model_results.json CHANGED
@@ -22,6 +22,7 @@
22
  "whisper_large_v3": 0.8294532718704128,
23
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.4757667842702995,
24
  "phi_4_multimodal_instruct": 1.3868687388941825,
 
25
  "WavLLM_fairseq": 1.2058793232211378,
26
  "SALMONN_7B": 0.7757204295537071,
27
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.4715562308464886
@@ -34,6 +35,7 @@
34
  "Qwen2-Audio-7B-Instruct": 0.21342294856199182,
35
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.120421856260385,
36
  "phi_4_multimodal_instruct": 0.24508284335582894,
 
37
  "WavLLM_fairseq": 0.06399522524688675,
38
  "SALMONN_7B": 0.17175112770658157,
39
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.1388630786594543
@@ -60,6 +62,7 @@
60
  "whisper_large_v3": 14.673689493155793,
61
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.209998552437538,
62
  "phi_4_multimodal_instruct": 22.678131781242936,
 
63
  "WavLLM_fairseq": 2.368659001743569,
64
  "SALMONN_7B": 5.296039450108202,
65
  "cascade_whisper_large_v3_llama_3_8b_instruct": 14.154700735606419
@@ -72,6 +75,7 @@
72
  "Qwen2-Audio-7B-Instruct": 53.6,
73
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 71.6,
74
  "phi_4_multimodal_instruct": 66.2,
 
75
  "WavLLM_fairseq": 62.199999999999996,
76
  "SALMONN_7B": 46.8,
77
  "cascade_whisper_large_v3_llama_3_8b_instruct": 64.0
@@ -111,6 +115,7 @@
111
  "Qwen2-Audio-7B-Instruct": 71.60909856781802,
112
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 51.727042965459134,
113
  "phi_4_multimodal_instruct": 54.422914911541696,
 
114
  "WavLLM_fairseq": 44.3133951137321,
115
  "SALMONN_7B": 50.88458298230834,
116
  "cascade_whisper_large_v3_llama_3_8b_instruct": 56.44481887110362
@@ -136,6 +141,7 @@
136
  "whisper_large_v3": 0.3171008846684522,
137
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.32988393799204613,
138
  "phi_4_multimodal_instruct": 0.3470091713334957,
 
139
  "WavLLM_fairseq": 0.4463923382842302,
140
  "SALMONN_7B": 0.42346400454508565,
141
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.31912994075156237
@@ -203,6 +209,7 @@
203
  "whisper_large_v3": 0.1698509342851144,
204
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1789273082575623,
205
  "phi_4_multimodal_instruct": 0.14552883606001388,
 
206
  "WavLLM_fairseq": 0.42541061709652933,
207
  "SALMONN_7B": 0.24872817713464365,
208
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.17467982364056267
@@ -215,6 +222,7 @@
215
  "Qwen2-Audio-7B-Instruct": 44.800000000000004,
216
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 72.2,
217
  "phi_4_multimodal_instruct": 30.8,
 
218
  "WavLLM_fairseq": 19.2,
219
  "SALMONN_7B": 15.8,
220
  "cascade_whisper_large_v3_llama_3_8b_instruct": 63.0
@@ -242,6 +250,7 @@
242
  "Qwen2-Audio-7B-Instruct": 58.31395348837209,
243
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 73.11046511627907,
244
  "phi_4_multimodal_instruct": 68.40116279069767,
 
245
  "WavLLM_fairseq": 58.54651162790698,
246
  "SALMONN_7B": 59.24418604651163,
247
  "cascade_whisper_large_v3_llama_3_8b_instruct": 64.94186046511628
@@ -300,6 +309,7 @@
300
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.5840399155162387,
301
  "gemini-1.5-flash": 1.1100431601824359,
302
  "phi_4_multimodal_instruct": 0.8529492791331231,
 
303
  "WavLLM_fairseq": 1.2204842511249197,
304
  "SALMONN_7B": 1.0189782362484312,
305
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.507882090054792
@@ -312,6 +322,7 @@
312
  "Qwen2-Audio-7B-Instruct": 53.9463601532567,
313
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.59003831417625,
314
  "phi_4_multimodal_instruct": 51.609195402298845,
 
315
  "WavLLM_fairseq": 51.072796934865906,
316
  "SALMONN_7B": 41.7624521072797,
317
  "cascade_whisper_large_v3_llama_3_8b_instruct": 45.593869731800766
@@ -324,6 +335,7 @@
324
  "Qwen2-Audio-7B-Instruct": 39.6,
325
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 66.0,
326
  "phi_4_multimodal_instruct": 43.8,
 
327
  "WavLLM_fairseq": 46.6,
328
  "SALMONN_7B": 36.6,
329
  "cascade_whisper_large_v3_llama_3_8b_instruct": 53.8
@@ -351,6 +363,7 @@
351
  "Qwen2-Audio-7B-Instruct": 61.56666666666667,
352
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 19.6,
353
  "phi_4_multimodal_instruct": 36.833333333333336,
 
354
  "WavLLM_fairseq": 46.766666666666666,
355
  "SALMONN_7B": 42.733333333333334,
356
  "cascade_whisper_large_v3_llama_3_8b_instruct": 25.433333333333337
@@ -376,6 +389,7 @@
376
  "whisper_large_v3": 0.2143555471246589,
377
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.22881615619208825,
378
  "phi_4_multimodal_instruct": 0.22801359968481416,
 
379
  "WavLLM_fairseq": 0.39796588405247263,
380
  "SALMONN_7B": 0.34868891450584405,
381
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.22004640235805695
@@ -400,6 +414,7 @@
400
  "whisper_large_v3": 0.15887899737116104,
401
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1448629161356777,
402
  "phi_4_multimodal_instruct": 0.24134627375003423,
 
403
  "WavLLM_fairseq": 0.6671766188447099,
404
  "SALMONN_7B": 0.3597423676988383,
405
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.15611126487402763
@@ -437,6 +452,7 @@
437
  "whisper_large_v3": 46.01512198258627,
438
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 46.80524126004861,
439
  "phi_4_multimodal_instruct": 0.36465303013961253,
 
440
  "WavLLM_fairseq": 5.933522277713613,
441
  "SALMONN_7B": 26.89649039333571,
442
  "cascade_whisper_large_v3_llama_3_8b_instruct": 46.79924664837527
@@ -474,6 +490,7 @@
474
  "whisper_large_v3": 0.09459022434812692,
475
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.09948381629977261,
476
  "phi_4_multimodal_instruct": 0.09672866386388193,
 
477
  "WavLLM_fairseq": 0.15491778414546403,
478
  "SALMONN_7B": 0.10765150204693537,
479
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.09515429104337297
@@ -498,6 +515,7 @@
498
  "Qwen2-Audio-7B-Instruct": 33.8,
499
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 45.4,
500
  "phi_4_multimodal_instruct": 41.2,
 
501
  "WavLLM_fairseq": 31.6,
502
  "SALMONN_7B": 9.0,
503
  "cascade_whisper_large_v3_llama_3_8b_instruct": 37.400000000000006
@@ -513,6 +531,7 @@
513
  "Qwen2-Audio-7B-Instruct": 0.9666666666666667,
514
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 7.633333333333334,
515
  "phi_4_multimodal_instruct": 0.5333333333333333,
 
516
  "WavLLM_fairseq": 0.23333333333333336,
517
  "SALMONN_7B": 0.06666666666666667,
518
  "cascade_whisper_large_v3_llama_3_8b_instruct": 9.666666666666666
@@ -525,6 +544,7 @@
525
  "Qwen2-Audio-7B-Instruct": 92.80876494023903,
526
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.737051792828685,
527
  "phi_4_multimodal_instruct": 59.46215139442231,
 
528
  "WavLLM_fairseq": 51.932270916334666,
529
  "SALMONN_7B": 81.31474103585658,
530
  "cascade_whisper_large_v3_llama_3_8b_instruct": 44.22310756972111
@@ -575,6 +595,7 @@
575
  "Qwen2-Audio-7B-Instruct": 66.49242028227914,
576
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 89.33612127548353,
577
  "phi_4_multimodal_instruct": 72.60846837428123,
 
578
  "WavLLM_fairseq": 66.5446941975954,
579
  "SALMONN_7B": 56.455828541557764,
580
  "cascade_whisper_large_v3_llama_3_8b_instruct": 86.4610559330894
@@ -587,6 +608,7 @@
587
  "Qwen2-Audio-7B-Instruct": 40.4,
588
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 58.0,
589
  "phi_4_multimodal_instruct": 52.199999999999996,
 
590
  "WavLLM_fairseq": 45.199999999999996,
591
  "SALMONN_7B": 17.2,
592
  "cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
@@ -616,6 +638,7 @@
616
  "Qwen2-Audio-7B-Instruct": 42.0,
617
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.0,
618
  "phi_4_multimodal_instruct": 43.8,
 
619
  "WavLLM_fairseq": 45.199999999999996,
620
  "SALMONN_7B": 40.599999999999994,
621
  "cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
@@ -631,6 +654,7 @@
631
  "Qwen2-Audio-7B-Instruct": 24.8,
632
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 44.0,
633
  "phi_4_multimodal_instruct": 37.0,
 
634
  "WavLLM_fairseq": 31.6,
635
  "SALMONN_7B": 7.0,
636
  "cascade_whisper_large_v3_llama_3_8b_instruct": 36.0
@@ -668,6 +692,7 @@
668
  "Qwen2-Audio-7B-Instruct": 0.19891712076314283,
669
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.05796819723943051,
670
  "phi_4_multimodal_instruct": 0.1757379026471828,
 
671
  "WavLLM_fairseq": 0.041732965094428545,
672
  "SALMONN_7B": 0.20994052484339956,
673
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.07953048457785493
@@ -683,6 +708,7 @@
683
  "Qwen2-Audio-7B-Instruct": 2.55,
684
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.016666666666666,
685
  "phi_4_multimodal_instruct": 3.5166666666666666,
 
686
  "WavLLM_fairseq": 2.6833333333333336,
687
  "SALMONN_7B": 2.5166666666666666,
688
  "cascade_whisper_large_v3_llama_3_8b_instruct": 12.416666666666666
@@ -708,6 +734,7 @@
708
  "whisper_large_v3": 2.451098639578599,
709
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 2.8327095799289337,
710
  "phi_4_multimodal_instruct": 0.053138495633157125,
 
711
  "WavLLM_fairseq": 0.1695522548322915,
712
  "SALMONN_7B": 0.3649023706010388,
713
  "cascade_whisper_large_v3_llama_3_8b_instruct": 2.4245628096245917
@@ -733,6 +760,7 @@
733
  "Qwen2-Audio-7B-Instruct": 50.919591292758774,
734
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 24.647544968400585,
735
  "phi_4_multimodal_instruct": 47.86582401555663,
 
736
  "WavLLM_fairseq": 43.01199466903598,
737
  "SALMONN_7B": 57.75401069518716,
738
  "cascade_whisper_large_v3_llama_3_8b_instruct": 29.47134606841404
@@ -763,6 +791,7 @@
763
  "whisper_large_v3": 0.27026366524560785,
764
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.3035544573275043,
765
  "phi_4_multimodal_instruct": 0.44227061666711925,
 
766
  "WavLLM_fairseq": 0.7540934640345399,
767
  "SALMONN_7B": 0.6569229098215983,
768
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.29992939962527493
@@ -813,6 +842,7 @@
813
  "Qwen2-Audio-7B-Instruct": 45.75079872204473,
814
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.466453674121407,
815
  "phi_4_multimodal_instruct": 38.466453674121404,
 
816
  "WavLLM_fairseq": 29.840255591054312,
817
  "SALMONN_7B": 50.287539936102235,
818
  "cascade_whisper_large_v3_llama_3_8b_instruct": 17.380191693290733
@@ -829,6 +859,7 @@
829
  "whisper_large_v3": 0.06844171360300393,
830
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.07041669714480775,
831
  "phi_4_multimodal_instruct": 0.05739643527661961,
 
832
  "WavLLM_fairseq": 0.10077292565771828,
833
  "SALMONN_7B": 0.0925804013361617,
834
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.06922195401458074
@@ -841,6 +872,7 @@
841
  "Qwen2-Audio-7B-Instruct": 0.2165498391593041,
842
  "whisper_large_v3": 0.14602420615337386,
843
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.20140159998943682,
 
844
  "WavLLM_fairseq": 0.3792176325635977,
845
  "SALMONN_7B": 0.23699946689025367,
846
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.14540692118393275
@@ -866,6 +898,7 @@
866
  "Qwen2-Audio-7B-Instruct": 44.473684210526315,
867
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.88157894736842,
868
  "phi_4_multimodal_instruct": 35.13157894736842,
 
869
  "WavLLM_fairseq": 26.25,
870
  "SALMONN_7B": 47.30263157894737,
871
  "cascade_whisper_large_v3_llama_3_8b_instruct": 16.710526315789473
@@ -1014,6 +1047,7 @@
1014
  "Qwen2-Audio-7B-Instruct": 68.38333333333333,
1015
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.35,
1016
  "phi_4_multimodal_instruct": 51.68333333333334,
 
1017
  "WavLLM_fairseq": 49.06666666666666,
1018
  "SALMONN_7B": 59.766666666666666,
1019
  "cascade_whisper_large_v3_llama_3_8b_instruct": 36.016666666666666
@@ -1026,6 +1060,7 @@
1026
  "Qwen2-Audio-7B-Instruct": 80.04901960784315,
1027
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 88.57843137254902,
1028
  "phi_4_multimodal_instruct": 88.33333333333334,
 
1029
  "WavLLM_fairseq": 83.92156862745098,
1030
  "SALMONN_7B": 83.48039215686273,
1031
  "cascade_whisper_large_v3_llama_3_8b_instruct": 82.99019607843137
@@ -1055,6 +1090,7 @@
1055
  "Qwen2-Audio-7B-Instruct": 41.60919540229885,
1056
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 47.356321839080465,
1057
  "phi_4_multimodal_instruct": 43.524904214559385,
 
1058
  "WavLLM_fairseq": 41.57088122605364,
1059
  "SALMONN_7B": 30.536398467432953,
1060
  "cascade_whisper_large_v3_llama_3_8b_instruct": 36.81992337164751
@@ -1083,6 +1119,7 @@
1083
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.7824973031283711,
1084
  "gemini-1.5-flash": 0.9690871089536138,
1085
  "phi_4_multimodal_instruct": 0.7126483279395901,
 
1086
  "WavLLM_fairseq": 1.2913969795037756,
1087
  "SALMONN_7B": 1.2721817691477886,
1088
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.6848705501618123
@@ -1107,6 +1144,7 @@
1107
  "Qwen2-Audio-7B-Instruct": 51.6,
1108
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 74.0,
1109
  "phi_4_multimodal_instruct": 49.0,
 
1110
  "WavLLM_fairseq": 50.8,
1111
  "SALMONN_7B": 44.6,
1112
  "cascade_whisper_large_v3_llama_3_8b_instruct": 57.800000000000004
@@ -1122,6 +1160,7 @@
1122
  "Qwen2-Audio-7B-Instruct": 46.2,
1123
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 65.4,
1124
  "phi_4_multimodal_instruct": 52.599999999999994,
 
1125
  "WavLLM_fairseq": 49.400000000000006,
1126
  "SALMONN_7B": 24.2,
1127
  "cascade_whisper_large_v3_llama_3_8b_instruct": 57.199999999999996
 
22
  "whisper_large_v3": 0.8294532718704128,
23
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.4757667842702995,
24
  "phi_4_multimodal_instruct": 1.3868687388941825,
25
+ "seallms_audio_7b": 1.8960881769720068,
26
  "WavLLM_fairseq": 1.2058793232211378,
27
  "SALMONN_7B": 0.7757204295537071,
28
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.4715562308464886
 
35
  "Qwen2-Audio-7B-Instruct": 0.21342294856199182,
36
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.120421856260385,
37
  "phi_4_multimodal_instruct": 0.24508284335582894,
38
+ "seallms_audio_7b": 0.1444387454989207,
39
  "WavLLM_fairseq": 0.06399522524688675,
40
  "SALMONN_7B": 0.17175112770658157,
41
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.1388630786594543
 
62
  "whisper_large_v3": 14.673689493155793,
63
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.209998552437538,
64
  "phi_4_multimodal_instruct": 22.678131781242936,
65
+ "seallms_audio_7b": 18.79451062979056,
66
  "WavLLM_fairseq": 2.368659001743569,
67
  "SALMONN_7B": 5.296039450108202,
68
  "cascade_whisper_large_v3_llama_3_8b_instruct": 14.154700735606419
 
75
  "Qwen2-Audio-7B-Instruct": 53.6,
76
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 71.6,
77
  "phi_4_multimodal_instruct": 66.2,
78
+ "seallms_audio_7b": 58.2,
79
  "WavLLM_fairseq": 62.199999999999996,
80
  "SALMONN_7B": 46.8,
81
  "cascade_whisper_large_v3_llama_3_8b_instruct": 64.0
 
115
  "Qwen2-Audio-7B-Instruct": 71.60909856781802,
116
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 51.727042965459134,
117
  "phi_4_multimodal_instruct": 54.422914911541696,
118
+ "seallms_audio_7b": 63.184498736310026,
119
  "WavLLM_fairseq": 44.3133951137321,
120
  "SALMONN_7B": 50.88458298230834,
121
  "cascade_whisper_large_v3_llama_3_8b_instruct": 56.44481887110362
 
141
  "whisper_large_v3": 0.3171008846684522,
142
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.32988393799204613,
143
  "phi_4_multimodal_instruct": 0.3470091713334957,
144
+ "seallms_audio_7b": 0.290236182128074,
145
  "WavLLM_fairseq": 0.4463923382842302,
146
  "SALMONN_7B": 0.42346400454508565,
147
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.31912994075156237
 
209
  "whisper_large_v3": 0.1698509342851144,
210
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1789273082575623,
211
  "phi_4_multimodal_instruct": 0.14552883606001388,
212
+ "seallms_audio_7b": 0.6259629515980555,
213
  "WavLLM_fairseq": 0.42541061709652933,
214
  "SALMONN_7B": 0.24872817713464365,
215
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.17467982364056267
 
222
  "Qwen2-Audio-7B-Instruct": 44.800000000000004,
223
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 72.2,
224
  "phi_4_multimodal_instruct": 30.8,
225
+ "seallms_audio_7b": 63.8,
226
  "WavLLM_fairseq": 19.2,
227
  "SALMONN_7B": 15.8,
228
  "cascade_whisper_large_v3_llama_3_8b_instruct": 63.0
 
250
  "Qwen2-Audio-7B-Instruct": 58.31395348837209,
251
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 73.11046511627907,
252
  "phi_4_multimodal_instruct": 68.40116279069767,
253
+ "seallms_audio_7b": 57.587209302325576,
254
  "WavLLM_fairseq": 58.54651162790698,
255
  "SALMONN_7B": 59.24418604651163,
256
  "cascade_whisper_large_v3_llama_3_8b_instruct": 64.94186046511628
 
309
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.5840399155162387,
310
  "gemini-1.5-flash": 1.1100431601824359,
311
  "phi_4_multimodal_instruct": 0.8529492791331231,
312
+ "seallms_audio_7b": 1.7106737273868193,
313
  "WavLLM_fairseq": 1.2204842511249197,
314
  "SALMONN_7B": 1.0189782362484312,
315
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.507882090054792
 
322
  "Qwen2-Audio-7B-Instruct": 53.9463601532567,
323
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.59003831417625,
324
  "phi_4_multimodal_instruct": 51.609195402298845,
325
+ "seallms_audio_7b": 52.1455938697318,
326
  "WavLLM_fairseq": 51.072796934865906,
327
  "SALMONN_7B": 41.7624521072797,
328
  "cascade_whisper_large_v3_llama_3_8b_instruct": 45.593869731800766
 
335
  "Qwen2-Audio-7B-Instruct": 39.6,
336
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 66.0,
337
  "phi_4_multimodal_instruct": 43.8,
338
+ "seallms_audio_7b": 45.0,
339
  "WavLLM_fairseq": 46.6,
340
  "SALMONN_7B": 36.6,
341
  "cascade_whisper_large_v3_llama_3_8b_instruct": 53.8
 
363
  "Qwen2-Audio-7B-Instruct": 61.56666666666667,
364
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 19.6,
365
  "phi_4_multimodal_instruct": 36.833333333333336,
366
+ "seallms_audio_7b": 30.5,
367
  "WavLLM_fairseq": 46.766666666666666,
368
  "SALMONN_7B": 42.733333333333334,
369
  "cascade_whisper_large_v3_llama_3_8b_instruct": 25.433333333333337
 
389
  "whisper_large_v3": 0.2143555471246589,
390
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.22881615619208825,
391
  "phi_4_multimodal_instruct": 0.22801359968481416,
392
+ "seallms_audio_7b": 0.5812260145043848,
393
  "WavLLM_fairseq": 0.39796588405247263,
394
  "SALMONN_7B": 0.34868891450584405,
395
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.22004640235805695
 
414
  "whisper_large_v3": 0.15887899737116104,
415
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1448629161356777,
416
  "phi_4_multimodal_instruct": 0.24134627375003423,
417
+ "seallms_audio_7b": 0.5738685499413504,
418
  "WavLLM_fairseq": 0.6671766188447099,
419
  "SALMONN_7B": 0.3597423676988383,
420
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.15611126487402763
 
452
  "whisper_large_v3": 46.01512198258627,
453
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 46.80524126004861,
454
  "phi_4_multimodal_instruct": 0.36465303013961253,
455
+ "seallms_audio_7b": 43.98074943006231,
456
  "WavLLM_fairseq": 5.933522277713613,
457
  "SALMONN_7B": 26.89649039333571,
458
  "cascade_whisper_large_v3_llama_3_8b_instruct": 46.79924664837527
 
490
  "whisper_large_v3": 0.09459022434812692,
491
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.09948381629977261,
492
  "phi_4_multimodal_instruct": 0.09672866386388193,
493
+ "seallms_audio_7b": 0.13672725996455154,
494
  "WavLLM_fairseq": 0.15491778414546403,
495
  "SALMONN_7B": 0.10765150204693537,
496
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.09515429104337297
 
515
  "Qwen2-Audio-7B-Instruct": 33.8,
516
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 45.4,
517
  "phi_4_multimodal_instruct": 41.2,
518
+ "seallms_audio_7b": 43.0,
519
  "WavLLM_fairseq": 31.6,
520
  "SALMONN_7B": 9.0,
521
  "cascade_whisper_large_v3_llama_3_8b_instruct": 37.400000000000006
 
531
  "Qwen2-Audio-7B-Instruct": 0.9666666666666667,
532
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 7.633333333333334,
533
  "phi_4_multimodal_instruct": 0.5333333333333333,
534
+ "seallms_audio_7b": 15.633333333333333,
535
  "WavLLM_fairseq": 0.23333333333333336,
536
  "SALMONN_7B": 0.06666666666666667,
537
  "cascade_whisper_large_v3_llama_3_8b_instruct": 9.666666666666666
 
544
  "Qwen2-Audio-7B-Instruct": 92.80876494023903,
545
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.737051792828685,
546
  "phi_4_multimodal_instruct": 59.46215139442231,
547
+ "seallms_audio_7b": 66.43426294820716,
548
  "WavLLM_fairseq": 51.932270916334666,
549
  "SALMONN_7B": 81.31474103585658,
550
  "cascade_whisper_large_v3_llama_3_8b_instruct": 44.22310756972111
 
595
  "Qwen2-Audio-7B-Instruct": 66.49242028227914,
596
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 89.33612127548353,
597
  "phi_4_multimodal_instruct": 72.60846837428123,
598
+ "seallms_audio_7b": 75.6926293779404,
599
  "WavLLM_fairseq": 66.5446941975954,
600
  "SALMONN_7B": 56.455828541557764,
601
  "cascade_whisper_large_v3_llama_3_8b_instruct": 86.4610559330894
 
608
  "Qwen2-Audio-7B-Instruct": 40.4,
609
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 58.0,
610
  "phi_4_multimodal_instruct": 52.199999999999996,
611
+ "seallms_audio_7b": 49.400000000000006,
612
  "WavLLM_fairseq": 45.199999999999996,
613
  "SALMONN_7B": 17.2,
614
  "cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
 
638
  "Qwen2-Audio-7B-Instruct": 42.0,
639
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.0,
640
  "phi_4_multimodal_instruct": 43.8,
641
+ "seallms_audio_7b": 45.599999999999994,
642
  "WavLLM_fairseq": 45.199999999999996,
643
  "SALMONN_7B": 40.599999999999994,
644
  "cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
 
654
  "Qwen2-Audio-7B-Instruct": 24.8,
655
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 44.0,
656
  "phi_4_multimodal_instruct": 37.0,
657
+ "seallms_audio_7b": 35.4,
658
  "WavLLM_fairseq": 31.6,
659
  "SALMONN_7B": 7.0,
660
  "cascade_whisper_large_v3_llama_3_8b_instruct": 36.0
 
692
  "Qwen2-Audio-7B-Instruct": 0.19891712076314283,
693
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.05796819723943051,
694
  "phi_4_multimodal_instruct": 0.1757379026471828,
695
+ "seallms_audio_7b": 0.30423899385222564,
696
  "WavLLM_fairseq": 0.041732965094428545,
697
  "SALMONN_7B": 0.20994052484339956,
698
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.07953048457785493
 
708
  "Qwen2-Audio-7B-Instruct": 2.55,
709
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.016666666666666,
710
  "phi_4_multimodal_instruct": 3.5166666666666666,
711
+ "seallms_audio_7b": 3.5833333333333335,
712
  "WavLLM_fairseq": 2.6833333333333336,
713
  "SALMONN_7B": 2.5166666666666666,
714
  "cascade_whisper_large_v3_llama_3_8b_instruct": 12.416666666666666
 
734
  "whisper_large_v3": 2.451098639578599,
735
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 2.8327095799289337,
736
  "phi_4_multimodal_instruct": 0.053138495633157125,
737
+ "seallms_audio_7b": 0.06475917031217593,
738
  "WavLLM_fairseq": 0.1695522548322915,
739
  "SALMONN_7B": 0.3649023706010388,
740
  "cascade_whisper_large_v3_llama_3_8b_instruct": 2.4245628096245917
 
760
  "Qwen2-Audio-7B-Instruct": 50.919591292758774,
761
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 24.647544968400585,
762
  "phi_4_multimodal_instruct": 47.86582401555663,
763
+ "seallms_audio_7b": 53.03840544482256,
764
  "WavLLM_fairseq": 43.01199466903598,
765
  "SALMONN_7B": 57.75401069518716,
766
  "cascade_whisper_large_v3_llama_3_8b_instruct": 29.47134606841404
 
791
  "whisper_large_v3": 0.27026366524560785,
792
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.3035544573275043,
793
  "phi_4_multimodal_instruct": 0.44227061666711925,
794
+ "seallms_audio_7b": 1.0837293290249002,
795
  "WavLLM_fairseq": 0.7540934640345399,
796
  "SALMONN_7B": 0.6569229098215983,
797
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.29992939962527493
 
842
  "Qwen2-Audio-7B-Instruct": 45.75079872204473,
843
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.466453674121407,
844
  "phi_4_multimodal_instruct": 38.466453674121404,
845
+ "seallms_audio_7b": 53.73801916932908,
846
  "WavLLM_fairseq": 29.840255591054312,
847
  "SALMONN_7B": 50.287539936102235,
848
  "cascade_whisper_large_v3_llama_3_8b_instruct": 17.380191693290733
 
859
  "whisper_large_v3": 0.06844171360300393,
860
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.07041669714480775,
861
  "phi_4_multimodal_instruct": 0.05739643527661961,
862
+ "seallms_audio_7b": 0.17813863896813206,
863
  "WavLLM_fairseq": 0.10077292565771828,
864
  "SALMONN_7B": 0.0925804013361617,
865
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.06922195401458074
 
872
  "Qwen2-Audio-7B-Instruct": 0.2165498391593041,
873
  "whisper_large_v3": 0.14602420615337386,
874
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.20140159998943682,
875
+ "seallms_audio_7b": 0.369768551146351,
876
  "WavLLM_fairseq": 0.3792176325635977,
877
  "SALMONN_7B": 0.23699946689025367,
878
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.14540692118393275
 
898
  "Qwen2-Audio-7B-Instruct": 44.473684210526315,
899
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.88157894736842,
900
  "phi_4_multimodal_instruct": 35.13157894736842,
901
+ "seallms_audio_7b": 42.10526315789473,
902
  "WavLLM_fairseq": 26.25,
903
  "SALMONN_7B": 47.30263157894737,
904
  "cascade_whisper_large_v3_llama_3_8b_instruct": 16.710526315789473
 
1047
  "Qwen2-Audio-7B-Instruct": 68.38333333333333,
1048
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.35,
1049
  "phi_4_multimodal_instruct": 51.68333333333334,
1050
+ "seallms_audio_7b": 50.083333333333336,
1051
  "WavLLM_fairseq": 49.06666666666666,
1052
  "SALMONN_7B": 59.766666666666666,
1053
  "cascade_whisper_large_v3_llama_3_8b_instruct": 36.016666666666666
 
1060
  "Qwen2-Audio-7B-Instruct": 80.04901960784315,
1061
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 88.57843137254902,
1062
  "phi_4_multimodal_instruct": 88.33333333333334,
1063
+ "seallms_audio_7b": 83.52941176470588,
1064
  "WavLLM_fairseq": 83.92156862745098,
1065
  "SALMONN_7B": 83.48039215686273,
1066
  "cascade_whisper_large_v3_llama_3_8b_instruct": 82.99019607843137
 
1090
  "Qwen2-Audio-7B-Instruct": 41.60919540229885,
1091
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 47.356321839080465,
1092
  "phi_4_multimodal_instruct": 43.524904214559385,
1093
+ "seallms_audio_7b": 51.11111111111111,
1094
  "WavLLM_fairseq": 41.57088122605364,
1095
  "SALMONN_7B": 30.536398467432953,
1096
  "cascade_whisper_large_v3_llama_3_8b_instruct": 36.81992337164751
 
1119
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.7824973031283711,
1120
  "gemini-1.5-flash": 0.9690871089536138,
1121
  "phi_4_multimodal_instruct": 0.7126483279395901,
1122
+ "seallms_audio_7b": 1.0639495685005393,
1123
  "WavLLM_fairseq": 1.2913969795037756,
1124
  "SALMONN_7B": 1.2721817691477886,
1125
  "cascade_whisper_large_v3_llama_3_8b_instruct": 0.6848705501618123
 
1144
  "Qwen2-Audio-7B-Instruct": 51.6,
1145
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 74.0,
1146
  "phi_4_multimodal_instruct": 49.0,
1147
+ "seallms_audio_7b": 54.2,
1148
  "WavLLM_fairseq": 50.8,
1149
  "SALMONN_7B": 44.6,
1150
  "cascade_whisper_large_v3_llama_3_8b_instruct": 57.800000000000004
 
1160
  "Qwen2-Audio-7B-Instruct": 46.2,
1161
  "cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 65.4,
1162
  "phi_4_multimodal_instruct": 52.599999999999994,
1163
+ "seallms_audio_7b": 53.0,
1164
  "WavLLM_fairseq": 49.400000000000006,
1165
  "SALMONN_7B": 24.2,
1166
  "cascade_whisper_large_v3_llama_3_8b_instruct": 57.199999999999996