displayname2datasetname = {
    'LibriSpeech-Clean'    : 'librispeech_test_clean',
    'LibriSpeech-Other'    : 'librispeech_test_other',
    'CommonVoice-15-EN'    : 'common_voice_15_en_test',
    'Peoples-Speech'       : 'peoples_speech_test',
    'GigaSpeech-1'         : 'gigaspeech_test',
    'Earnings-21'          : 'earnings21_test',
    'Earnings-22'          : 'earnings22_test',
    'TED-LIUM-3'           : 'tedlium3_test',
    'TED-LIUM-3-LongForm'  : 'tedlium3_long_form_test',
    'AISHELL-ASR-ZH'       : 'aishell_asr_zh_test',
    'CoVoST2-EN-ID'        : 'covost2_en_id_test',
    'CoVoST2-EN-ZH'        : 'covost2_en_zh_test',
    'CoVoST2-EN-TA'        : 'covost2_en_ta_test',
    'CoVoST2-ID-EN'        : 'covost2_id_en_test',
    'CoVoST2-ZH-EN'        : 'covost2_zh_en_test',
    'CoVoST2-TA-EN'        : 'covost2_ta_en_test',
    'CN-College-Listen-MCQ': 'cn_college_listen_mcq_test',
    'DREAM-TTS-MCQ'        : 'dream_tts_mcq_test',
    'SLUE-P2-SQA5'         : 'slue_p2_sqa5_test',
    'Public-SG-Speech-QA'  : 'public_sg_speech_qa_test',
    'Spoken-SQuAD'         : 'spoken_squad_test',
    'OpenHermes-Audio'     : 'openhermes_audio_test',
    'ALPACA-Audio'         : 'alpaca_audio_test',
    'WavCaps'              : 'wavcaps_test',
    'AudioCaps'            : 'audiocaps_test',
    'Clotho-AQA'           : 'clotho_aqa_test',
    'WavCaps-QA'           : 'wavcaps_qa_test',
    'AudioCaps-QA'         : 'audiocaps_qa_test',
    'VoxCeleb-Accent'      : 'voxceleb_accent_test',
    'MNSC-AR-Sentence'     : 'imda_ar_sentence',
    'MNSC-AR-Dialogue'     : 'imda_ar_dialogue',
    'VoxCeleb-Gender'      : 'voxceleb_gender_test',
    'IEMOCAP-Gender'       : 'iemocap_gender_test',
    'IEMOCAP-Emotion'      : 'iemocap_emotion_test',
    'MELD-Sentiment'       : 'meld_sentiment_test',
    'MELD-Emotion'         : 'meld_emotion_test',
    'MuChoMusic'           : 'muchomusic_test',
    'MNSC-PART1-ASR'       : 'imda_part1_asr_test',
    'MNSC-PART2-ASR'       : 'imda_part2_asr_test',
    'MNSC-PART3-ASR'       : 'imda_part3_30s_asr_test',
    'MNSC-PART4-ASR'       : 'imda_part4_30s_asr_test',
    'MNSC-PART5-ASR'       : 'imda_part5_30s_asr_test',
    'MNSC-PART6-ASR'       : 'imda_part6_30s_asr_test',
    'MNSC-PART3-SQA'       : 'imda_part3_30s_sqa_human_test',
    'MNSC-PART4-SQA'       : 'imda_part4_30s_sqa_human_test',
    'MNSC-PART5-SQA'       : 'imda_part5_30s_sqa_human_test',
    'MNSC-PART6-SQA'       : 'imda_part6_30s_sqa_human_test',
    'MNSC-PART3-SDS'       : 'imda_part3_30s_ds_human_test',
    'MNSC-PART4-SDS'       : 'imda_part4_30s_ds_human_test',
    'MNSC-PART5-SDS'       : 'imda_part5_30s_ds_human_test',
    'MNSC-PART6-SDS'       : 'imda_part6_30s_ds_human_test',
    'SEAME-Dev-Man'        : 'seame_dev_man',
    'SEAME-Dev-Sge'        : 'seame_dev_sge',
    'MMAU-mini'            : 'mmau_mini',
    'MMAU-mini-music'      : 'mmau_mini_music',
    'MMAU-mini-sound'      : 'mmau_mini_sound',
    'MMAU-mini-speech'     : 'mmau_mini_speech',
    'GigaSpeech2-Indo'     : 'gigaspeech2_indo',
    'GigaSpeech2-Thai'     : 'gigaspeech2_thai',
    'GigaSpeech2-Viet'     : 'gigaspeech2_viet',

    'CNA'             : 'cna_test',
    'IDPC'            : 'idpc_test',
    'Parliament'      : 'parliament_test',
    'UKUS-News'       : 'ukusnews_test',
    'Mediacorp'       : 'mediacorp_test',
    'IDPC-Short'      : 'idpc_short_test',
    'Parliament-Short': 'parliament_short_test',
    'UKUS-News-Short' : 'ukusnews_short_test',
    'Mediacorp-Short' : 'mediacorp_short_test',
    'YTB-ASR-Batch1'  : 'ytb_asr_batch1',
    'YTB-ASR-Batch2'  : 'ytb_asr_batch2',

    'YTB-SQA-Batch1': 'ytb_sqa_batch1',
    'YTB-SDS-Batch1': 'ytb_sds_batch1',
    'YTB-PQA-Batch1': 'ytb_pqa_batch1',

}

datasetname2diaplayname = {datasetname: displayname for displayname, datasetname in displayname2datasetname.items()}
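
# Example (illustrative, not part of the original module): round-trip lookup
# between a leaderboard display name and its internal dataset name.
#   >>> displayname2datasetname['LibriSpeech-Clean']
#   'librispeech_test_clean'
#   >>> datasetname2diaplayname['librispeech_test_clean']
#   'LibriSpeech-Clean'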


dataset_diaplay_information = {
    'LibriSpeech-Clean'    : 'The clean, high-quality test set of the LibriSpeech corpus, used for ASR evaluation.',
    'LibriSpeech-Other'    : 'The more challenging, noisier test set of the LibriSpeech corpus, used for ASR evaluation.',
    'CommonVoice-15-EN'    : 'English test set from version 15 of Common Voice, a crowd-sourced, multilingual speech dataset.',
    'Peoples-Speech'       : 'A large-scale, open-source speech recognition dataset, with diverse accents and domains.',
    'GigaSpeech-1'         : 'A large-scale ASR dataset with diverse audio sources like podcasts, interviews, etc.',
    'Earnings-21'          : 'ASR test dataset focused on earnings calls from 2021, with professional speech and financial jargon.',
    'Earnings-22'          : 'Similar to Earnings21, but covering earnings calls from 2022.',
    'TED-LIUM-3'           : 'A test set derived from TED talks, covering diverse speakers and topics.',
    'TED-LIUM-3-LongForm'  : 'A long-form version of the TED-LIUM dataset with extended audio samples. Long recordings are challenging for existing fusion methods, and this set provides a benchmark for future development.',
    'AISHELL-ASR-ZH'       : 'ASR test dataset for Mandarin Chinese, based on the Aishell dataset.',
    'CoVoST2-EN-ID'        : 'CoVoST 2 dataset for speech translation from English to Indonesian.',
    'CoVoST2-EN-ZH'        : 'CoVoST 2 dataset for speech translation from English to Chinese.',
    'CoVoST2-EN-TA'        : 'CoVoST 2 dataset for speech translation from English to Tamil.',
    'CoVoST2-ID-EN'        : 'CoVoST 2 dataset for speech translation from Indonesian to English.',
    'CoVoST2-ZH-EN'        : 'CoVoST 2 dataset for speech translation from Chinese to English.',
    'CoVoST2-TA-EN'        : 'CoVoST 2 dataset for speech translation from Tamil to English.',
    'CN-College-Listen-MCQ': 'Chinese College English Listening Test, with multiple-choice questions.',
    'DREAM-TTS-MCQ'        : 'DREAM dataset for spoken question answering with multiple-choice questions, derived from textual data and synthesized speech.',
    'SLUE-P2-SQA5'         : 'Spoken Language Understanding Evaluation (SLUE) dataset, part 2, focused on QA tasks.',
    'Public-SG-Speech-QA'  : 'Public dataset for speech-based question answering, gathered from Singapore.',
    'Spoken-SQuAD'         : 'Spoken SQuAD dataset, based on the textual SQuAD dataset, converted into audio.',
    'OpenHermes-Audio'     : 'Test set of spoken instructions, synthesized from the OpenHermes dataset.',
    'ALPACA-Audio'         : 'Spoken version of the ALPACA dataset, used for evaluating instruction following in audio.',
    'WavCaps'              : 'WavCaps is a dataset for testing audio captioning, where models generate textual descriptions of audio clips.',
    'AudioCaps'            : 'AudioCaps dataset, used for generating captions from general audio events.',
    'Clotho-AQA'           : 'Clotho dataset adapted for audio-based question answering, containing audio clips and questions.',
    'WavCaps-QA'           : 'Question-answering test dataset derived from WavCaps, focusing on audio content.',
    'AudioCaps-QA'         : 'AudioCaps adapted for question-answering tasks, using audio events as input for Q&A.',
    'VoxCeleb-Accent'      : 'Test dataset for accent recognition, based on VoxCeleb, a large speaker identification dataset.',
    'MNSC-AR-Sentence'     : 'Accent recognition based on the IMDA NSC dataset, focusing on sentence-level accents.',
    'MNSC-AR-Dialogue'     : 'Accent recognition based on the IMDA NSC dataset, focusing on dialogue-level accents.',
    'VoxCeleb-Gender'      : 'Test dataset for gender classification, also derived from VoxCeleb.',
    'IEMOCAP-Gender'       : 'Gender classification based on the IEMOCAP dataset.',
    'IEMOCAP-Emotion'      : 'Emotion recognition test data from the IEMOCAP dataset, focusing on identifying emotions in speech.',
    'MELD-Sentiment'       : 'Sentiment recognition from speech using the MELD dataset, classifying positive, negative, or neutral sentiments.',
    'MELD-Emotion'         : 'Emotion classification in speech using MELD, detecting specific emotions like happiness, anger, etc.',
    'MuChoMusic'           : 'Test dataset for music understanding, from the paper "MuChoMusic: Evaluating Music Understanding in Multimodal Audio-Language Models".',
    'MNSC-PART1-ASR'       : 'Speech recognition test data from the IMDA NSC project, Part 1.',
    'MNSC-PART2-ASR'       : 'Speech recognition test data from the IMDA NSC project, Part 2.',
    'MNSC-PART3-ASR'       : 'Speech recognition test data from the IMDA NSC project, Part 3.',
    'MNSC-PART4-ASR'       : 'Speech recognition test data from the IMDA NSC project, Part 4.',
    'MNSC-PART5-ASR'       : 'Speech recognition test data from the IMDA NSC project, Part 5.',
    'MNSC-PART6-ASR'       : 'Speech recognition test data from the IMDA NSC project, Part 6.',
    'MNSC-PART3-SQA'       : 'Multitask National Speech Corpus (MNSC), spoken question-answering task, Part 3.',
    'MNSC-PART4-SQA'       : 'Multitask National Speech Corpus (MNSC), spoken question-answering task, Part 4.',
    'MNSC-PART5-SQA'       : 'Multitask National Speech Corpus (MNSC), spoken question-answering task, Part 5.',
    'MNSC-PART6-SQA'       : 'Multitask National Speech Corpus (MNSC), spoken question-answering task, Part 6.',
    'MNSC-PART3-SDS'       : 'Multitask National Speech Corpus (MNSC), spoken dialogue summarization task, Part 3.',
    'MNSC-PART4-SDS'       : 'Multitask National Speech Corpus (MNSC), spoken dialogue summarization task, Part 4.',
    'MNSC-PART5-SDS'       : 'Multitask National Speech Corpus (MNSC), spoken dialogue summarization task, Part 5.',
    'MNSC-PART6-SDS'       : 'Multitask National Speech Corpus (MNSC), spoken dialogue summarization task, Part 6.',
    'SEAME-Dev-Man'        : 'SEAME dataset, English-Mandarin code-switching (Mandarin-dominant dev set).',
    'SEAME-Dev-Sge'        : 'SEAME dataset, English-Mandarin code-switching (Singapore-English-dominant dev set).',
    'MMAU-mini'            : 'Mini version of the MMAU benchmark, from "MMAU: A Massive Multi-Task Audio Understanding and Reasoning Benchmark".',
    'MMAU-mini-music'      : 'Music subset of the MMAU-mini benchmark.',
    'MMAU-mini-sound'      : 'Sound subset of the MMAU-mini benchmark.',
    'MMAU-mini-speech'     : 'Speech subset of the MMAU-mini benchmark.',
    'GigaSpeech2-Indo'     : 'GigaSpeech2 ASR test set for Indonesian.',
    'GigaSpeech2-Thai'     : 'GigaSpeech2 ASR test set for Thai.',
    'GigaSpeech2-Viet'     : 'GigaSpeech2 ASR test set for Vietnamese.',


    'CNA'             : 'Under Development',
    'IDPC'            : 'Under Development',
    'Parliament'      : 'Under Development',
    'UKUS-News'       : 'Under Development',
    'Mediacorp'       : 'Under Development',
    'IDPC-Short'      : 'Under Development',
    'Parliament-Short': 'Under Development',
    'UKUS-News-Short' : 'Under Development',
    'Mediacorp-Short' : 'Under Development',
    'YTB-ASR-Batch1'  : 'Under Development',
    'YTB-ASR-Batch2'  : 'Under Development',

    'YTB-SQA-Batch1': 'Under Development',
    'YTB-SDS-Batch1': 'Under Development',
    'YTB-PQA-Batch1': 'Under Development',
    }
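
# Example (illustrative): resolve the human-readable description for an
# internal dataset name by chaining the two mappings above, e.g.
#   dataset_diaplay_information[datasetname2diaplayname['seame_dev_man']]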



metrics_info = {
    'wer'             : 'Word Error Rate (WER). The lower, the better.',
    'llama3_70b_judge': 'Model-as-a-Judge performance, using LLAMA-3-70B. Scale from 0 to 100. The higher, the better.',
    'meteor'          : 'METEOR Score. The higher, the better.',
    'bleu'            : 'BLEU Score. The higher, the better.',
    'string_match'    : 'From the MMAU paper: after the model generates an answer, correctness is determined by a string-matching algorithm. https://github.com/Sakshi113/MMAU/blob/main/evaluation.py',
    'gpt4o_judge'     : 'Model-as-a-Judge performance, using GPT-4o. Scale from 0 to 100. The higher, the better. For multiple-choice questions, this reflects accuracy.',
}
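

# --- Illustrative sanity check (a sketch, not part of the original file) ---
# LOWER_IS_BETTER and sort_scores are hypothetical helpers, shown here only to
# demonstrate how a leaderboard could order scores by metric direction using
# the metrics_info keys above.
LOWER_IS_BETTER = {'wer'}  # assumption: WER is the only lower-is-better metric here

def sort_scores(metric, scores):
    """Return scores ordered best-first for the given metric."""
    return sorted(scores, reverse=metric not in LOWER_IS_BETTER)

if __name__ == '__main__':
    # Every display name should have a description, and vice versa.
    missing = set(displayname2datasetname) - set(dataset_diaplay_information)
    extra   = set(dataset_diaplay_information) - set(displayname2datasetname)
    assert not missing and not extra, (missing, extra)
    print(sort_scores('wer', [5.2, 3.1, 9.8]))            # -> [3.1, 5.2, 9.8]
    print(sort_scores('llama3_70b_judge', [71.0, 88.5]))  # -> [88.5, 71.0]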