|
|
|
displayname2datasetname = {
    'LibriSpeech-Clean'     : 'librispeech_test_clean',
    'LibriSpeech-Other'     : 'librispeech_test_other',
    'CommonVoice-15-EN'     : 'common_voice_15_en_test',
    'Peoples-Speech'        : 'peoples_speech_test',
    'GigaSpeech-1'          : 'gigaspeech_test',
    'Earnings-21'           : 'earnings21_test',
    'Earnings-22'           : 'earnings22_test',
    'TED-LIUM-3'            : 'tedlium3_test',
    'TED-LIUM-3-LongForm'   : 'tedlium3_long_form_test',
    'AISHELL-ASR-ZH'        : 'aishell_asr_zh_test',
    'CoVoST2-EN-ID'         : 'covost2_en_id_test',
    'CoVoST2-EN-ZH'         : 'covost2_en_zh_test',
    'CoVoST2-EN-TA'         : 'covost2_en_ta_test',
    'CoVoST2-ID-EN'         : 'covost2_id_en_test',
    'CoVoST2-ZH-EN'         : 'covost2_zh_en_test',
    'CoVoST2-TA-EN'         : 'covost2_ta_en_test',
    'CN-College-Listen-MCQ' : 'cn_college_listen_mcq_test',
    'DREAM-TTS-MCQ'         : 'dream_tts_mcq_test',
    'SLUE-P2-SQA5'          : 'slue_p2_sqa5_test',
    'Public-SG-Speech-QA'   : 'public_sg_speech_qa_test',
    'Spoken-SQuAD'          : 'spoken_squad_test',
    'OpenHermes-Audio'      : 'openhermes_audio_test',
    'ALPACA-Audio'          : 'alpaca_audio_test',
    'WavCaps'               : 'wavcaps_test',
    'AudioCaps'             : 'audiocaps_test',
    'Clotho-AQA'            : 'clotho_aqa_test',
    'WavCaps-QA'            : 'wavcaps_qa_test',
    'AudioCaps-QA'          : 'audiocaps_qa_test',
    'VoxCeleb-Accent'       : 'voxceleb_accent_test',
    'MNSC-AR-Sentence'      : 'imda_ar_sentence',
    'MNSC-AR-Dialogue'      : 'imda_ar_dialogue',
    'VoxCeleb-Gender'       : 'voxceleb_gender_test',
    'IEMOCAP-Gender'        : 'iemocap_gender_test',
    'IEMOCAP-Emotion'       : 'iemocap_emotion_test',
    'MELD-Sentiment'        : 'meld_sentiment_test',
    'MELD-Emotion'          : 'meld_emotion_test',
    'MuChoMusic'            : 'muchomusic_test',
    'MNSC-PART1-ASR'        : 'imda_part1_asr_test',
    'MNSC-PART2-ASR'        : 'imda_part2_asr_test',
    'MNSC-PART3-ASR'        : 'imda_part3_30s_asr_test',
    'MNSC-PART4-ASR'        : 'imda_part4_30s_asr_test',
    'MNSC-PART5-ASR'        : 'imda_part5_30s_asr_test',
    'MNSC-PART6-ASR'        : 'imda_part6_30s_asr_test',
    'MNSC-PART3-SQA'        : 'imda_part3_30s_sqa_human_test',
    'MNSC-PART4-SQA'        : 'imda_part4_30s_sqa_human_test',
    'MNSC-PART5-SQA'        : 'imda_part5_30s_sqa_human_test',
    'MNSC-PART6-SQA'        : 'imda_part6_30s_sqa_human_test',
    'MNSC-PART3-SDS'        : 'imda_part3_30s_ds_human_test',
    'MNSC-PART4-SDS'        : 'imda_part4_30s_ds_human_test',
    'MNSC-PART5-SDS'        : 'imda_part5_30s_ds_human_test',
    'MNSC-PART6-SDS'        : 'imda_part6_30s_ds_human_test',
    'SEAME-Dev-Man'         : 'seame_dev_man',
    'SEAME-Dev-Sge'         : 'seame_dev_sge',
    'MMAU-mini'             : 'mmau_mini',
    'MMAU-mini-music'       : 'mmau_mini_music',
    'MMAU-mini-sound'       : 'mmau_mini_sound',
    'MMAU-mini-speech'      : 'mmau_mini_speech',
    'GigaSpeech2-Indo'      : 'gigaspeech2_indo',
    'GigaSpeech2-Thai'      : 'gigaspeech2_thai',
    'GigaSpeech2-Viet'      : 'gigaspeech2_viet',

    'CNA'                   : 'cna_test',
    'IDPC'                  : 'idpc_test',
    'Parliament'            : 'parliament_test',
    'UKUS-News'             : 'ukusnews_test',
    'Mediacorp'             : 'mediacorp_test',
    'IDPC-Short'            : 'idpc_short_test',
    'Parliament-Short'      : 'parliament_short_test',
    'UKUS-News-Short'       : 'ukusnews_short_test',
    'Mediacorp-Short'       : 'mediacorp_short_test',
    'YTB-ASR-Batch1'        : 'ytb_asr_batch1',
    'YTB-ASR-Batch2'        : 'ytb_asr_batch2',

    'YTB-SQA-Batch1'        : 'ytb_sqa_batch1',
    'YTB-SDS-Batch1'        : 'ytb_sds_batch1',
    'YTB-PQA-Batch1'        : 'ytb_pqa_batch1',
}
|
|
|
# Reverse lookup: dataset name -> display name.
datasetname2displayname = {datasetname: displayname for displayname, datasetname in displayname2datasetname.items()}
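# A minimal import-time sanity check (an assumption of this module, not part
# of the original logic): inverting a dict silently drops duplicate values,
# so equal sizes confirm the mapping is one-to-one and the reverse lookup
# above is lossless.
assert len(datasetname2displayname) == len(displayname2datasetname), \
    'displayname2datasetname must be one-to-one for the reverse lookup to be valid'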
|
|
|
|
|
dataset_display_information = {
    'LibriSpeech-Clean'     : 'A clean, high-quality test set of the LibriSpeech dataset, used for ASR testing.',
    'LibriSpeech-Other'     : 'A more challenging, noisier test set of the LibriSpeech dataset for ASR testing.',
    'CommonVoice-15-EN'     : 'Test set from the Common Voice project, a crowd-sourced, multilingual speech dataset.',
    'Peoples-Speech'        : 'A large-scale, open-source speech recognition dataset with diverse accents and domains.',
    'GigaSpeech-1'          : 'A large-scale ASR dataset with diverse audio sources such as podcasts and interviews.',
    'Earnings-21'           : 'ASR test dataset focused on earnings calls from 2021, with professional speech and financial jargon.',
    'Earnings-22'           : 'Similar to Earnings-21, but covering earnings calls from 2022.',
    'TED-LIUM-3'            : 'A test set derived from TED talks, covering diverse speakers and topics.',
    'TED-LIUM-3-LongForm'   : 'A long-form version of the TED-LIUM dataset with extended audio samples. Long audio poses challenges to existing fusion methods, and this set provides a benchmark for future development.',
    'AISHELL-ASR-ZH'        : 'ASR test dataset for Mandarin Chinese, based on the AISHELL dataset.',
    'CoVoST2-EN-ID'         : 'CoVoST 2 dataset for speech translation from English to Indonesian.',
    'CoVoST2-EN-ZH'         : 'CoVoST 2 dataset for speech translation from English to Chinese.',
    'CoVoST2-EN-TA'         : 'CoVoST 2 dataset for speech translation from English to Tamil.',
    'CoVoST2-ID-EN'         : 'CoVoST 2 dataset for speech translation from Indonesian to English.',
    'CoVoST2-ZH-EN'         : 'CoVoST 2 dataset for speech translation from Chinese to English.',
    'CoVoST2-TA-EN'         : 'CoVoST 2 dataset for speech translation from Tamil to English.',
    'CN-College-Listen-MCQ' : 'Chinese College English Listening Test, with multiple-choice questions.',
    'DREAM-TTS-MCQ'         : 'DREAM dataset for spoken question answering, derived from textual data with synthesized speech.',
    'SLUE-P2-SQA5'          : 'Spoken Language Understanding Evaluation (SLUE) dataset, Phase 2, focused on question-answering tasks.',
    'Public-SG-Speech-QA'   : 'Public dataset for speech-based question answering, gathered from Singapore.',
    'Spoken-SQuAD'          : 'Spoken SQuAD dataset, based on the textual SQuAD dataset, converted into audio.',
    'OpenHermes-Audio'      : 'Test set for spoken instructions, synthesized from the OpenHermes dataset.',
    'ALPACA-Audio'          : 'Spoken version of the ALPACA dataset, used for evaluating instruction following in audio.',
    'WavCaps'               : 'WavCaps is a dataset for testing audio captioning, where models generate textual descriptions of audio clips.',
    'AudioCaps'             : 'AudioCaps dataset, used for generating captions from general audio events.',
    'Clotho-AQA'            : 'Clotho dataset adapted for audio-based question answering, containing audio clips and questions.',
    'WavCaps-QA'            : 'Question-answering test dataset derived from WavCaps, focusing on audio content.',
    'AudioCaps-QA'          : 'AudioCaps adapted for question-answering tasks, using audio events as input for Q&A.',
    'VoxCeleb-Accent'       : 'Test dataset for accent recognition, based on VoxCeleb, a large speaker identification dataset.',
    'MNSC-AR-Sentence'      : 'Accent recognition based on the IMDA NSC dataset, focusing on sentence-level accents.',
    'MNSC-AR-Dialogue'      : 'Accent recognition based on the IMDA NSC dataset, focusing on dialogue-level accents.',
    'VoxCeleb-Gender'       : 'Test dataset for gender classification, also derived from VoxCeleb.',
    'IEMOCAP-Gender'        : 'Gender classification based on the IEMOCAP dataset.',
    'IEMOCAP-Emotion'       : 'Emotion recognition test data from the IEMOCAP dataset, focusing on identifying emotions in speech.',
    'MELD-Sentiment'        : 'Sentiment recognition from speech using the MELD dataset, classifying positive, negative, or neutral sentiments.',
    'MELD-Emotion'          : 'Emotion classification in speech using MELD, detecting specific emotions such as happiness and anger.',
    'MuChoMusic'            : 'Test dataset for music understanding, from the paper "MuChoMusic: Evaluating Music Understanding in Multimodal Audio-Language Models".',
    'MNSC-PART1-ASR'        : 'Speech recognition test data from the IMDA NSC project, Part 1.',
    'MNSC-PART2-ASR'        : 'Speech recognition test data from the IMDA NSC project, Part 2.',
    'MNSC-PART3-ASR'        : 'Speech recognition test data from the IMDA NSC project, Part 3.',
    'MNSC-PART4-ASR'        : 'Speech recognition test data from the IMDA NSC project, Part 4.',
    'MNSC-PART5-ASR'        : 'Speech recognition test data from the IMDA NSC project, Part 5.',
    'MNSC-PART6-ASR'        : 'Speech recognition test data from the IMDA NSC project, Part 6.',
    'MNSC-PART3-SQA'        : 'Multitask National Speech Corpus (MNSC) dataset, question-answering task, Part 3.',
    'MNSC-PART4-SQA'        : 'Multitask National Speech Corpus (MNSC) dataset, question-answering task, Part 4.',
    'MNSC-PART5-SQA'        : 'Multitask National Speech Corpus (MNSC) dataset, question-answering task, Part 5.',
    'MNSC-PART6-SQA'        : 'Multitask National Speech Corpus (MNSC) dataset, question-answering task, Part 6.',
    'MNSC-PART3-SDS'        : 'Multitask National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 3.',
    'MNSC-PART4-SDS'        : 'Multitask National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 4.',
    'MNSC-PART5-SDS'        : 'Multitask National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 5.',
    'MNSC-PART6-SDS'        : 'Multitask National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 6.',
    'SEAME-Dev-Man'         : 'SEAME dataset, English-Mandarin code-switching, Mandarin-dominant dev set.',
    'SEAME-Dev-Sge'         : 'SEAME dataset, English-Mandarin code-switching, Singapore-English-dominant dev set.',
    'MMAU-mini'             : 'MMAU dataset, mini version, from "MMAU: A Massive Multi-Task Audio Understanding and Reasoning Benchmark".',
    'MMAU-mini-music'       : 'Music subset of MMAU-mini.',
    'MMAU-mini-sound'       : 'Sound subset of MMAU-mini.',
    'MMAU-mini-speech'      : 'Speech subset of MMAU-mini.',
    'GigaSpeech2-Indo'      : 'GigaSpeech 2 ASR dataset for Indonesian.',
    'GigaSpeech2-Thai'      : 'GigaSpeech 2 ASR dataset for Thai.',
    'GigaSpeech2-Viet'      : 'GigaSpeech 2 ASR dataset for Vietnamese.',

    'CNA'                   : 'Under Development',
    'IDPC'                  : 'Under Development',
    'Parliament'            : 'Under Development',
    'UKUS-News'             : 'Under Development',
    'Mediacorp'             : 'Under Development',
    'IDPC-Short'            : 'Under Development',
    'Parliament-Short'      : 'Under Development',
    'UKUS-News-Short'       : 'Under Development',
    'Mediacorp-Short'       : 'Under Development',
    'YTB-ASR-Batch1'        : 'Under Development',
    'YTB-ASR-Batch2'        : 'Under Development',

    'YTB-SQA-Batch1'        : 'Under Development',
    'YTB-SDS-Batch1'        : 'Under Development',
    'YTB-PQA-Batch1'        : 'Under Development',
}
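
# A small consistency check, sketched here under the assumption that every
# display name should also carry a description: set differences surface
# entries that exist in one table but not the other, at import time.
_missing = set(displayname2datasetname) - set(dataset_display_information)
_orphans = set(dataset_display_information) - set(displayname2datasetname)
assert not _missing, f'Datasets without a description: {sorted(_missing)}'
assert not _orphans, f'Descriptions without a dataset entry: {sorted(_orphans)}'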
|
|
|
|
|
|
|
metrics_info = {
    'wer'             : 'Word Error Rate (WER). The lower, the better.',
    'llama3_70b_judge': 'Model-as-a-judge performance, using Llama-3-70B. Scale from 0 to 100; the higher, the better.',
    'meteor'          : 'METEOR score. The higher, the better.',
    'bleu'            : 'BLEU score. The higher, the better.',
    'string_match'    : 'From the MMAU paper: after the model generates an answer, correctness is determined by a string-matching algorithm. https://github.com/Sakshi113/MMAU/blob/main/evaluation.py',
    'gpt4o_judge'     : 'Model-as-a-judge performance, using GPT-4o. Scale from 0 to 100; the higher, the better. For multiple-choice questions, it reflects accuracy.',
}
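
# An illustrative helper (hypothetical, not part of the original module):
# compose a one-line, human-readable summary for a (dataset, metric) pair,
# e.g. for leaderboard tooltips. Unknown keys fall back to placeholder text.
def describe(display_name: str, metric: str) -> str:
    dataset_info = dataset_display_information.get(display_name, 'No description available.')
    metric_info  = metrics_info.get(metric, 'Unknown metric.')
    return f'{display_name} | {dataset_info} | {metric_info}'

# Example: describe('LibriSpeech-Clean', 'wer')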
|
|
|
|