# wavlm-large / s3prl_s3prl_main / test / test_librispeech.py
# lmzjms's picture
# Upload 1162 files
# 0b32ad6 verified
import pytest
from dotenv import dotenv_values
from s3prl.dataio.corpus.librilight import LibriLight
from s3prl.dataio.corpus.librispeech import LibriSpeech
# Expected number of dataset entries contributed by each LibriSpeech split.
# All seven splits together total 292367.
libri_stats = {
    # training splits
    "train-clean-100": 28539,
    "train-clean-360": 104014,
    "train-other-500": 148688,
    # validation splits
    "dev-clean": 2703,
    "dev-other": 2864,
    # test splits
    "test-clean": 2620,
    "test-other": 2939,
}
@pytest.mark.corpus
def test_librispeech_dataset():
    """Check that ``LibriSpeech`` indexes the expected number of entries.

    The corpus root is taken from the ``"LibriSpeech"`` key of the mapping
    returned by ``dotenv_values()``. The expected size is computed from
    ``libri_stats`` for exactly the splits requested below, so the assertion
    stays in sync with the split lists instead of relying on a hard-coded
    grand total.

    Raises:
        KeyError: if the dotenv configuration has no ``"LibriSpeech"`` entry.
    """
    config = dotenv_values()
    dataset_root = config["LibriSpeech"]

    # FIXME (Leo): I temporarily do not have space for train-other-500 ...
    train_split = ["train-clean-100", "train-clean-360"]
    valid_split = ["dev-clean", "dev-other"]
    test_split = ["test-clean", "test-other"]

    # Computed before the corpus is built so the expectation reflects the
    # splits as requested, regardless of what the constructor does with them.
    requested_splits = train_split + valid_split + test_split
    expected_size = sum(libri_stats[name] for name in requested_splits)

    dataset = LibriSpeech(
        dataset_root,
        train_split=train_split,
        valid_split=valid_split,
        test_split=test_split,
    )
    data = dataset.all_data

    assert len(data) == expected_size
@pytest.mark.corpus
def test_librilight():
    """Check the size of the LibriLight training data.

    Corpus roots come from the ``"LibriLight"`` and ``"LibriSpeech"`` keys of
    the mapping returned by ``dotenv_values()``. A ``LibriSpeech`` corpus is
    also built and its ``data_split`` unpacked into three parts; only the
    LibriLight ``all_data`` length is asserted here.
    """
    config = dotenv_values()

    librilight_root = config["LibriLight"]
    train_corpus = LibriLight(librilight_root)

    librispeech_root = config["LibriSpeech"]
    # NOTE(review): the positional ``4, []`` presumably mean 4 workers and an
    # empty train split -- confirm against the LibriSpeech signature.
    eval_corpus = LibriSpeech(librispeech_root, 4, [])

    train_data = train_corpus.all_data
    # Unpacking verifies that ``data_split`` yields exactly three parts.
    _, valid_data, test_data = eval_corpus.data_split

    assert len(train_data) == 48