wav2vec2 / finetuning /preprocess.py
hoang1007
init
5381499
raw
history blame contribute delete
780 Bytes
import sys
sys.path.append("..")
import os
import argparse
from torch.utils.data import random_split
from src.datamodule import VLSP2020TarDataset, VLSP2020Dataset
def prepare_tar_dataset(
    data_dir: str, dest_dir: str, train_size: int = 42_000
) -> None:
    """Randomly split the VLSP2020 dataset and convert each part to a tar file.

    The dataset loaded from ``data_dir`` is divided with
    ``torch.utils.data.random_split`` into a training subset of
    ``train_size`` items and a validation subset holding every remaining
    item. The subsets are handed to ``VLSP2020TarDataset(...).convert`` with
    the target paths ``vlsp2020_train_set.tar`` and ``vlsp2020_val_set.tar``
    inside ``dest_dir``.

    Args:
        data_dir: Location passed to ``VLSP2020Dataset``.
        dest_dir: Directory that receives the two ``.tar`` files; it is
            created when it does not exist yet.
        train_size: Number of items in the training subset. The default of
            42_000 reproduces the original 42_000 / 14_427 split on a
            dataset of 56_427 items.

    Raises:
        ValueError: If ``train_size`` is negative or larger than the dataset.
    """
    dataset = VLSP2020Dataset(data_dir)

    # Derive the validation size from the real dataset length instead of
    # hard-coding it, so the split stays valid when the dataset size differs.
    total = len(dataset)
    if not 0 <= train_size <= total:
        raise ValueError(
            f"train_size must be between 0 and the dataset size ({total}), "
            f"got {train_size}"
        )
    val_size = total - train_size

    # An empty dest_dir means "current directory"; nothing to create then.
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    train_set, val_set = random_split(dataset, [train_size, val_size])

    train_tar = os.path.join(dest_dir, "vlsp2020_train_set.tar")
    val_tar = os.path.join(dest_dir, "vlsp2020_val_set.tar")

    VLSP2020TarDataset(train_tar).convert(train_set)
    VLSP2020TarDataset(val_tar).convert(val_set)
def main():
    """Command-line entry point.

    Reads the two required string options ``--data_dir`` and ``--dest_dir``
    from the command line and forwards them to ``prepare_tar_dataset``.
    """
    cli = argparse.ArgumentParser()
    # Both options share the same configuration: required string values.
    for flag in ("--data_dir", "--dest_dir"):
        cli.add_argument(flag, type=str, required=True)

    options = cli.parse_args()
    prepare_tar_dataset(options.data_dir, options.dest_dir)