wavlm-large / s3prl_s3prl_main /s3prl /dataset /pretrain_npc_pipe.py

Upload 1162 files

0b32ad6 verified 5 months ago

2.37 kB

	from .base import SequentialDataPipe
	from .common_pipes import LoadAudio, SetOutputKeys
	from .extract_feat_pipes import ExtractNpcFeat
	from .masked_reconstruction_pipes import PrepareTargetFeat
	from .valid_label_mask_pipes import LabelMaskFromLen


	class PretrainNpcPipe(SequentialDataPipe):
	    """
	    Sequential data pipe composed of LoadAudio, ExtractNpcFeat,
	    LabelMaskFromLen, PrepareTargetFeat and SetOutputKeys.

	    Each item in the input dataset should have:
	        wav_path: str
	    """

	    def __init__(
	        self,
	        output_keys: dict = None,
	        feat_type: str = "fbank",
	        feat_dim: int = 80,
	        frame_length: int = 25,
	        frame_shift: int = 10,
	        decode_wav: bool = False,
	        cmvn: bool = True,
	        audio_sample_rate: int = 16000,
	        audio_channel_reduction: str = "first",
	        n_jobs: int = 6,
	    ):
	        """
	        Build the individual pipes and hand them to the parent class.

	        Args:
	            output_keys (dict): args for the output handle; when omitted
	                (or otherwise falsy) a default mapping of x / label /
	                label_mask / unique_name is used
	            feat_type (str): feature type
	            feat_dim (int): feature dimension
	            frame_length (int): window size in ms
	            frame_shift (int): hop size in ms
	            decode_wav (bool): whether to decode wav
	            cmvn (bool): whether to apply uttr.-wised CMVN on feature
	            audio_sample_rate (int): audio sample rate
	            audio_channel_reduction (str): "first" channel
	            n_jobs (int): number of workers
	        """
	        # Any falsy value (None, empty dict, ...) falls back to the defaults.
	        if not output_keys:
	            output_keys = {
	                "x": "source_feat",
	                "label": "target_feat",
	                "label_mask": "label_mask",
	                "unique_name": "id",
	            }

	        # Pipes are constructed in the same order they are passed on.
	        audio_loader = LoadAudio(
	            n_jobs=n_jobs,
	            audio_sample_rate=audio_sample_rate,
	            audio_channel_reduction=audio_channel_reduction,
	        )
	        feat_extractor = ExtractNpcFeat(
	            feat_type=feat_type,
	            feat_dim=feat_dim,
	            frame_length=frame_length,
	            frame_shift=frame_shift,
	            decode_wav=decode_wav,
	            cmvn=cmvn,
	            feat_name="source_feat",
	        )
	        mask_builder = LabelMaskFromLen(
	            target_feat_name="target_feat",
	            label_mask_name="label_mask",
	        )
	        target_builder = PrepareTargetFeat(
	            use_copy=True,
	            source_feat_name="source_feat",
	            target_feat_name="target_feat",
	        )
	        key_selector = SetOutputKeys(output_keys=output_keys)

	        super().__init__(
	            audio_loader,
	            feat_extractor,
	            mask_builder,
	            target_builder,
	            key_selector,
	        )