# NOTE: page-scrape residue, not part of the module source:
#   uploader: lmzjms; commit message: "Upload 1162 files"; commit: 0b32ad6 (verified)
from .base import SequentialDataPipe
from .common_pipes import LoadAudio, SetOutputKeys
from .extract_feat_pipes import ExtractNpcFeat
from .masked_reconstruction_pipes import PrepareTargetFeat
from .valid_label_mask_pipes import LabelMaskFromLen
class PretrainNpcPipe(SequentialDataPipe):
    """
    Sequential data pipe composed of five stages, handed to the parent
    class in this order: LoadAudio, ExtractNpcFeat, LabelMaskFromLen,
    PrepareTargetFeat, SetOutputKeys.

    Each item in the input dataset should have:
        wav_path: str
    """

    def __init__(
        self,
        output_keys: dict = None,
        feat_type: str = "fbank",
        feat_dim: int = 80,
        frame_length: int = 25,
        frame_shift: int = 10,
        decode_wav: bool = False,
        cmvn: bool = True,
        audio_sample_rate: int = 16000,
        audio_channel_reduction: str = "first",
        n_jobs: int = 6,
    ):
        """
        Build the stage chain and pass it to ``SequentialDataPipe``.

        Args:
            output_keys (dict): args for the output handle. Any falsy value
                (``None`` or an empty dict) selects the default mapping
                ``x -> source_feat``, ``label -> target_feat``,
                ``label_mask -> label_mask``, ``unique_name -> id``.
            feat_type (str): feature type, forwarded to ExtractNpcFeat
            feat_dim (int): feature dimension, forwarded to ExtractNpcFeat
            frame_length (int): window size in ms
            frame_shift (int): hop size in ms
            decode_wav (bool): whether to decode wav
            cmvn (bool): whether to apply utterance-wise CMVN on the feature
            audio_sample_rate (int): audio sample rate, forwarded to LoadAudio
            audio_channel_reduction (str): channel reduction mode forwarded
                to LoadAudio, e.g. "first" channel
            n_jobs (int): number of workers, forwarded to LoadAudio
        """
        if not output_keys:
            # Fall back to the default output mapping when nothing usable
            # was supplied by the caller.
            output_keys = {
                "x": "source_feat",
                "label": "target_feat",
                "label_mask": "label_mask",
                "unique_name": "id",
            }

        # The stages are instantiated in the same order in which they are
        # passed to the parent constructor below.
        audio_loader = LoadAudio(
            n_jobs=n_jobs,
            audio_sample_rate=audio_sample_rate,
            audio_channel_reduction=audio_channel_reduction,
        )
        feat_extractor = ExtractNpcFeat(
            feat_type=feat_type,
            feat_dim=feat_dim,
            frame_length=frame_length,
            frame_shift=frame_shift,
            decode_wav=decode_wav,
            cmvn=cmvn,
            feat_name="source_feat",
        )
        label_masker = LabelMaskFromLen(
            target_feat_name="target_feat",
            label_mask_name="label_mask",
        )
        # use_copy=True with source/target names: the target feature is
        # prepared from "source_feat" under the name "target_feat".
        target_preparer = PrepareTargetFeat(
            use_copy=True,
            source_feat_name="source_feat",
            target_feat_name="target_feat",
        )
        output_selector = SetOutputKeys(output_keys=output_keys)

        super().__init__(
            audio_loader,
            feat_extractor,
            label_masker,
            target_preparer,
            output_selector,
        )