"""Datasets that turn audio files into stacks of spectrogram images.

Each item is built by cutting several random fixed-length frames out of one
audio file and rendering every frame with every spectrogram configuration
listed in ``cfg.dataset.specs``.
"""

import librosa
import numpy as np
import pandas as pd
import torch

from utmosv2.dataset._utils import (
    extend_audio,
    get_dataset_map,
    load_audio,
    select_random_start,
)


class MultiSpecDataset(torch.utils.data.Dataset):
    """Dataset yielding ``(spectrogram stack, MOS target)`` pairs.

    Args:
        cfg: Configuration object. This class reads ``cfg.sr`` and, under
            ``cfg.dataset``, ``specs`` plus the ``spec_frames`` fields
            ``frame_sec``, ``num_frames``, ``extend``, ``mixup_inner`` and
            ``mixup_alpha``.
        data: Table with at least the columns ``"file_path"`` and ``"mos"``.
        phase: ``"train"`` selects the ``"train"`` transform; any other value
            selects the ``"valid"`` transform.
        transform: Mapping with ``"train"`` and ``"valid"`` callables applied
            to each 3-channel spectrogram tensor, or ``None`` to apply no
            transform at all.
    """

    def __init__(self, cfg, data: pd.DataFrame, phase: str, transform=None):
        self.cfg = cfg
        self.data = data
        self.phase = phase
        self.transform = transform

    def __len__(self) -> int:
        """Return the number of rows in the underlying table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build the spectrogram stack and target for row ``idx``.

        Returns:
            A tuple ``(spec, target)``. ``spec`` stacks
            ``num_frames * len(cfg.dataset.specs)`` transformed spectrograms
            along a new leading dimension (frames outermost); ``target`` is
            the row's ``"mos"`` value as a float32 tensor.
        """
        row = self.data.iloc[idx]
        frame_cfg = self.cfg.dataset.spec_frames

        y = load_audio(self.cfg, row["file_path"])
        length = int(frame_cfg.frame_sec * self.cfg.sr)
        # NOTE(review): presumably makes ``y`` at least ``length`` samples
        # long using the configured strategy - confirm in ``extend_audio``.
        y = extend_audio(self.cfg, y, length, type=frame_cfg.extend)

        # The transform depends only on the dataset phase, so resolve it once
        # instead of on every inner-loop iteration. ``None`` (the constructor
        # default) means "no transform" rather than a TypeError on lookup.
        transform = None
        if self.transform is not None:
            key = "train" if self.phase == "train" else "valid"
            transform = self.transform[key]

        specs = []
        for _ in range(frame_cfg.num_frames):
            # One crop per frame, shared by every spectrogram configuration.
            y1 = select_random_start(y, length)
            for spec_cfg in self.cfg.dataset.specs:
                spec = _make_spctrogram(self.cfg, spec_cfg, y1)
                if frame_cfg.mixup_inner:
                    spec = self._mixup_with_random_crop(spec, spec_cfg, y, length)
                # Replicate the single-channel spectrogram into 3 channels
                # along a new leading axis.
                spec = np.stack([spec, spec, spec], axis=0)
                spec = torch.tensor(spec, dtype=torch.float32)
                if transform is not None:
                    spec = transform(spec)
                specs.append(spec)

        spec = torch.stack(specs).float()
        target = torch.tensor(row["mos"], dtype=torch.float32)
        return spec, target

    def _mixup_with_random_crop(self, spec, spec_cfg, y, length):
        """Blend ``spec`` with the spectrogram of another random crop of ``y``.

        The blend weight is drawn from ``Beta(mixup_alpha, mixup_alpha)``.
        """
        alpha = self.cfg.dataset.spec_frames.mixup_alpha
        y2 = select_random_start(y, length)
        spec2 = _make_spctrogram(self.cfg, spec_cfg, y2)
        lmd = np.random.beta(alpha, alpha)
        return lmd * spec + (1 - lmd) * spec2


class MultiSpecExtDataset(MultiSpecDataset):
    """``MultiSpecDataset`` that also yields a one-hot dataset indicator.

    ``data`` must additionally contain a ``"dataset"`` column whose values are
    keys of the mapping returned by ``get_dataset_map(cfg)``.
    """

    def __init__(self, cfg, data: pd.DataFrame, phase: str, transform=None):
        super().__init__(cfg, data, phase, transform)
        self.dataset_map = get_dataset_map(cfg)

    def __getitem__(self, idx):
        """Return ``(spec, dataset_one_hot, target)`` for row ``idx``."""
        spec, target = super().__getitem__(idx)
        one_hot = np.zeros(len(self.dataset_map))
        one_hot[self.dataset_map[self.data.iloc[idx]["dataset"]]] = 1
        d = torch.tensor(one_hot, dtype=torch.float32)
        return spec, d, target


# NOTE: the misspelled name is kept as-is so existing references keep working.
def _make_spctrogram(cfg, spec_cfg, y: np.ndarray) -> np.ndarray:
    """Render ``y`` as the spectrogram type selected by ``spec_cfg.mode``.

    Raises:
        NotImplementedError: If ``spec_cfg.mode`` is neither ``"melspec"``
            nor ``"stft"``.
    """
    if spec_cfg.mode == "melspec":
        return _make_melspec(cfg, spec_cfg, y)
    if spec_cfg.mode == "stft":
        return _make_stft(cfg, spec_cfg, y)
    raise NotImplementedError(
        f"Unsupported spectrogram mode: {spec_cfg.mode!r}"
    )


def _make_melspec(cfg, spec_cfg, y: np.ndarray) -> np.ndarray:
    """Compute a mel spectrogram of ``y`` in dB relative to its maximum.

    When ``spec_cfg.norm`` is not ``None`` the result is rescaled as
    ``(spec + norm) / norm``.
    """
    spec = librosa.feature.melspectrogram(
        y=y,
        sr=cfg.sr,
        n_fft=spec_cfg.n_fft,
        hop_length=spec_cfg.hop_length,
        n_mels=spec_cfg.n_mels,
    )
    spec = librosa.power_to_db(spec, ref=np.max)
    if spec_cfg.norm is not None:
        # NOTE(review): presumably ``norm`` equals the dB floor so values land
        # in [0, 1] - confirm against the configs in use.
        spec = (spec + spec_cfg.norm) / spec_cfg.norm
    return spec


def _make_stft(cfg, spec_cfg, y: np.ndarray) -> np.ndarray:
    """Compute the STFT magnitude of ``y`` and convert it to dB.

    ``cfg`` is unused; it is accepted so the signature matches
    ``_make_melspec``.
    """
    magnitude = np.abs(
        librosa.stft(y=y, n_fft=spec_cfg.n_fft, hop_length=spec_cfg.hop_length)
    )
    return librosa.amplitude_to_db(magnitude)