import importlib
import json
import os
from glob import iglob
from typing import Any

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torchaudio as ta
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset, Subset

from preprocessing.preprocess import (
    fix_dance_rating_counts,
    get_unique_labels,
    has_valid_audio,
    url_to_filename,
    vectorize_label_probs,
    vectorize_multi_label,
)


class SongDataset(Dataset):
    """
    Serves fixed-length audio windows cut from a list of audio files.

    Every file contributes as many whole windows of ``audio_window_duration``
    seconds as fit after skipping the first ``audio_start_offset`` seconds.
    A dataset index addresses one such window; the label returned with it is
    the label of the file the window was cut from.
    """

    def __init__(
        self,
        audio_paths: list[str],
        dance_labels: list[np.ndarray],
        audio_start_offset=6,  # seconds
        audio_window_duration=6,  # seconds
        audio_window_jitter=1.0,  # seconds
        audio_durations=None,
        target_sample_rate=16000,
    ):
        """
        Args:
            audio_paths: Paths of the audio files.
            dance_labels: One label vector per audio file, in the same order
                as ``audio_paths``.
            audio_start_offset: Seconds skipped at the start of every file.
            audio_window_duration: Length of one window in seconds
                (truncated to an int).
            audio_window_jitter: Maximum random shift, in seconds, applied to
                the start of a window.
            audio_durations: Optional per-file durations in seconds. When
                omitted, every file's metadata is read to compute them.
            target_sample_rate: Sample rate the returned waveforms are
                resampled to.
        """
        assert (
            audio_window_duration > audio_window_jitter
        ), "Jitter should be a small fraction of the audio window duration."

        self.audio_paths = audio_paths
        self.dance_labels = dance_labels

        # Added to limit file I/O
        if audio_durations is None:
            audio_metadata = [ta.info(audio) for audio in audio_paths]
            self.audio_durations = [
                meta.num_frames / meta.sample_rate for meta in audio_metadata
            ]
            # assuming same sample rate
            self.sample_rate = audio_metadata[0].sample_rate
        else:
            self.audio_durations = audio_durations
            # assuming same sample rate
            self.sample_rate = ta.info(audio_paths[0]).sample_rate
        self.audio_window_duration = int(audio_window_duration)
        self.audio_start_offset = audio_start_offset
        self.audio_window_jitter = audio_window_jitter
        self.target_sample_rate = target_sample_rate

    def _num_windows(self, duration) -> int:
        """
        Returns how many whole windows fit into a file of ``duration`` seconds
        once the start offset has been skipped (0 if the file is too short).
        """
        usable = max(duration - self.audio_start_offset, 0)
        return int(usable // self.audio_window_duration)

    def __len__(self):
        """Returns the total number of windows over all audio files."""
        return sum(self._num_windows(duration) for duration in self.audio_durations)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Returns ``(waveform, label)`` for window ``idx``; when ``idx`` is a
        list, returns a list of such pairs.
        """
        if isinstance(idx, list):
            return [
                (self._waveform_from_index(i), self._label_from_index(i)) for i in idx
            ]

        waveform = self._waveform_from_index(idx)
        dance_labels = self._label_from_index(idx)
        return waveform, dance_labels

    def _idx2audio_idx(self, idx: int) -> int:
        """Returns the position in ``audio_paths`` of the file holding window ``idx``."""
        return self._get_audio_loc_from_idx(idx)[0]

    def _get_audio_loc_from_idx(self, idx: int) -> tuple[int, int]:
        """
        Converts dataset index to the indices that reference the target audio path
        and window offset.

        Raises:
            IndexError: If ``idx`` does not address an existing window.
        """
        total_windows = len(self)
        position = idx + total_windows if idx < 0 else idx
        if position < 0:
            raise IndexError(
                f"index {idx} is out of range for {total_windows} windows"
            )

        windows_seen = 0
        for audio_index, duration in enumerate(self.audio_durations):
            # Must be the same count __len__ sums up; otherwise files too
            # short for a window would shift every later index.
            audio_windows = self._num_windows(duration)
            if position < windows_seen + audio_windows:
                return audio_index, position - windows_seen
            windows_seen += audio_windows

        raise IndexError(f"index {idx} is out of range for {total_windows} windows")

    def get_label_weights(self):
        """
        Returns one weight per class, ``n_examples / (n_classes * class_total)``,
        as a tensor. Classes whose label column sums to zero get weight 0.
        """
        n_examples, n_classes = self.dance_labels.shape
        class_totals = self.dance_labels.sum(axis=0)
        # Division by zero is expected for absent classes and handled below.
        with np.errstate(divide="ignore"):
            weights = n_examples / (n_classes * class_totals)
        weights[np.isinf(weights)] = 0.0
        return torch.from_numpy(weights)

    def _backtrace_audio_path(self, index: int) -> str:
        """Returns the path of the audio file that window ``index`` is cut from."""
        return self.audio_paths[self._idx2audio_idx(index)]

    def _validate_output(self, x, y):
        """
        Returns True when ``x`` has no inf or NaN entries and at least one
        non-zero entry, and ``y`` holds fewer than three distinct values.
        """
        is_finite = not torch.any(torch.isinf(x))
        is_numerical = not torch.any(torch.isnan(x))
        has_data = torch.any(x != 0.0)
        is_binary = len(torch.unique(y)) < 3
        return all((is_finite, is_numerical, has_data, is_binary))

    def _waveform_from_index(self, idx: int) -> torch.Tensor:
        """
        Loads window ``idx`` from its audio file, shifted by a random jitter,
        and resamples it to ``target_sample_rate``.
        """
        audio_index, frame_index = self._get_audio_loc_from_idx(idx)
        audio_filepath = self.audio_paths[audio_index]
        # Count windows after the start offset so the last-window test below
        # lines up with the frame indices handed out by the index mapping.
        num_windows = self._num_windows(self.audio_durations[audio_index])

        # The first window is never shifted backwards and the last one never
        # forwards, so the final window cannot run past the end of the file.
        jitter_start = -self.audio_window_jitter if frame_index > 0 else 0.0
        jitter_end = self.audio_window_jitter if frame_index != num_windows - 1 else 0.0
        jitter = int(
            torch.empty(1, dtype=torch.float32).uniform_(jitter_start, jitter_end)
            * self.sample_rate
        )

        frame_offset = int(
            frame_index * self.audio_window_duration * self.sample_rate
            + jitter
            + self.audio_start_offset * self.sample_rate
        )
        num_frames = self.sample_rate * self.audio_window_duration

        waveform, sample_rate = ta.load(
            audio_filepath, frame_offset=frame_offset, num_frames=num_frames
        )
        waveform = ta.functional.resample(
            waveform, orig_freq=sample_rate, new_freq=self.target_sample_rate
        )
        return waveform

    def _label_from_index(self, idx: int) -> torch.Tensor:
        """Returns the label vector of the file holding window ``idx`` as a tensor."""
        return torch.from_numpy(self.dance_labels[self._idx2audio_idx(idx)])


class HuggingFaceDatasetWrapper(Dataset):
    """
    Makes a standard PyTorch Dataset compatible with a HuggingFace Trainer.
    """

    def __init__(self, dataset, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dataset = dataset
        self.pipeline = []

    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
        """
        Returns a dict with the (pipeline-processed) input under
        ``"input_values"`` and the argmax of the label vector under ``"label"``.
        """
        x, y = self.dataset[idx]
        for fn in self.pipeline:
            x = fn(x)

        dance_labels = y.argmax()
        # NOTE(review): the attribute check presumably targets a HuggingFace
        # feature-extractor output — confirm against the pipeline in use.
        return {
            "input_values": x["input_values"][0] if hasattr(x, "input_values") else x,
            "label": dance_labels,
        }

    def __len__(self):
        return len(self.dataset)

    def append_to_pipeline(self, fn):
        """
        Adds a preprocessing step to the dataset.
        """
        self.pipeline.append(fn)


class BestBallroomDataset(Dataset):
    """
    Windows cut from ``.wav`` files stored in one sub-folder per dance under
    ``audio_dir``. Durations are read from ``audio_dir/audio_durations.json``.
    """

    def __init__(
        self, audio_dir="data/ballroom-songs", class_list=None, **kwargs
    ) -> None:
        """
        Args:
            audio_dir: Folder holding the dance sub-folders and the
                ``audio_durations.json`` file.
            class_list: Optional list of dance names defining the label vector.
            **kwargs: Forwarded to ``SongDataset``.
        """
        super().__init__()
        song_paths, labels = self.get_examples(audio_dir, class_list)

        with open(os.path.join(audio_dir, "audio_durations.json"), "r") as f:
            durations = json.load(f)
            # Keys in the JSON file are joined onto audio_dir so they can be
            # looked up with the paths built by get_examples.
            durations = {
                os.path.join(audio_dir, filepath): duration
                for filepath, duration in durations.items()
            }
            audio_durations = [durations[song] for song in song_paths]

        self.song_dataset = SongDataset(
            song_paths, labels, audio_durations=audio_durations, **kwargs
        )

    def __getitem__(self, index) -> tuple[torch.Tensor, torch.Tensor]:
        return self.song_dataset[index]

    def __len__(self):
        return len(self.song_dataset)

    def get_examples(self, audio_dir, class_list=None):
        """
        Collects the ``.wav`` files of every dance folder with a one-hot label.

        Without ``class_list`` the label positions follow the sorted folder
        names. With ``class_list`` they follow the sorted ``class_list`` and
        only folders named in it are read.

        Returns:
            ``(song_paths, labels)`` as numpy arrays.
        """
        dances = set(
            f for f in os.listdir(audio_dir) if os.path.isdir(os.path.join(audio_dir, f))
        )
        common_dances = dances
        if class_list is not None:
            common_dances = dances & set(class_list)
            dances = class_list
        dances = np.array(sorted(dances))

        song_paths = []
        labels = []
        # Sorted so the example order does not depend on set iteration order,
        # which changes between interpreter runs.
        for dance in sorted(common_dances):
            dance_label = (dances == dance).astype("float32")
            folder_path = os.path.join(audio_dir, dance)
            folder_contents = [f for f in os.listdir(folder_path) if f.endswith(".wav")]
            song_paths.extend(os.path.join(folder_path, f) for f in folder_contents)
            labels.extend([dance_label] * len(folder_contents))

        return np.array(song_paths), np.stack(labels)


class Music4DanceDataset(Dataset):
    """
    Windows cut from the audio samples listed in a music4dance CSV file.
    """

    def __init__(
        self,
        song_data_path,
        song_audio_path,
        class_list=None,
        multi_label=True,
        min_votes=1,
        **kwargs,
    ) -> None:
        """
        Args:
            song_data_path: CSV file read with ``pandas.read_csv``.
            song_audio_path: Folder holding the audio files.
            class_list: Optional dance names to keep.
            multi_label: Selects the label vectorizer, see
                ``get_music4dance_examples``.
            min_votes: Minimum number of votes a dance needs to be kept.
            **kwargs: Forwarded to ``SongDataset``.
        """
        super().__init__()
        df = pd.read_csv(song_data_path)
        song_paths, labels = get_music4dance_examples(
            df,
            song_audio_path,
            class_list=class_list,
            multi_label=multi_label,
            min_votes=min_votes,
        )
        # Every sample is given a fixed duration of 30 seconds instead of
        # reading the metadata of each file.
        self.song_dataset = SongDataset(
            song_paths,
            labels,
            audio_durations=[30.0] * len(song_paths),
            **kwargs,
        )

    def __getitem__(self, index) -> tuple[torch.Tensor, torch.Tensor]:
        return self.song_dataset[index]

    def __len__(self):
        return len(self.song_dataset)


def get_music4dance_examples(
    df: pd.DataFrame, audio_dir: str, class_list=None, multi_label=True, min_votes=1
) -> tuple[np.ndarray, np.ndarray]:
    """
    Builds audio paths and label vectors from a music4dance data frame.

    Rows are kept when ``has_valid_audio`` accepts their ``Sample`` entry and,
    after the optional ``class_list`` filter, at least one dance has
    ``min_votes`` or more votes.

    Args:
        df: Frame with ``Sample`` and ``DanceRating`` columns.
        audio_dir: Folder holding the audio files.
        class_list: Optional dance names; other dances are dropped.
        multi_label: Use ``vectorize_multi_label`` when True, otherwise
            ``vectorize_label_probs``.
        min_votes: Minimum number of votes a dance needs to be kept.

    Returns:
        ``(audio_paths, labels)`` as numpy arrays.
    """
    sampled_songs = df[has_valid_audio(df["Sample"], audio_dir)].copy(deep=True)
    sampled_songs["DanceRating"] = fix_dance_rating_counts(sampled_songs["DanceRating"])

    if class_list is not None:
        allowed = set(class_list)

        def keep_allowed(ratings):
            # NaN marks rows that have no positively rated dance in class_list.
            if pd.isna(ratings):
                return np.nan
            if not any(
                dance in allowed and amount > 0 for dance, amount in ratings.items()
            ):
                return np.nan
            return {dance: amount for dance, amount in ratings.items() if dance in allowed}

        sampled_songs["DanceRating"] = sampled_songs["DanceRating"].apply(keep_allowed)

    sampled_songs = sampled_songs.dropna(subset=["DanceRating"])

    vote_mask = sampled_songs["DanceRating"].apply(
        lambda dances: any(votes >= min_votes for votes in dances.values())
    )
    sampled_songs = sampled_songs[vote_mask]

    labels = sampled_songs["DanceRating"].apply(
        lambda dances: {
            dance: votes for dance, votes in dances.items() if votes >= min_votes
        }
    )
    unique_labels = np.array(get_unique_labels(labels))
    vectorizer = vectorize_multi_label if multi_label else vectorize_label_probs
    labels = labels.apply(lambda i: vectorizer(i, unique_labels))

    audio_paths = [
        os.path.join(audio_dir, url_to_filename(url)) for url in sampled_songs["Sample"]
    ]

    return np.array(audio_paths), np.stack(labels)


class PipelinedDataset(Dataset):
    """
    Adds a feature extractor preprocessing step to a dataset.
    """

    def __init__(self, dataset, feature_extractor):
        self._data = dataset
        self.feature_extractor = feature_extractor

    def __len__(self):
        return len(self._data)

    def __getitem__(self, index):
        sample, label = self._data[index]
        features = self.feature_extractor(sample)
        return features, label


class DanceDataModule(pl.LightningDataModule):
    """
    Lightning data module that splits one dataset into train, validation and
    test parts and serves a ``DataLoader`` for each.
    """

    def __init__(
        self,
        dataset: Dataset,
        test_proportion=0.15,
        val_proportion=0.1,
        target_classes: list[str] = None,
        batch_size: int = 64,
        num_workers=10,
        data_subset=None,
    ):
        """
        Args:
            dataset: Dataset to split.
            test_proportion: Fraction of the data used for testing.
            val_proportion: Fraction of the data used for validation.
            target_classes: Stored on the module; not used by this class.
            batch_size: Batch size of all data loaders.
            num_workers: Worker count of all data loaders.
            data_subset: Optional fraction; when given and not 1.0, only a
                random subset of that size is kept.
        """
        super().__init__()
        self.val_proportion = val_proportion
        self.test_proportion = test_proportion
        self.train_proportion = 1.0 - test_proportion - val_proportion
        self.target_classes = target_classes
        self.batch_size = batch_size
        self.num_workers = num_workers

        if data_subset is not None and float(data_subset) != 1.0:
            dataset, _ = random_split(dataset, [data_subset, 1 - data_subset])

        self.dataset = dataset

    def setup(self, stage: str):
        """Randomly splits the dataset into train, validation and test parts."""
        self.train_ds, self.val_ds, self.test_ds = random_split(
            self.dataset,
            [self.train_proportion, self.val_proportion, self.test_proportion],
        )

    def train_dataloader(self):
        # NOTE(review): shuffle is off, so every epoch visits the training
        # split in the same order — confirm this is intended.
        return DataLoader(
            self.train_ds,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=False,
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_ds,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_ds,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

    def get_label_weights(self):
        """
        Returns the unweighted mean of the label weights of every dataset
        inside the wrapped ``ConcatDataset``.
        """
        dataset = (
            self.dataset.dataset if isinstance(self.dataset, Subset) else self.dataset
        )
        # Expects the pipelined-ConcatDataset layout built by get_datasets.
        weights = [ds.song_dataset.get_label_weights() for ds in dataset._data.datasets]
        return torch.mean(torch.stack(weights), dim=0)  # TODO: Make this weighted


def find_mean_std(dataset: Dataset, zscore=1.96, moe=0.02, p=0.5):
    """
    Estimates the mean and standard deviation of a dataset.

    A random sample of ``ceil(zscore**2 * p * (1 - p) / moe**2)`` items is
    drawn without replacement, capped at the dataset size. The mean and
    standard deviation of each item's features are averaged over the sample.
    Both averages are printed.

    Returns:
        ``(mean, std)`` as floats.

    Raises:
        ValueError: If the dataset is empty.
    """
    dataset_size = len(dataset)
    if dataset_size == 0:
        raise ValueError("Cannot estimate statistics of an empty dataset.")

    sample_size = int(np.ceil((zscore**2 * p * (1 - p)) / (moe**2)))
    # Sampling without replacement cannot draw more items than exist.
    sample_size = min(sample_size, dataset_size)
    sample_indices = np.random.choice(
        np.arange(dataset_size), size=sample_size, replace=False
    )

    mean_total = 0
    std_total = 0
    for i in sample_indices:
        features = dataset[i][0]
        mean_total += features.mean().item()
        std_total += features.std().item()

    mean = mean_total / sample_size
    std = std_total / sample_size
    print("std", std)
    print("mean", mean)
    return mean, std


def get_datasets(dataset_config: dict, feature_extractor) -> Dataset:
    """
    Instantiates every dataset named in ``dataset_config`` and concatenates them.

    Args:
        dataset_config: Maps a dotted ``module.ClassName`` path to the keyword
            arguments used to construct that class.
        feature_extractor: Callable applied to every sample of the result.

    Returns:
        A ``PipelinedDataset`` wrapping the ``ConcatDataset`` of all datasets.
    """
    datasets = []
    for dataset_path, kwargs in dataset_config.items():
        module_name, class_name = dataset_path.rsplit(".", 1)
        module = importlib.import_module(module_name)
        ProvidedDataset = getattr(module, class_name)
        datasets.append(ProvidedDataset(**kwargs))
    return PipelinedDataset(ConcatDataset(datasets), feature_extractor)


def get_class_counts(config: dict):
    """
    Counts, per class, the songs whose label vector has its argmax at that class.

    Reads ``config["datasets"]`` and ``config["dance_ids"]``.

    Returns:
        A dict pairing the sorted ``dance_ids`` with the counts.
    """
    # TODO: Figure out why music4dance has fractional labels
    dataset = get_datasets(config["datasets"], lambda x: x)
    class_indices = np.arange(len(config["dance_ids"]))
    counts = sum(
        np.sum(
            class_indices == np.expand_dims(ds.song_dataset.dance_labels.argmax(1), 1),
            axis=0,
        )
        for ds in dataset._data.datasets
    )
    labels = sorted(config["dance_ids"])
    return dict(zip(labels, counts))


def record_audio_durations(folder: str):
    """
    Records a filename: duration mapping of all audio files in a folder to a json file.

    Filenames are stored relative to ``folder`` and the mapping is written to
    ``folder/audio_durations.json``.
    """
    durations = {}
    music_files = iglob(os.path.join(folder, "**", "*.wav"), recursive=True)
    for file in music_files:
        meta = ta.info(file)
        # Relative keys, because BestBallroomDataset joins each key onto its
        # audio_dir again when it reads this file.
        durations[os.path.relpath(file, folder)] = meta.num_frames / meta.sample_rate

    with open(os.path.join(folder, "audio_durations.json"), "w") as f:
        json.dump(durations, f)