"""Functions for denoising data during preprocessing.""" from typing import Optional, Tuple, cast import numpy as np from numpy import ndarray def denoise( X: ndarray, y: ndarray, Xresampled: Optional[ndarray] = None, random_state: Optional[np.random.RandomState] = None, ) -> Tuple[ndarray, ndarray]: """Denoise the dataset using a Gaussian process.""" from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel() gpr = GaussianProcessRegressor( kernel=gp_kernel, n_restarts_optimizer=50, random_state=random_state ) gpr.fit(X, y) if Xresampled is not None: return Xresampled, cast(ndarray, gpr.predict(Xresampled)) return X, cast(ndarray, gpr.predict(X)) def multi_denoise( X: ndarray, y: ndarray, Xresampled: Optional[ndarray] = None, random_state: Optional[np.random.RandomState] = None, ): """Perform `denoise` along each column of `y` independently.""" y = np.stack( [ denoise(X, y[:, i], Xresampled=Xresampled, random_state=random_state)[1] for i in range(y.shape[1]) ], axis=1, ) if Xresampled is not None: return Xresampled, y return X, y