import os
import csv
import logging
from typing import Union, List, Tuple, Generator

import numpy as np
import pandas as pd

from algorithm.datasets import load_data, get_image_size
from algorithm.preprocess import NoiseAdder, MinMaxScaler, StandardScaler
from algorithm.sample import random_sample
from algorithm.nmf import (
    BasicNMF, L2NormNMF, KLDivergenceNMF, ISDivergenceNMF, L21NormNMF,
    HSCostNMF, L1NormRegularizedNMF, CappedNormNMF, CauchyNMF,
)
from algorithm.user_evaluate import evaluate

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def summary(log_file_name: str) -> pd.DataFrame:
    """
    Summarize an experiment log file.

    Parameters:
    - log_file_name (str): The name of the log file to read.

    Returns:
    - result (pd.DataFrame): Mean rmse/nmi/acc grouped by
      (dataset, noise_type, noise_level).
    """
    df = pd.read_csv(log_file_name)
    result = df.groupby(by=['dataset', 'noise_type', 'noise_level'])[['rmse', 'nmi', 'acc']].mean()
    return result


def _shift_non_negative(X: np.ndarray) -> np.ndarray:
    """
    Shift X by |min(X)| when its minimum is negative so every entry is >= 0.

    NOTE(fix): the original code added |min(X)| * |min(X)| (the square of the
    magnitude), which does not make the matrix non-negative; adding |min(X)|
    itself is the correct shift.
    """
    min_val = np.min(X)
    if min_val < 0:
        X = X + np.abs(min_val)
    return X


class BasicBlock(object):
    """
    Basic block for the pipeline: name resolution, data loading,
    noise injection and scaling.
    """

    def basic_info(self, nmf: Union[BasicNMF, str], dataset: str,
                   scaler: str) -> Tuple[str, Union[MinMaxScaler, StandardScaler], BasicNMF]:
        """
        Get the basic information for the pipeline.

        Parameters:
        - nmf (Union[BasicNMF, str]): NMF algorithm to use, either an
          instance or its registered name.
        - dataset (str): Name of the dataset to use ('ORL' or 'YaleB').
        - scaler (str): Name of the scaler to use ('MinMax' or 'Standard').

        Returns:
        - folder (str): Folder of the dataset.
        - scaler (MinMaxScaler or StandardScaler): Scaler instance to use.
        - nmf (BasicNMF): NMF algorithm instance to use.
        """
        # Registry of available NMF algorithms.
        nmf_dict = {
            'L2NormNMF': L2NormNMF,
            'KLDivergenceNMF': KLDivergenceNMF,
            'ISDivergenceNMF': ISDivergenceNMF,
            'L21NormNMF': L21NormNMF,
            'HSCostNMF': HSCostNMF,
            'L1NormRegularizedNMF': L1NormRegularizedNMF,
            'CappedNormNMF': CappedNormNMF,
            'CauchyNMF': CauchyNMF,
        }
        # Dataset name -> data folder.
        dataset_dict = {
            'ORL': 'data/ORL',
            'YaleB': 'data/CroppedYaleB',
        }
        # Scaler name -> scaler instance.
        scaler_dict = {
            'MinMax': MinMaxScaler(),
            'Standard': StandardScaler(),
        }
        # Unknown names fall back to the ORL folder / MinMax scaler.
        folder = dataset_dict.get(dataset, 'data/ORL')
        scaler = scaler_dict.get(scaler, MinMaxScaler())
        # Accept a ready-made NMF instance as-is; otherwise instantiate by
        # name, defaulting to L1NormRegularizedNMF.
        if not isinstance(nmf, BasicNMF):
            nmf = nmf_dict.get(nmf, L1NormRegularizedNMF)()
        return folder, scaler, nmf

    def load_data(self, folder: str, reduce: int = 1,
                  random_state: Union[int, np.random.RandomState, None] = None
                  ) -> Tuple[np.ndarray, np.ndarray, Tuple[int, int]]:
        """
        Load the data.

        Parameters:
        - folder (str): Folder of the dataset.
        - reduce (int): Factor by which the image size is reduced for visualization.
        - random_state (Union[int, np.random.RandomState, None]): Random state
          to use for sampling.

        Returns:
        - X_hat (np.ndarray): The data matrix.
        - Y_hat (np.ndarray): The label matrix.
        - img_size (Tuple[int, int]): Size of the images.
        """
        # NOTE: this calls the module-level load_data imported from
        # algorithm.datasets (the method name shadows it only on instances).
        X_hat, Y_hat = load_data(folder, reduce=reduce)
        # Randomly sample 90% of the data.
        X_hat, Y_hat = random_sample(X_hat, Y_hat, 0.9, random_state=random_state)
        # Get the size of images.
        img_size = get_image_size(folder)
        return X_hat, Y_hat, img_size

    def add_noise(self, X_hat: np.ndarray, noise_type: str, noise_level: float,
                  random_state: Union[int, np.random.RandomState, None],
                  reduce: int) -> np.ndarray:
        """
        Add noise to the data.

        Parameters:
        - X_hat (np.ndarray): The data matrix.
        - noise_type (str): Type of noise to add to the data.
        - noise_level (float): Level of noise to add to the data
          (for 'block' noise this is the block size).
        - random_state (Union[int, np.random.RandomState, None]): Random state
          to use for adding noise.
        - reduce (int): Factor by which the image size is reduced for visualization.

        Returns:
        - X_noise (np.ndarray): The noisy data matrix.

        Note: relies on self.img_size being set beforehand (done by the
        Pipeline constructor via load_data).
        """
        noise_adder = NoiseAdder(random_state=random_state)
        # Dispatch table: noise type -> (noise function, keyword arguments).
        noise_dict = {
            'uniform': (noise_adder.add_uniform_noise,
                        {'X_hat': X_hat, 'noise_level': noise_level}),
            'gaussian': (noise_adder.add_gaussian_noise,
                         {'X_hat': X_hat, 'noise_level': noise_level}),
            'laplacian': (noise_adder.add_laplacian_noise,
                          {'X_hat': X_hat, 'noise_level': noise_level}),
            'salt_and_pepper': (noise_adder.add_salt_and_pepper_noise,
                                {'X_hat': X_hat, 'noise_level': noise_level}),
            'block': (noise_adder.add_block_noise,
                      {'X_hat': X_hat, 'block_size': noise_level,
                       'img_width': self.img_size[0] // reduce}),
        }
        # Unknown noise types fall back to uniform noise.
        noise_func, args = noise_dict.get(
            noise_type,
            (noise_adder.add_uniform_noise, {'X_hat': X_hat, 'noise_level': noise_level}))
        _, X_noise = noise_func(**args)
        return X_noise

    def scale(self, X_hat: np.ndarray, X_noise: np.ndarray,
              scaler: Union[MinMaxScaler, StandardScaler]) -> Tuple[np.ndarray, np.ndarray]:
        """
        Scale the data.

        Parameters:
        - X_hat (np.ndarray): The data matrix.
        - X_noise (np.ndarray): The noisy data matrix.
        - scaler (MinMaxScaler or StandardScaler): Scaler to use for scaling the data.

        Returns:
        - X_hat_scaled (np.ndarray): The scaled data matrix.
        - X_noise_scaled (np.ndarray): The scaled noisy data matrix
          (shifted so it is non-negative, as NMF requires).
        """
        # Fit on the clean data, then apply the same transform to the noisy data.
        X_hat_scaled = scaler.fit_transform(X_hat)
        X_noise_scaled = scaler.transform(X_noise)
        # Ensure that the scaled noisy data is non-negative.
        X_noise_scaled = _shift_non_negative(X_noise_scaled)
        return X_hat_scaled, X_noise_scaled


class Pipeline(BasicBlock):
    """End-to-end single-run pipeline: load, corrupt, scale, factorize, evaluate."""

    def __init__(self, nmf: Union[str, BasicNMF], dataset: str = 'ORL',
                 reduce: int = 1, noise_type: str = 'uniform',
                 noise_level: float = 0.02, random_state: int = 3407,
                 scaler: str = 'MinMax') -> None:
        """
        Initialize the pipeline.

        Parameters:
        - nmf (str or BasicNMF): Name of the NMF algorithm to use.
        - dataset (str): Name of the dataset to use.
        - reduce (int): Factor by which the image size is reduced for visualization.
        - noise_type (str): Type of noise to add to the data.
        - noise_level (float): Level of noise to add to the data.
        - random_state (int): Random state to use for the NMF algorithm.
        - scaler (str): Name of the scaler to use for scaling the data.

        Returns:
        None. The function will initialize the pipeline.
        """
        # Resolve names into concrete objects.
        folder, scaler, self.nmf = self.basic_info(nmf, dataset, scaler)
        # Load the data (also sets self.img_size used by add_noise).
        X_hat, self.__Y_hat, self.img_size = self.load_data(
            folder, reduce=reduce, random_state=random_state)
        # Add noise to the data.
        X_noise = self.add_noise(X_hat, noise_type, noise_level, random_state, reduce)
        # Scale the data.
        self.__X_hat_scaled, self.__X_noise_scaled = self.scale(X_hat, X_noise, scaler)
        self.reduce = reduce
        self.random_state = random_state
        # Release the large intermediate arrays eagerly.
        del X_hat, X_noise

    def execute(self, max_iter: int, convergence_trend: bool = False,
                matrix_size: bool = False, verbose: bool = False) -> None:
        """
        Run the pipeline.

        Parameters:
        - max_iter (int): Maximum number of iterations to run the NMF algorithm.
        - convergence_trend (bool): Whether to display the convergence trend of
          the NMF algorithm.
        - matrix_size (bool): Whether to display the size of the basis and
          coefficient matrices.
        - verbose (bool): Whether to display the verbose output of the NMF algorithm.
        """
        # Run NMF; the number of components equals the number of classes.
        self.nmf.fit(self.__X_noise_scaled, len(set(self.__Y_hat)),
                     max_iter=max_iter, random_state=self.random_state,
                     imshow=convergence_trend, verbose=verbose)
        # Get the dictionary and representation matrices.
        self.D, self.R = self.nmf.D, self.nmf.R
        if matrix_size:
            print('D.shape={}, R.shape={}'.format(self.D.shape, self.R.shape))
        self.metrics = self.nmf.evaluate(self.__X_hat_scaled, self.__Y_hat,
                                         random_state=self.random_state)
        return self.metrics

    def evaluate(self, idx: int = 2, imshow: bool = False) -> None:
        """
        Evaluate the NMF algorithm.

        Parameters:
        - idx (int): Index of the image to evaluate.
        - imshow (bool): Whether to display the images.
        """
        evaluate(self.nmf, self.metrics, self.__X_hat_scaled,
                 self.__X_noise_scaled, self.img_size, self.reduce, idx, imshow)

    def visualization(self, idx: int = 2) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Visualize the NMF algorithm.

        Parameters:
        - idx (int): Index of the image to visualize.

        Returns:
        - X_i (np.ndarray): The original image.
        - X_noise_i (np.ndarray): The noisy image.
        - DR_i (np.ndarray): The reconstructed image.
        """
        # Reconstruct the data matrix from the factorization.
        DR = np.dot(self.D, self.R).reshape(self.__X_hat_scaled.shape[0],
                                            self.__X_hat_scaled.shape[1])
        # Calculate reduced image size based on the 'reduce' factor.
        img_size = [i // self.reduce for i in self.img_size]
        # Retrieve the specified image from each matrix (columns are images).
        X_i = self.__X_hat_scaled[:, idx].reshape(img_size[1], img_size[0])
        X_noise_i = self.__X_noise_scaled[:, idx].reshape(img_size[1], img_size[0])
        DR_i = DR[:, idx].reshape(img_size[1], img_size[0])
        return X_i, X_noise_i, DR_i

    def cleanup(self) -> None:
        """
        Cleanup method to release resources and delete instances.
        """
        # Delete attributes that might occupy significant memory.
        if hasattr(self, 'nmf'):
            del self.nmf, self.__X_hat_scaled, self.__X_noise_scaled, \
                self.D, self.R, self.metrics


class Experiment:
    """
    Set up the experiment: sweep datasets x seeds x noise types x noise levels
    for one NMF algorithm and log the metrics to a CSV file.
    """
    data_dirs = ['data/ORL', 'data/CroppedYaleB']
    data_container = [[], []]
    # Noise type -> list of noise levels (block sizes for 'block' noise).
    noises = {
        'uniform': [0.1, 0.3],
        'gaussian': [0.05, 0.08],
        'laplacian': [0.04, 0.06],
        'salt_and_pepper': [0.02, 0.1],
        'block': [10, 15],
    }
    nmf_dict = {
        'L2NormNMF': L2NormNMF,
        'KLDivergenceNMF': KLDivergenceNMF,
        'ISDivergenceNMF': ISDivergenceNMF,
        'L21NormNMF': L21NormNMF,
        'HSCostNMF': HSCostNMF,
        'L1NormRegularizedNMF': L1NormRegularizedNMF,
        'CappedNormNMF': CappedNormNMF,
        'CauchyNMF': CauchyNMF,
    }

    def __init__(self, seeds: List[int] = None) -> None:
        """
        Initialize the experiment.

        Parameters:
        - seeds (List[int]): Random seeds to use for the experiment
          (defaults to [0, 42, 99, 512, 3407]).
        """
        self.seeds = [0, 42, 99, 512, 3407] if seeds is None else seeds

    def choose(self, nmf: Union[str, BasicNMF]) -> None:
        """
        Choose an NMF algorithm. Essentially, this method sets the NMF
        algorithm to use for the experiment.

        Parameters:
        - nmf (Union[str, BasicNMF]): NMF algorithm to use.
        """
        if isinstance(nmf, BasicNMF):
            self.nmf = nmf
        else:
            # Instantiate by name, defaulting to L1NormRegularizedNMF.
            self.nmf = self.nmf_dict.get(nmf, L1NormRegularizedNMF)()

    def data_loader(self) -> Generator[Tuple[str, int, np.ndarray, np.ndarray,
                                             np.ndarray, str, float], None, None]:
        """
        Construct a generator to load the data.

        Yields:
        - data_file (str): Name of the dataset.
        - seed (int): Random seed to use for the experiment.
        - X_hat_scaled (np.ndarray): The scaled data matrix.
        - Y_hat (np.ndarray): The label matrix.
        - X_noise_scaled (np.ndarray): The scaled noisy data matrix.
        - noise_type (str): Type of noise to add to the data.
        - noise_level (float): Level of noise to add to the data.
        """
        scaler = MinMaxScaler()
        # Data file loop.
        for data_file in self.data_dirs:
            # YaleB images are larger, so reduce them more aggressively.
            reduce = 1 if data_file.endswith('ORL') else 3
            image_size = get_image_size(data_file)
            X_hat_, Y_hat_ = load_data(root=data_file, reduce=reduce)
            # Random seed loop.
            for seed in self.seeds:
                noise_adder = NoiseAdder(random_state=seed)
                X_hat, Y_hat = random_sample(X_hat_, Y_hat_, 0.9, random_state=seed)
                X_hat_scaled = scaler.fit_transform(X_hat)
                # Noise type loop.
                for noise_type in self.noises:
                    add_noise_ = getattr(noise_adder, f'add_{noise_type}_noise')
                    # Noise level loop.
                    for noise_level in self.noises[noise_type]:
                        # Block noise takes (X_hat, img_width, block_size).
                        if noise_type != 'block':
                            _, X_noise = add_noise_(X_hat, noise_level=noise_level)
                        else:
                            _, X_noise = add_noise_(X_hat, image_size[0] // reduce, noise_level)
                        X_noise_scaled = scaler.transform(X_noise)
                        # Ensure non-negativity before NMF.
                        X_noise_scaled = _shift_non_negative(X_noise_scaled)
                        yield (data_file.split("/")[-1], seed, X_hat_scaled,
                               Y_hat, X_noise_scaled, noise_type, noise_level)

    def sync_fit(self, dataset: str, seed: int, X_hat_scaled: np.ndarray,
                 Y_hat: np.ndarray, X_noise_scaled: np.ndarray,
                 noise_type: str, noise_level: float
                 ) -> Tuple[str, str, float, int, float, float, float]:
        """
        Fit the NMF algorithm on the dataset with noise synchronously.

        Parameters:
        - dataset (str): Name of the dataset.
        - seed (int): Random seed to use for the experiment.
        - X_hat_scaled (np.ndarray): The scaled data matrix.
        - Y_hat (np.ndarray): The label matrix.
        - X_noise_scaled (np.ndarray): The scaled noisy data matrix.
        - noise_type (str): Type of noise to add to the data.
        - noise_level (float): Level of noise to add to the data.

        Returns:
        - dataset (str): Name of the dataset.
        - noise_type (str): Type of noise to add to the data.
        - noise_level (float): Level of noise to add to the data.
        - seed (int): Random seed to use for the experiment.
        - rmse (float): Root mean squared error of the NMF algorithm.
        - acc (float): Accuracy of the NMF algorithm.
        - nmi (float): Normalized mutual information of the NMF algorithm.
        """
        self.nmf.fit(X_noise_scaled, len(set(Y_hat)), random_state=seed, verbose=False)
        # Display the current experiment information.
        logging.info(f'Dataset: {dataset} Random seed: {seed} - Test on {noise_type} with {noise_level} ended.')
        return dataset, noise_type, noise_level, seed, \
            *self.nmf.evaluate(X_hat_scaled, Y_hat, random_state=seed)

    def execute(self) -> None:
        """
        Execute the experiments across all datasets/seeds/noise settings in
        parallel and append the results to '<nmf.name>_log.csv'.
        """
        # Lazy import to avoid multiprocessing error.
        import multiprocessing
        results = []
        # Fan the (dataset, seed, noise) combinations out over 10 workers.
        with multiprocessing.Pool(10) as pool:
            for result in pool.starmap(self.sync_fit, self.data_loader()):
                results.append(result)
        # Append to the log file, writing the header only on first creation.
        log_file = f'{self.nmf.name}_log.csv'
        mode = 'a' if os.path.exists(log_file) else 'w'
        # newline='' is required by the csv module to avoid blank rows on Windows.
        with open(log_file, mode, newline='') as f:
            writer = csv.writer(f)
            if mode == 'w':
                writer.writerow(['dataset', 'noise_type', 'noise_level', 'seed',
                                 'rmse', 'acc', 'nmi'])
            writer.writerows(results)