import os
import csv
import logging
from typing import Union, List, Tuple, Generator

import numpy as np
import pandas as pd

from algorithm.datasets import load_data, get_image_size
from algorithm.preprocess import NoiseAdder, MinMaxScaler, StandardScaler
from algorithm.sample import random_sample
from algorithm.nmf import BasicNMF, L2NormNMF, KLDivergenceNMF, ISDivergenceNMF, L21NormNMF, HSCostNMF, L1NormRegularizedNMF, CappedNormNMF, CauchyNMF
from algorithm.user_evaluate import evaluate

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def summary(log_file_name: str) -> pd.DataFrame:
    """
    Summarize an experiment log by averaging rmse, nmi, and acc over seeds.

    Parameter:
    log_file_name (str): The name of the log file to read.

    Return:
    result (pandas.DataFrame): Mean metrics grouped by dataset, noise type, and noise level.
    """
    df = pd.read_csv(log_file_name)
    result = df.groupby(by=['dataset', 'noise_type', 'noise_level'])[['rmse', 'nmi', 'acc']].mean()
    return result
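
# Hedged usage example: assuming a prior Experiment run produced
# 'L1NormRegularizedNMF_log.csv' (the f'{nmf.name}_log.csv' naming convention
# used in Experiment.execute below), the averaged metrics can be inspected with:
#
#     print(summary('L1NormRegularizedNMF_log.csv'))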

class BasicBlock:
    """
    Basic block for the pipeline.
    """
    def basic_info(self, nmf: Union[BasicNMF, str], dataset: str, scaler: str) -> Tuple[str, Union[MinMaxScaler, StandardScaler], BasicNMF]:
        """
        Get the basic information for the pipeline.

        Parameters:
        - nmf (Union[BasicNMF, str]): NMF algorithm to use.
        - dataset (str): Name of the dataset to use.
        - scaler (str): Name of the scaler to use.

        Returns:
        - folder (str): Folder of the dataset.
        - scaler (MinMaxScaler or StandardScaler): Scaler to use.
        - nmf (BasicNMF): NMF algorithm to use.
        """
        # Create mappings for the NMF algorithms, datasets, and scalers
        # Store NMF algorithms in a dictionary
        nmf_dict = {
                'L2NormNMF': L2NormNMF,
                'KLDivergenceNMF': KLDivergenceNMF,
                'ISDivergenceNMF': ISDivergenceNMF,
                'L21NormNMF': L21NormNMF,
                'HSCostNMF': HSCostNMF,
                'L1NormRegularizedNMF': L1NormRegularizedNMF,
                'CappedNormNMF': CappedNormNMF,
                'CauchyNMF': CauchyNMF
        }
        # Store datasets in a dictionary
        dataset_dict = {
                'ORL': 'data/ORL',
                'YaleB': 'data/CroppedYaleB'
        }
        # Store scalers in a dictionary
        scaler_dict = {
                'MinMax': MinMaxScaler(),
                'Standard': StandardScaler()
        }
        folder = dataset_dict.get(dataset, 'data/ORL')
        # Scale the data
        scaler = scaler_dict.get(scaler, MinMaxScaler())
        # Choose an NMF algorithm: pass instances through, instantiate by name otherwise
        if not isinstance(nmf, BasicNMF):
            nmf = nmf_dict.get(nmf, L1NormRegularizedNMF)()
        return folder, scaler, nmf
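
    # Hedged example of the dispatch above:
    #     basic_info('KLDivergenceNMF', 'YaleB', 'Standard')
    # returns ('data/CroppedYaleB', StandardScaler(), KLDivergenceNMF());
    # unrecognized names fall back to 'data/ORL', MinMaxScaler(), and
    # L1NormRegularizedNMF.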
    
    def load_data(self, folder: str, reduce: int=1, random_state: Union[int, np.random.RandomState, None]=None) -> Tuple[np.ndarray, np.ndarray, Tuple[int, int]]:
        """
        Load the data.

        Parameters:
        - folder (str): Folder of the dataset.
        - reduce (int): Factor by which the image size is reduced for visualization.
        - random_state (Union[int, np.random.RandomState, None]): Random state to use for sampling.

        Returns:
        - X_hat (np.ndarray): The data matrix.
        - Y_hat (np.ndarray): The label matrix.
        - img_size (Tuple[int, int]): Size of the images.
        """
        # Load the dataset from the given folder
        X_hat, Y_hat = load_data(folder, reduce=reduce)
        # Randomly sample 90% of the data
        X_hat, Y_hat = random_sample(X_hat, Y_hat, 0.9, random_state=random_state)
        # Get the size of images
        img_size = get_image_size(folder)
        return X_hat, Y_hat, img_size
    
    def add_noise(self, X_hat: np.ndarray, noise_type: str, noise_level: float, random_state: Union[int, np.random.RandomState, None], reduce: int) -> np.ndarray:
        """
        Add noise to the data.

        Parameters:
        - X_hat (np.ndarray): The data matrix.
        - noise_type (str): Type of noise to add to the data.
        - noise_level (float): Level of noise to add to the data.
        - random_state (Union[int, np.random.RandomState, None]): Random state to use for adding noise.
        - reduce (int): Factor by which the image size is reduced for visualization.

        Returns:
        - X_noise (np.ndarray): The noisy data matrix.
        """
        # Create a noise adder seeded with the given random state
        noise_adder = NoiseAdder(random_state=random_state)
        # Create a dictionary of noise functions
        noise_dict = {
                'uniform': (noise_adder.add_uniform_noise, {'X_hat': X_hat, 'noise_level': noise_level}),
                'gaussian': (noise_adder.add_gaussian_noise, {'X_hat': X_hat, 'noise_level': noise_level}),
                'laplacian': (noise_adder.add_laplacian_noise, {'X_hat': X_hat, 'noise_level': noise_level}),
                'salt_and_pepper': (noise_adder.add_salt_and_pepper_noise, {'X_hat': X_hat, 'noise_level': noise_level}),
                'block': (noise_adder.add_block_noise, {'X_hat': X_hat, 'block_size': noise_level, 'img_width': self.img_size[0]//reduce})
        }
        # Map the noise type to the noise function
        noise_func, args = noise_dict.get(noise_type, (noise_adder.add_uniform_noise, {'X_hat': X_hat, 'noise_level': noise_level}))
        # Add noise to the data
        _, X_noise = noise_func(**args)
        return X_noise
    
    def scale(self, X_hat: np.ndarray, X_noise: np.ndarray, scaler: Union[MinMaxScaler, StandardScaler]) -> Tuple[np.ndarray, np.ndarray]:
        """
        Scale the data.

        Parameters:
        - X_hat (np.ndarray): The data matrix.
        - X_noise (np.ndarray): The noisy data matrix.
        - scaler (MinMaxScaler or StandardScaler): Scaler to use for scaling the data.

        Returns:
        - X_hat_scaled (np.ndarray): The scaled data matrix.
        - X_noise_scaled (np.ndarray): The scaled noisy data matrix.
        """
        # Scale the data
        X_hat_scaled = scaler.fit_transform(X_hat)
        X_noise_scaled = scaler.transform(X_noise)
        # Ensure the scaled noisy data is non-negative by shifting its minimum to zero
        min_val = np.min(X_noise_scaled)
        if min_val < 0:
            X_noise_scaled -= min_val
        return X_hat_scaled, X_noise_scaled
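
    # Worked note on the shift in scale(): if StandardScaler leaves
    # X_noise_scaled with a minimum of -0.3, subtracting that minimum adds 0.3
    # to every entry, so the smallest value becomes exactly 0 and the matrix
    # remains a valid non-negative input for NMF.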

class Pipeline(BasicBlock):
    def __init__(self, nmf: Union[str, BasicNMF], dataset: str='ORL', reduce: int=1, noise_type: str='uniform', 
                 noise_level: float=0.02, random_state: int=3407, scaler: str='MinMax') -> None:
        """
        Initialize the pipeline.

        Parameters:
        - nmf (str or BasicNMF): Name of the NMF algorithm to use.
        - dataset (str): Name of the dataset to use.
        - reduce (int): Factor by which the image size is reduced for visualization.
        - noise_type (str): Type of noise to add to the data.
        - noise_level (float): Level of noise to add to the data.
        - random_state (int): Random state to use for the NMF algorithm.
        - scaler (str): Name of the scaler to use for scaling the data.

        Returns:
        None. The constructor initializes the pipeline in place.
        """
        # Get the basic information for the pipeline
        folder, scaler, self.nmf = self.basic_info(nmf, dataset, scaler)
        # Load the data
        X_hat, self.__Y_hat, self.img_size = self.load_data(folder, reduce=reduce, random_state=random_state)
        # Add noise to the data
        X_noise = self.add_noise(X_hat, noise_type, noise_level, random_state, reduce)
        # Scale the data
        self.__X_hat_scaled, self.__X_noise_scaled = self.scale(X_hat, X_noise, scaler)
        self.reduce = reduce
        self.random_state = random_state
        # Drop local references to the large intermediate arrays
        del X_hat, X_noise

    def execute(self, max_iter: int, convergence_trend: bool=False, matrix_size: bool=False, verbose: bool=False):
        """
        Run the pipeline.

        Parameters:
        - max_iter (int): Maximum number of iterations to run the NMF algorithm.
        - convergence_trend (bool): Whether to display the convergence trend of the NMF algorithm.
        - matrix_size (bool): Whether to display the size of the basis and coefficient matrices.
        - verbose (bool): Whether to display the verbose output of the NMF algorithm.

        Returns:
        - metrics: Evaluation metrics computed by the NMF algorithm.
        """
        # Run NMF
        self.nmf.fit(self.__X_noise_scaled, len(set(self.__Y_hat)), max_iter=max_iter, 
                     random_state=self.random_state, imshow=convergence_trend, verbose=verbose)
        # Get the dictionary and representation matrices
        self.D, self.R = self.nmf.D, self.nmf.R
        if matrix_size:
            print('D.shape={}, R.shape={}'.format(self.D.shape, self.R.shape))
        self.metrics = self.nmf.evaluate(self.__X_hat_scaled, self.__Y_hat, random_state=self.random_state)
        return self.metrics

    def evaluate(self, idx: int=2, imshow: bool=False) -> None:
        """
        Evaluate the NMF algorithm.

        Parameters:
        - idx (int): Index of the image to evaluate.
        - imshow (bool): Whether to display the images.
        """
        evaluate(self.nmf, self.metrics, self.__X_hat_scaled, self.__X_noise_scaled, 
                self.img_size, self.reduce, idx, imshow)

    def visualization(self, idx: int=2) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Visualize the NMF algorithm.

        Parameters:
        - idx (int): Index of the image to visualize.

        Returns:
        - X_i (np.ndarray): The original image.
        - X_noise_i (np.ndarray): The noisy image.
        - DR_i (np.ndarray): The reconstructed image.
        """
        DR = np.dot(self.D, self.R).reshape(self.__X_hat_scaled.shape[0], self.__X_hat_scaled.shape[1])
        # Calculate reduced image size based on the 'reduce' factor
        img_size = [i//self.reduce for i in self.img_size]
        # Retrieve the specified image from the data
        X_i = self.__X_hat_scaled[:,idx].reshape(img_size[1],img_size[0])
        X_noise_i = self.__X_noise_scaled[:,idx].reshape(img_size[1],img_size[0])
        DR_i = DR[:,idx].reshape(img_size[1],img_size[0])
        return X_i, X_noise_i, DR_i
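
    # Hedged visualization example (assumes matplotlib is available and
    # 'pipeline' is a fitted Pipeline instance):
    #
    #     import matplotlib.pyplot as plt
    #     X_i, X_noise_i, DR_i = pipeline.visualization(idx=2)
    #     fig, axes = plt.subplots(1, 3)
    #     for ax, img, title in zip(axes, (X_i, X_noise_i, DR_i),
    #                               ('Original', 'Noisy', 'Reconstructed')):
    #         ax.imshow(img, cmap='gray')
    #         ax.set_title(title)
    #     plt.show()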
    
    def cleanup(self) -> None:
        """
        Cleanup method to release resources and delete instances.
        """
        # Delete attributes that might occupy significant memory
        if hasattr(self, 'nmf'):
            del self.nmf, self.__X_hat_scaled, self.__X_noise_scaled, self.D, self.R, self.metrics
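
# Hedged usage sketch for Pipeline: the argument values below are illustrative,
# not prescriptive settings.
#
#     pipeline = Pipeline(nmf='L1NormRegularizedNMF', dataset='ORL', reduce=1,
#                         noise_type='uniform', noise_level=0.02,
#                         random_state=3407, scaler='MinMax')
#     metrics = pipeline.execute(max_iter=500, convergence_trend=True)
#     pipeline.evaluate(idx=2, imshow=True)
#     pipeline.cleanup()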

class Experiment:
    """
    Set up the experiment.
    """
    data_dirs = ['data/ORL', 'data/CroppedYaleB']
    data_container = [[], []]
    noises = {
        'uniform': [0.1, 0.3],
        'gaussian': [0.05, 0.08],
        'laplacian': [0.04, 0.06],
        'salt_and_pepper': [0.02, 0.1],
        'block': [10, 15],
    }
    
    nmf_dict = {
        'L2NormNMF': L2NormNMF,
        'KLDivergenceNMF': KLDivergenceNMF,
        'ISDivergenceNMF': ISDivergenceNMF,
        'L21NormNMF': L21NormNMF,
        'HSCostNMF': HSCostNMF,
        'L1NormRegularizedNMF': L1NormRegularizedNMF,
        'CappedNormNMF': CappedNormNMF,
        'CauchyNMF': CauchyNMF,
    }
    
    def __init__(self, seeds: Union[List[int], None]=None) -> None:
        """
        Initialize the experiment.

        Parameters:
        - seeds (List[int]): Random seeds to use for the experiment.
        """
        self.seeds = [0, 42, 99, 512, 3407] if seeds is None else seeds

    def choose(self, nmf: Union[str, BasicNMF]) -> None:
        """
        Choose an NMF algorithm. Essentially, this method sets the NMF algorithm to use for the experiment.

        Parameters:
        - nmf (Union[str, BasicNMF]): NMF algorithm to use.
        """
        if isinstance(nmf, BasicNMF):
            self.nmf = nmf
        else:
            # Instantiate the NMF algorithm by name, defaulting to L1NormRegularizedNMF
            self.nmf = self.nmf_dict.get(nmf, L1NormRegularizedNMF)()

    def data_loader(self) -> Generator[Tuple[str, int, np.ndarray, np.ndarray, np.ndarray, str, float], None, None]:
        """
        Construct a generator to load the data.

        Yields:
        - data_file (str): Name of the dataset.
        - seed (int): Random seed to use for the experiment.
        - X_hat_scaled (np.ndarray): The scaled data matrix.
        - Y_hat (np.ndarray): The label matrix.
        - X_noise_scaled (np.ndarray): The scaled noisy data matrix.
        - noise_type (str): Type of noise to add to the data.
        - noise_level (float): Level of noise to add to the data.
        """
        scaler = MinMaxScaler()
        # Data file loop
        for data_file in self.data_dirs:
            reduce = 1 if data_file.endswith('ORL') else 3
            image_size = get_image_size(data_file)
            X_hat_, Y_hat_ = load_data(root=data_file, reduce=reduce)
            # Random seed loop
            for seed in self.seeds:
                noise_adder = NoiseAdder(random_state=seed)
                X_hat, Y_hat = random_sample(X_hat_, Y_hat_, 0.9, random_state=seed)
                X_hat_scaled = scaler.fit_transform(X_hat)
                # Noise type loop
                for noise_type in self.noises:
                    add_noise_ = getattr(noise_adder, f'add_{noise_type}_noise')
                    # Noise level loop
                    for noise_level in self.noises[noise_type]:
                        # Block noise takes a block size and an image width; call with
                        # keywords to match the call convention in BasicBlock.add_noise
                        if noise_type == 'block':
                            _, X_noise = add_noise_(X_hat, block_size=noise_level, img_width=image_size[0]//reduce)
                        else:
                            _, X_noise = add_noise_(X_hat, noise_level=noise_level)
                        X_noise_scaled = scaler.transform(X_noise)
                        # Shift the scaled noisy data so its minimum is zero (keeps it non-negative)
                        min_val = np.min(X_noise_scaled)
                        if min_val < 0:
                            X_noise_scaled -= min_val
                        yield data_file.split("/")[-1], seed, X_hat_scaled, Y_hat, X_noise_scaled, noise_type, noise_level
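
    # Note: with the class defaults (2 datasets, 5 seeds, 5 noise types with
    # 2 levels each), this generator yields 2 * 5 * 5 * 2 = 100 configurations.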
    
    def sync_fit(self, dataset: str, seed: int, X_hat_scaled: np.ndarray, Y_hat: np.ndarray, X_noise_scaled: np.ndarray, noise_type: str, noise_level: float) -> Tuple[str, str, float, int, float, float, float]:
        """
        Fit the NMF algorithm on the dataset with noise synchronously.

        Parameters:
        - dataset (str): Name of the dataset.
        - seed (int): Random seed to use for the experiment.
        - X_hat_scaled (np.ndarray): The scaled data matrix.
        - Y_hat (np.ndarray): The label matrix.
        - X_noise_scaled (np.ndarray): The scaled noisy data matrix.
        - noise_type (str): Type of noise to add to the data.
        - noise_level (float): Level of noise to add to the data.

        Returns:
        - dataset (str): Name of the dataset.
        - noise_type (str): Type of noise to add to the data.
        - noise_level (float): Level of noise to add to the data.
        - seed (int): Random seed to use for the experiment.
        - rmse (float): Root mean squared error of the NMF algorithm.
        - acc (float): Accuracy of the NMF algorithm.
        - nmi (float): Normalized mutual information of the NMF algorithm.
        """
        self.nmf.fit(X_noise_scaled, len(set(Y_hat)), random_state=seed, verbose=False)
        # Display the current experiment information
        logging.info(f'Dataset: {dataset} Random seed: {seed} - Test on {noise_type} with {noise_level} ended.')
        return dataset, noise_type, noise_level, seed, *self.nmf.evaluate(X_hat_scaled, Y_hat, random_state=seed)
    
    def execute(self) -> None:
        """
        Execute the experiments.
        """
        # Lazy import to avoid multiprocessing error
        import multiprocessing
        results = []
        # Define the multiprocessing pool
        with multiprocessing.Pool(10) as pool:
            for result in pool.starmap(self.sync_fit, self.data_loader()):
                # Append the result to the list
                results.append(result)
        # Write the results to a CSV file, appending if it already exists
        log_file = f'{self.nmf.name}_log.csv'
        mode = 'a' if os.path.exists(log_file) else 'w'
        # newline='' prevents csv.writer from inserting blank rows on Windows
        with open(log_file, mode, newline='') as f:
            writer = csv.writer(f)
            # Write the header only when creating a new file
            if mode == 'w':
                writer.writerow(['dataset', 'noise_type', 'noise_level', 'seed', 'rmse', 'acc', 'nmi'])
            for result in results:
                writer.writerow(result)
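
if __name__ == '__main__':
    # Hedged end-to-end sketch: runs the experiment grid and prints the summary.
    # The __main__ guard is required because Experiment.execute spawns a
    # multiprocessing pool. The log file name assumes the chosen NMF instance's
    # 'name' attribute is 'L1NormRegularizedNMF', matching the
    # f'{self.nmf.name}_log.csv' convention in Experiment.execute.
    experiment = Experiment(seeds=[0, 42])
    experiment.choose('L1NormRegularizedNMF')
    experiment.execute()
    print(summary('L1NormRegularizedNMF_log.csv'))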