# Spaces: tgd1115/neuro-orion-v1 (status: Sleeping)
# File size: 8,045 Bytes
# Class for preprocessing the data before the training phase
import os
import pandas as pd
import numpy as np
import logging
from sklearn.preprocessing import StandardScaler
from adtk.data import validate_series, to_events
from adtk.detector import SeasonalAD
from path_config import DATA_DIR


class Preprocessor:
    """
    Prepare a traffic time series for the training phase.

    The pipeline loads a CSV file with ``timestamp`` and ``value`` columns,
    labels a fixed list of known anomalous periods, splits the series
    chronologically into training/validation/test sets, standardises the
    features, cuts them into sliding windows and saves the resulting arrays
    as ``.npy`` files.
    """

    # Known anomalous periods as (start, end) timestamp strings. Both ends
    # are inclusive because they are used for label-based ``.loc`` slicing.
    KNOWN_ANOMALY_EVENTS = (
        ('2014-07-04 00:00:00', '2014-07-06 23:59:59'),  # Independence Day Celebration
        ('2014-09-01 00:00:00', '2014-09-01 23:59:59'),  # Labour Day
        ('2014-11-02 00:00:00', '2014-11-02 11:59:59'),  # NYC Marathon 2014
        ('2014-11-27 00:00:00', '2014-11-27 23:59:59'),  # Thanksgiving Day
        ('2014-12-25 00:00:00', '2014-12-26 23:59:59'),  # Christmas Holiday
        ('2015-01-01 00:00:00', '2015-01-01 23:59:59'),  # New Year
        ('2015-01-26 12:00:00', '2015-01-28 11:59:59'),  # Snowstorm
    )

    def __init__(self):
        self.scaler = StandardScaler()
        # DataFrame set by load_raw_data(); None until a load succeeds
        self.raw_data = None
        # List of (start, end) pairs set by load_anomalies_events()
        self.anomalies_events = None
        self.logger = logging.getLogger(__name__)
        # Window length used by the last completed preprocess_data() run
        self.window_size = None

    def preprocess_data(self, file_path, val_split="2014-10-01", test_split="2014-10-16", window_size=48):
        """
        Run the whole preprocessing pipeline and save the results to disk.

        :param file_path: Path to the CSV file
        :param val_split: First timestamp (inclusive) of the validation set
        :param test_split: First timestamp (inclusive) of the test set
        :param window_size: The size of the sliding window, default is 48
        :raises FileNotFoundError: If the CSV file could not be loaded
        """
        # Load the raw data
        self.logger.info("Loading raw data...")
        self.load_raw_data(file_path)
        if self.raw_data is None:
            # load_raw_data() only logs a missing file. Fail here with a clear
            # error instead of continuing and crashing later while unpacking
            # the result of the split.
            raise FileNotFoundError(f"File path does not exist: {file_path}")

        # Load the known anomaly periods
        self.logger.info("Detecting anomalies...")
        self.load_anomalies_events()

        # Label the anomalies in the raw data
        self.logger.info("Labeling anomalies...")
        self._label_anomalies()

        # Split the data chronologically
        self.logger.info("Splitting the data into training, validation and testing set...")
        train_data, val_data, test_data = self._chronological_split(val_split=val_split, test_split=test_split)

        # Split the data into features and target
        self.logger.info("Splitting the data into features and target...")
        X_train, y_train = self._split_features_target(train_data)
        X_val, y_val = self._split_features_target(val_data)
        X_test, y_test = self._split_features_target(test_data)

        # Scale the data
        self.logger.info("Scaling the data...")
        train_scaled, val_scaled, test_scaled = self._scale_data(X_train, X_val, X_test)

        # Create a sliding window of data
        self.logger.info(f"Creating sliding window with the length of {window_size} from the data...")
        train_sequences = self._create_sliding_window(train_scaled, window_size=window_size)
        val_sequences = self._create_sliding_window(val_scaled, window_size=window_size)
        test_sequences = self._create_sliding_window(test_scaled, window_size=window_size)

        self.window_size = window_size

        # Save the preprocessed data.
        # NOTE(review): the labels are saved per time step while the features
        # are saved per window, so the two arrays differ in length - confirm
        # that the consumer of these files expects that.
        self.logger.info("Saving the preprocessed data...")
        outputs = (
            (train_sequences, "train_features.npy"),
            (val_sequences, "val_features.npy"),
            (test_sequences, "test_features.npy"),
            (y_train.values, "train_labels.npy"),
            (y_val.values, "val_labels.npy"),
            (y_test.values, "test_labels.npy"),
        )
        for array, file_name in outputs:
            self.save_preprocessed_data(array, file_name)

        print("Preprocessing completed!")

    def load_raw_data(self, file_path):
        """
        Load raw data from a CSV file into ``self.raw_data``.

        A missing file is logged and leaves ``self.raw_data`` set to None.

        :param file_path: Path to the CSV file
        """
        # Forget any previously loaded data so that a failed load can never
        # leave stale data behind for the following pipeline steps.
        self.raw_data = None

        try:
            df = pd.read_csv(
                file_path,
                usecols=["timestamp", "value"],
                index_col="timestamp",
                parse_dates=True,
            )
        except FileNotFoundError:
            self.logger.error(f"File path does not exist: {file_path}")
            return

        df = df.sort_index()

        # Rename the columns
        df = df.rename(columns={"value": "Traffic"})
        df.index.name = "Timestamp"

        # Validate the time series
        self.raw_data = validate_series(df)

    def load_anomalies_events(self):
        """
        Load the known anomaly events into ``self.anomalies_events``.

        The events are a fixed list of (start, end) periods, they are not
        computed from the data. Nothing is stored when the raw data is not
        loaded.
        """
        if self.raw_data is None:
            self.logger.warning("Raw data is not loaded")
            return

        # Store the events
        self.anomalies_events = list(self.KNOWN_ANOMALY_EVENTS)

    def _label_anomalies(self):
        """
        Add an ``Anomaly`` column to the raw data.

        The column is 1 inside every known anomaly period and 0 elsewhere.
        Nothing is changed when the raw data or the events are not loaded.
        """
        if self.raw_data is None:
            self.logger.warning("Raw data is not loaded")
            return

        if self.anomalies_events is None:
            self.logger.warning("Anomalies are not detected")
            return

        # Label the anomalies as 1 and 0
        self.raw_data["Anomaly"] = 0
        for start, end in self.anomalies_events:
            self.raw_data.loc[start:end, "Anomaly"] = 1

    def _chronological_split(self, val_split="2014-10-01", test_split="2014-10-16"):
        """
        Split the data chronologically into train, validation, and test sets.

        :param val_split: Validation split date, first timestamp of the validation set
        :param test_split: Test split date, first timestamp of the test set
        :return: Tuple of (train_data, val_data, test_data)
        :raises ValueError: If the raw data is not loaded
        """
        if self.raw_data is None:
            # Callers unpack three values, so returning None would only move
            # the failure to a less helpful place.
            raise ValueError("Raw data is not loaded")

        timestamps = self.raw_data.index
        before_val = timestamps < val_split
        from_test = timestamps >= test_split

        # Split the data: [.., val_split) | [val_split, test_split) | [test_split, ..]
        train_data = self.raw_data.loc[before_val]
        val_data = self.raw_data.loc[(timestamps >= val_split) & (timestamps < test_split)]
        test_data = self.raw_data.loc[from_test]

        return train_data, val_data, test_data

    def _split_features_target(self, data, target_col="Anomaly"):
        """
        Split the data into features and target.

        :param data: DataFrame containing the data
        :param target_col: Column to predict
        :return: Tuple of (features DataFrame without the target, target Series)
        """
        X = data.drop(columns=[target_col])
        y = data[target_col]

        return X, y

    def _scale_data(self, train_data, val_data, test_data):
        """
        Scale the data using StandardScaler.

        The scaler is fitted on the training data only and then applied to
        the validation and test data.

        :param train_data: Training data
        :param val_data: Validation data
        :param test_data: Test data
        :return: Tuple of (train_scaled, val_scaled, test_scaled)
        """
        if self.scaler is None:
            self.scaler = StandardScaler()

        # Fit on the training data, reuse the fitted statistics for the rest
        train_scaled = self.scaler.fit_transform(train_data)
        val_scaled = self.scaler.transform(val_data)
        test_scaled = self.scaler.transform(test_data)

        return train_scaled, val_scaled, test_scaled

    def _create_sliding_window(self, data, window_size=48, step_size=1):
        """
        Create a sliding window of data.

        :param data: Scaled data, windows are cut along the first axis
        :param window_size: Size of the window
        :param step_size: Step size for the window
        :return: Array of shape (number of windows, window_size, ...); the
                 first dimension is 0 when the data is shorter than one window
        :raises ValueError: If window_size or step_size is smaller than 1
        """
        if window_size < 1:
            raise ValueError(f"window_size must be at least 1, got {window_size}")
        if step_size < 1:
            raise ValueError(f"step_size must be at least 1, got {step_size}")

        values = np.asarray(data)
        starts = range(0, len(values) - window_size + 1, step_size)

        if len(starts) == 0:
            # Keep the window and feature dimensions so that consumers see a
            # consistent shape even when no complete window fits.
            return np.empty((0, window_size) + values.shape[1:], dtype=values.dtype)

        return np.stack([values[start:start + window_size] for start in starts])

    def save_preprocessed_data(self, data, file_path):
        """
        Save preprocessed data to a .npy file.

        The file is written to the ``preprocessed_data`` folder inside
        ``DATA_DIR``; the folder is created when it does not exist.

        :param data: Preprocessed data
        :param file_path: Name of the .npy file inside the output folder
        """
        dir_path = os.path.join(DATA_DIR, "preprocessed_data")

        # exist_ok avoids the race between checking for and creating the folder
        os.makedirs(dir_path, exist_ok=True)

        file_path = os.path.join(dir_path, file_path)

        np.save(file_path, data)
        self.logger.info(f"{file_path} has been saved successfully!")

    def get_seq_length(self):
        """
        Get the length of the sequence.

        :return: Window size of the last completed preprocess_data() run,
                 None when it has not run yet
        """
        return self.window_size