# Class for preprocessing the data before the training phase
import os
import pandas as pd
import numpy as np
import logging
from sklearn.preprocessing import StandardScaler
from adtk.data import validate_series, to_events
from adtk.detector import SeasonalAD
from path_config import DATA_DIR


class Preprocessor:
    """
    Turn a raw ``timestamp``/``value`` CSV into scaled sliding-window
    sequences and per-timestamp anomaly labels saved as ``.npy`` files.

    The pipeline (see :meth:`preprocess_data`) loads the CSV, labels a set of
    known anomalous periods, splits the data chronologically, scales the
    features with a scaler fitted on the training split only, builds sliding
    windows and writes everything under ``DATA_DIR/preprocessed_data``.
    """

    # Default (start, end) timestamp pairs labelled as anomalous; both bounds
    # are inclusive when applied in ``_label_anomalies``.
    DEFAULT_ANOMALY_EVENTS = (
        ('2014-07-04 00:00:00', '2014-07-06 23:59:59'),  # Independence Day Celebration
        ('2014-09-01 00:00:00', '2014-09-01 23:59:59'),  # Labour Day
        ('2014-11-02 00:00:00', '2014-11-02 11:59:59'),  # NYC Marathon 2014
        ('2014-11-27 00:00:00', '2014-11-27 23:59:59'),  # Thanksgiving Day
        ('2014-12-25 00:00:00', '2014-12-26 23:59:59'),  # Christmas Holiday
        ('2015-01-01 00:00:00', '2015-01-01 23:59:59'),  # New Year
        ('2015-01-26 12:00:00', '2015-01-28 11:59:59'),  # Snowstorm
    )

    def __init__(self):
        self.scaler = StandardScaler()
        self.raw_data = None
        self.anomalies_events = None
        self.logger = logging.getLogger(__name__)
        self.window_size = None

    def preprocess_data(self, file_path, val_split="2014-10-01", test_split="2014-10-16", window_size=48):
        """
        Preprocess the raw data and save the resulting arrays to disk

        :param file_path: Path to the CSV file
        :param val_split: First timestamp (inclusive) of the validation set
        :param test_split: First timestamp (inclusive) of the test set
        :param window_size: The size of the sliding window, default is 48
        :raises RuntimeError: If the raw data could not be loaded
        """
        # Load the raw data
        self.logger.info("Loading raw data...")
        self.load_raw_data(file_path)
        if self.raw_data is None:
            # Fail here with a clear message instead of crashing later while
            # unpacking the (missing) result of the chronological split.
            raise RuntimeError(f"Raw data could not be loaded from: {file_path}")

        # Detect anomalies and convert them to events
        self.logger.info("Detecting anomalies...")
        self.load_anomalies_events()

        # Label the anomalies in the raw data
        self.logger.info("Labeling anomalies...")
        self._label_anomalies()

        # Split the data chronologically
        self.logger.info("Splitting the data into training, validation and testing set...")
        train_data, val_data, test_data = self._chronological_split(
            val_split=val_split, test_split=test_split
        )

        # Split the data into features and target
        self.logger.info("Splitting the data into features and target...")
        X_train, y_train = self._split_features_target(train_data)
        X_val, y_val = self._split_features_target(val_data)
        X_test, y_test = self._split_features_target(test_data)

        # Scale the data (the scaler is fitted on the training split only)
        self.logger.info("Scaling the data...")
        train_scaled, val_scaled, test_scaled = self._scale_data(X_train, X_val, X_test)

        # Create a sliding window of data
        self.logger.info(
            "Creating sliding window with the length of %s from the data...", window_size
        )
        train_sequences = self._create_sliding_window(train_scaled, window_size=window_size)
        val_sequences = self._create_sliding_window(val_scaled, window_size=window_size)
        test_sequences = self._create_sliding_window(test_scaled, window_size=window_size)
        self.window_size = window_size

        # Save the preprocessed data
        # NOTE(review): the label arrays hold one entry per timestamp, while
        # each feature array holds len(split) - window_size + 1 windows, so
        # consumers have to align labels with windows themselves -- confirm
        # that the downstream code does this.
        self.logger.info("Saving the preprocessed data...")
        self.save_preprocessed_data(train_sequences, "train_features.npy")
        self.save_preprocessed_data(val_sequences, "val_features.npy")
        self.save_preprocessed_data(test_sequences, "test_features.npy")
        self.save_preprocessed_data(y_train.values, "train_labels.npy")
        self.save_preprocessed_data(y_val.values, "val_labels.npy")
        self.save_preprocessed_data(y_test.values, "test_labels.npy")

        # Kept as print so the message stays visible even when logging is
        # not configured by the caller.
        print("Preprocessing completed!")

    def load_raw_data(self, file_path):
        """
        Load raw data from a CSV file into ``self.raw_data``

        The CSV must provide ``timestamp`` and ``value`` columns. The result
        is sorted by time, indexed by ``Timestamp`` and has a single
        ``Traffic`` column. If the file does not exist the error is logged
        and ``self.raw_data`` is reset to ``None`` (no exception is raised),
        so data from an earlier successful load is never reused by mistake.

        :param file_path: Path to the CSV file
        """
        try:
            df = pd.read_csv(
                file_path,
                usecols=["timestamp", "value"],
                index_col="timestamp",
                parse_dates=True,
            )
        except FileNotFoundError:
            self.raw_data = None
            self.logger.error("File path does not exist: %s", file_path)
            return

        df.sort_index(inplace=True)

        # Rename the columns
        df.rename(columns={"value": "Traffic"}, inplace=True)
        df.index.rename("Timestamp", inplace=True)

        # Validate the time series
        self.raw_data = validate_series(df)

    def load_anomalies_events(self, events=None):
        """
        Load the anomalies events into ``self.anomalies_events``

        Does nothing (apart from logging) when the raw data is not loaded.

        :param events: Optional iterable of ``(start, end)`` timestamp pairs;
            defaults to ``DEFAULT_ANOMALY_EVENTS``
        """
        if self.raw_data is None:
            self.logger.error("Raw data is not loaded")
            return

        if events is None:
            events = self.DEFAULT_ANOMALY_EVENTS

        # Store the events
        self.anomalies_events = list(events)

    def _label_anomalies(self):
        """
        Label the anomalies in the raw data

        Adds an ``Anomaly`` column to ``self.raw_data`` (in place) that is 1
        inside every stored event period and 0 elsewhere. Does nothing
        (apart from logging) when the raw data or the events are missing.
        """
        if self.raw_data is None:
            self.logger.error("Raw data is not loaded")
            return

        if self.anomalies_events is None:
            self.logger.error("Anomalies are not detected")
            return

        # Label the anomalies as 1 and 0
        self.raw_data["Anomaly"] = 0
        for start, end in self.anomalies_events:
            # Label-based slicing: both ``start`` and ``end`` are inclusive
            self.raw_data.loc[start:end, "Anomaly"] = 1

    def _chronological_split(self, val_split="2014-10-01", test_split="2014-10-16"):
        """
        Split the data chronologically into train, validation, and test sets

        :param val_split: Validation split date (first timestamp of the validation set)
        :param test_split: Test split date (first timestamp of the test set)
        :return: Tuple ``(train_data, val_data, test_data)``, or ``None`` when
            the raw data is not loaded
        """
        if self.raw_data is None:
            self.logger.error("Raw data is not loaded")
            return

        # Split the data: train < val_split <= validation < test_split <= test
        index = self.raw_data.index
        train_data = self.raw_data.loc[index < val_split]
        val_data = self.raw_data.loc[(index >= val_split) & (index < test_split)]
        test_data = self.raw_data.loc[index >= test_split]

        return train_data, val_data, test_data

    def _split_features_target(self, data, target_col="Anomaly"):
        """
        Split the data into features and target

        :param data: DataFrame containing the data
        :param target_col: Column to predict
        :return: Tuple ``(X, y)`` where ``X`` is ``data`` without
            ``target_col`` and ``y`` is the ``target_col`` column
        """
        X = data.drop(columns=[target_col])
        y = data[target_col]

        return X, y

    def _scale_data(self, train_data, val_data, test_data):
        """
        Scale the data using StandardScaler

        The scaler is fitted on the training data only and then applied to
        the validation and test data.

        :param train_data: Training data
        :param val_data: Validation data
        :param test_data: Test data
        :return: Tuple ``(train_scaled, val_scaled, test_scaled)``
        """
        if self.scaler is None:
            self.scaler = StandardScaler()

        # Fit and transform the training data
        train_scaled = self.scaler.fit_transform(train_data)
        val_scaled = self.scaler.transform(val_data)
        test_scaled = self.scaler.transform(test_data)

        return train_scaled, val_scaled, test_scaled

    def _create_sliding_window(self, data, window_size=48, step_size=1):
        """
        Create a sliding window of data

        :param data: Scaled data (array-like, windows are taken along axis 0)
        :param window_size: Size of the window
        :param step_size: Step size for the window
        :return: Array of shape ``(n_windows, window_size, ...)``; when the
            data is shorter than ``window_size`` the result has zero windows
            but keeps the trailing dimensions
        :raises ValueError: If ``window_size`` or ``step_size`` is below 1
        """
        if window_size < 1:
            raise ValueError(f"window_size must be at least 1, got {window_size}")
        if step_size < 1:
            raise ValueError(f"step_size must be at least 1, got {step_size}")

        data = np.asarray(data)
        starts = range(0, len(data) - window_size + 1, step_size)
        if len(starts) == 0:
            # Keep a well-formed shape instead of a shapeless ``(0,)`` array
            return np.empty((0, window_size) + data.shape[1:], dtype=data.dtype)

        return np.array([data[start:start + window_size] for start in starts])

    def save_preprocessed_data(self, data, file_path):
        """
        Save preprocessed data to a .npy file

        The file is written inside ``DATA_DIR/preprocessed_data``, which is
        created when missing.

        :param data: Preprocessed data
        :param file_path: Name of the .npy file, relative to that directory
        """
        dir_path = os.path.join(DATA_DIR, "preprocessed_data")
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(dir_path, exist_ok=True)

        target_path = os.path.join(dir_path, file_path)
        np.save(target_path, data)
        self.logger.info("%s has been saved successfully!", target_path)

    def get_seq_length(self):
        """
        Get the length of the sequence

        :return: The window size used by the last ``preprocess_data`` run, or
            ``None`` if it has not run yet
        """
        return self.window_size