# neuro-orion-v1 / src /pipeline /preprocesser.py
# tgd1115's picture
# Upload 12 files
# 8474315 verified
# Class for preprocessing the data before the training phase
import os
import pandas as pd
import numpy as np
import logging
from sklearn.preprocessing import StandardScaler
from adtk.data import validate_series, to_events
from adtk.detector import SeasonalAD
from path_config import DATA_DIR
class Preprocessor:
    """
    Prepare a raw traffic time series for training.

    The pipeline loads a CSV with ``timestamp`` and ``value`` columns, marks a
    fixed list of known anomalous periods, splits the series chronologically,
    standardises the feature column and cuts every split into overlapping
    windows that are written to ``.npy`` files.
    """

    # Known anomalous periods as (start, end) timestamp strings, both inclusive.
    KNOWN_ANOMALY_EVENTS = (
        ('2014-07-04 00:00:00', '2014-07-06 23:59:59'),  # Independence Day Celebration
        ('2014-09-01 00:00:00', '2014-09-01 23:59:59'),  # Labour Day
        ('2014-11-02 00:00:00', '2014-11-02 11:59:59'),  # NYC Marathon 2014
        ('2014-11-27 00:00:00', '2014-11-27 23:59:59'),  # Thanksgiving Day
        ('2014-12-25 00:00:00', '2014-12-26 23:59:59'),  # Christmas Holiday
        ('2015-01-01 00:00:00', '2015-01-01 23:59:59'),  # New Year
        ('2015-01-26 12:00:00', '2015-01-28 11:59:59'),  # Snowstorm
    )

    def __init__(self):
        self.scaler = StandardScaler()
        self.raw_data = None
        self.anomalies_events = None
        self.logger = logging.getLogger(__name__)
        self.window_size = None

    def preprocess_data(self, file_path, val_split="2014-10-01", test_split="2014-10-16", window_size=48):
        """
        Run the whole preprocessing pipeline and save the results as .npy files
        :param file_path: Path to the CSV file
        :param val_split: First timestamp that belongs to the validation set
        :param test_split: First timestamp that belongs to the test set
        :param window_size: The size of the sliding window, default is 48
        :raises FileNotFoundError: If the raw data could not be loaded from file_path
        """
        # Load the raw data
        self.logger.info("Loading raw data...")
        self.load_raw_data(file_path)
        # load_raw_data only reports a missing file; stop here with a clear
        # error instead of failing later while unpacking the split result.
        if self.raw_data is None:
            raise FileNotFoundError(f"Raw data could not be loaded from: {file_path}")

        # Attach the known anomaly periods
        self.logger.info("Detecting anomalies...")
        self.load_anomalies_events()

        # Label the anomalies in the raw data
        self.logger.info("Labeling anomalies...")
        self._label_anomalies()

        # Split the data chronologically
        self.logger.info("Splitting the data into training, validation and testing set...")
        train_data, val_data, test_data = self._chronological_split(val_split=val_split, test_split=test_split)

        # Split the data into features and target
        self.logger.info("Splitting the data into features and target...")
        X_train, y_train = self._split_features_target(train_data)
        X_val, y_val = self._split_features_target(val_data)
        X_test, y_test = self._split_features_target(test_data)

        # Scale the data (the scaler is fitted on the training split only)
        self.logger.info("Scaling the data...")
        train_scaled, val_scaled, test_scaled = self._scale_data(X_train, X_val, X_test)

        # Create a sliding window of data
        self.logger.info(f"Creating sliding window with the length of {window_size} from the data...")
        train_sequences = self._create_sliding_window(train_scaled, window_size=window_size)
        val_sequences = self._create_sliding_window(val_scaled, window_size=window_size)
        test_sequences = self._create_sliding_window(test_scaled, window_size=window_size)
        self.window_size = window_size

        # Save the preprocessed data
        # NOTE(review): features are saved per window while labels are saved per
        # timestamp, so each label array is window_size - 1 entries longer than
        # its feature array -- confirm that the consumer aligns them.
        self.logger.info("Saving the preprocessed data...")
        self.save_preprocessed_data(train_sequences, "train_features.npy")
        self.save_preprocessed_data(val_sequences, "val_features.npy")
        self.save_preprocessed_data(test_sequences, "test_features.npy")
        self.save_preprocessed_data(y_train.values, "train_labels.npy")
        self.save_preprocessed_data(y_val.values, "val_labels.npy")
        self.save_preprocessed_data(y_test.values, "test_labels.npy")
        print("Preprocessing completed!")

    def load_raw_data(self, file_path):
        """
        Load raw data from a CSV file into self.raw_data
        A missing file is reported on stdout and leaves self.raw_data as None.
        :param file_path: Path to the CSV file
        """
        # Forget any earlier frame so a failed load cannot leave stale data behind
        self.raw_data = None
        try:
            df = pd.read_csv(
                file_path,
                usecols=["timestamp", "value"],
                index_col="timestamp",
                parse_dates=True,
            )
        except FileNotFoundError:
            print(f"File path does not exist: {file_path}")
            return

        df.sort_index(inplace=True)
        # Rename the columns
        df.rename(columns={"value": "Traffic"}, inplace=True)
        df.index.rename("Timestamp", inplace=True)
        # Validate the time series
        self.raw_data = validate_series(df)

    def load_anomalies_events(self):
        """
        Store the list of known anomaly periods in self.anomalies_events
        Does nothing (apart from a message) when no raw data is loaded.
        """
        if self.raw_data is None:
            print("Raw data is not loaded")
            return
        # Copy into a fresh list so callers may modify it without touching the constant
        self.anomalies_events = list(self.KNOWN_ANOMALY_EVENTS)

    def _label_anomalies(self):
        """
        Add an "Anomaly" column to the raw data: 1 inside a known event, else 0
        """
        if self.raw_data is None:
            print("Raw data is not loaded")
            return
        if self.anomalies_events is None:
            print("Anomalies are not detected")
            return
        # Label the anomalies as 1 and 0
        self.raw_data["Anomaly"] = 0
        for start, end in self.anomalies_events:
            # Label-based slicing on the timestamp index includes both ends
            self.raw_data.loc[start:end, "Anomaly"] = 1

    def _chronological_split(self, val_split="2014-10-01", test_split="2014-10-16"):
        """
        Split the data chronologically into train, validation, and test sets
        :param val_split: Validation split date (first timestamp of the validation set)
        :param test_split: Test split date (first timestamp of the test set)
        :return: Tuple (train_data, val_data, test_data), or None when no raw data is loaded
        """
        if self.raw_data is None:
            print("Raw data is not loaded")
            return
        timestamps = self.raw_data.index
        # Half-open intervals: [start, val_split), [val_split, test_split), [test_split, end]
        train_data = self.raw_data.loc[timestamps < val_split]
        val_data = self.raw_data.loc[(timestamps >= val_split) & (timestamps < test_split)]
        test_data = self.raw_data.loc[timestamps >= test_split]
        return train_data, val_data, test_data

    def _split_features_target(self, data, target_col="Anomaly"):
        """
        Split the data into features and target
        :param data: DataFrame containing the data
        :param target_col: Column to predict
        :return: Tuple (X, y): the frame without target_col, and the target_col column
        """
        X = data.drop(columns=[target_col])
        y = data[target_col]
        return X, y

    def _scale_data(self, train_data, val_data, test_data):
        """
        Scale the data using StandardScaler
        The scaler is fitted on the training data only and then applied to the
        validation and test data.
        :param train_data: Training data
        :param val_data: Validation data
        :param test_data: Test data
        :return: Tuple (train_scaled, val_scaled, test_scaled)
        """
        if self.scaler is None:
            self.scaler = StandardScaler()
        # Fit and transform the training data
        train_scaled = self.scaler.fit_transform(train_data)
        val_scaled = self.scaler.transform(val_data)
        test_scaled = self.scaler.transform(test_data)
        return train_scaled, val_scaled, test_scaled

    def _create_sliding_window(self, data, window_size=48, step_size=1):
        """
        Create a sliding window of data
        :param data: Scaled data
        :param window_size: Size of the window
        :param step_size: Step size for the window
        :return: Array holding one window per start position; empty when data
                 is shorter than window_size
        """
        last_start = len(data) - window_size
        windows = [
            data[start : start + window_size]
            for start in range(0, last_start + 1, step_size)
        ]
        return np.array(windows)

    def save_preprocessed_data(self, data, file_path):
        """
        Save preprocessed data to a .npy file
        :param data: Preprocessed data
        :param file_path: File name, saved inside DATA_DIR/preprocessed_data
        """
        dir_path = os.path.join(DATA_DIR, "preprocessed_data")
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(dir_path, exist_ok=True)
        file_path = os.path.join(dir_path, file_path)
        np.save(file_path, data)
        self.logger.info(f"{file_path} has been saved successfully!")

    def get_seq_length(self):
        """
        Get the length of the sequence
        :return: The window size used by the last preprocess_data run, or None before any run
        """
        return self.window_size