Spaces:
Sleeping
Sleeping
# Class for preprocessing the data before the training phase
import os
import pandas as pd
import numpy as np
import logging
from sklearn.preprocessing import StandardScaler
from adtk.data import validate_series, to_events
from adtk.detector import SeasonalAD
from path_config import DATA_DIR
class Preprocessor:
    """Preprocess raw traffic data: label anomalies, split, scale and window it."""

    def __init__(self):
        # Fitted on the training split only; reused to transform val/test.
        self.scaler = StandardScaler()
        # Validated raw time series, set by load_raw_data().
        self.raw_data = None
        # List of (start, end) timestamp pairs marking known anomalous periods.
        self.anomalies_events = None
        self.logger = logging.getLogger(__name__)
        # Sliding-window length of the most recent preprocess_data() run.
        self.window_size = None
def preprocess_data(self, file_path, val_split="2014-10-01", test_split="2014-10-16", window_size=48):
    """
    Run the full preprocessing pipeline and persist the results as .npy files.
    :param file_path: Path to the raw CSV file
    :param val_split: First timestamp of the validation split
    :param test_split: First timestamp of the test split
    :param window_size: The size of the sliding window, default is 48
    """
    # Load the raw data
    self.logger.info("Loading raw data...")
    self.load_raw_data(file_path)
    # Detect anomalies and convert them to events
    self.logger.info("Detecting anomalies...")
    self.load_anomalies_events()
    # Label the anomalies in the raw data
    self.logger.info("Labeling anomalies...")
    self._label_anomalies()
    # Split the data chronologically
    self.logger.info("Splitting the data into training, validation and testing set...")
    train_data, val_data, test_data = self._chronological_split(val_split=val_split, test_split=test_split)
    # Split the data into features and target
    self.logger.info("Splitting the data into features and target...")
    X_train, y_train = self._split_features_target(train_data)
    X_val, y_val = self._split_features_target(val_data)
    X_test, y_test = self._split_features_target(test_data)
    # Scale the data
    self.logger.info("Scaling the data...")
    train_scaled, val_scaled, test_scaled = self._scale_data(X_train, X_val, X_test)
    # Create a sliding window of data
    self.logger.info("Creating sliding window with the length of %s from the data...", window_size)
    train_sequences = self._create_sliding_window(train_scaled, window_size=window_size)
    val_sequences = self._create_sliding_window(val_scaled, window_size=window_size)
    test_sequences = self._create_sliding_window(test_scaled, window_size=window_size)
    self.window_size = window_size
    # Save the preprocessed data. A split of N rows yields N - window_size + 1
    # windows; each window's label is the label of its LAST row, so the first
    # window_size - 1 labels have no matching window and are dropped. This
    # keeps feature and label arrays aligned (previously all N labels were
    # saved, leaving window_size - 1 more labels than feature windows).
    self.logger.info("Saving the preprocessed data...")
    self.save_preprocessed_data(train_sequences, "train_features.npy")
    self.save_preprocessed_data(val_sequences, "val_features.npy")
    self.save_preprocessed_data(test_sequences, "test_features.npy")
    self.save_preprocessed_data(y_train.values[window_size - 1:], "train_labels.npy")
    self.save_preprocessed_data(y_val.values[window_size - 1:], "val_labels.npy")
    self.save_preprocessed_data(y_test.values[window_size - 1:], "test_labels.npy")
    # Use the class logger instead of print, consistent with every other step.
    self.logger.info("Preprocessing completed!")
def load_raw_data(self, file_path):
    """
    Load raw data from a CSV file into self.raw_data.
    :param file_path: Path to the CSV file
    :raises FileNotFoundError: if file_path does not exist
    """
    try:
        df = pd.read_csv(
            file_path,
            usecols=["timestamp", "value"],
            index_col="timestamp",
            parse_dates=True,
        )
    except FileNotFoundError:
        # Fail loudly: silently continuing left raw_data as None and made
        # every later pipeline step fail with a confusing error instead.
        self.logger.error("File path does not exist: %s", file_path)
        raise
    df.sort_index(inplace=True)
    # Rename the columns
    df.rename(columns={"value": "Traffic"}, inplace=True)
    df.index.rename("Timestamp", inplace=True)
    # Validate the time series
    self.raw_data = validate_series(df)
def load_anomalies_events(self):
    """
    Populate self.anomalies_events with a hand-curated list of known
    anomalous periods, each a (start, end) timestamp-string pair.
    """
    if self.raw_data is None:
        # Report through the class logger, consistent with the rest of the class.
        self.logger.warning("Raw data is not loaded")
        return
    events = [
        ('2014-07-04 00:00:00', '2014-07-06 23:59:59'),  # Independence Day Celebration
        ('2014-09-01 00:00:00', '2014-09-01 23:59:59'),  # Labour Day
        ('2014-11-02 00:00:00', '2014-11-02 11:59:59'),  # NYC Marathon 2014
        ('2014-11-27 00:00:00', '2014-11-27 23:59:59'),  # Thanksgiving Day
        ('2014-12-25 00:00:00', '2014-12-26 23:59:59'),  # Christmas Holiday
        ('2015-01-01 00:00:00', '2015-01-01 23:59:59'),  # New Year
        ('2015-01-26 12:00:00', '2015-01-28 11:59:59')   # Snowstorm
    ]
    # Store the events
    self.anomalies_events = events
def _label_anomalies(self):
    """
    Add a binary "Anomaly" column to self.raw_data: 1 for rows inside any
    known event window, 0 elsewhere.
    """
    if self.raw_data is None:
        # Report through the class logger, consistent with the rest of the class.
        self.logger.warning("Raw data is not loaded")
        return
    if self.anomalies_events is None:
        self.logger.warning("Anomalies are not detected")
        return
    # Default everything to normal, then flag the event windows.
    self.raw_data["Anomaly"] = 0
    for start, end in self.anomalies_events:
        # .loc with label slicing includes both endpoints of the range.
        self.raw_data.loc[start:end, "Anomaly"] = 1
def _chronological_split(self, val_split="2014-10-01", test_split="2014-10-16"):
    """
    Split the data chronologically into train, validation, and test sets.
    :param val_split: First timestamp of the validation set (train is strictly before it)
    :param test_split: First timestamp of the test set
    :return: (train_data, val_data, test_data) DataFrames
    :raises ValueError: if the raw data has not been loaded yet
    """
    if self.raw_data is None:
        # Returning None here made the caller's tuple-unpack fail with an
        # opaque TypeError; raise a descriptive error instead.
        raise ValueError("Raw data is not loaded")
    index = self.raw_data.index
    # Split the data
    train_data = self.raw_data.loc[index < val_split]
    val_data = self.raw_data.loc[(index >= val_split) & (index < test_split)]
    test_data = self.raw_data.loc[index >= test_split]
    return train_data, val_data, test_data
def _split_features_target(self, data, target_col="Anomaly"):
    """
    Separate a DataFrame into a feature matrix and a target series.
    :param data: DataFrame containing features plus the target column
    :param target_col: Name of the column to predict
    :return: (features DataFrame, target Series)
    """
    features = data.drop(columns=[target_col])
    target = data[target_col]
    return features, target
def _scale_data(self, train_data, val_data, test_data):
    """
    Standardize the three splits with a scaler fitted on the training data only.
    :param train_data: Training features (used to fit the scaler)
    :param val_data: Validation features
    :param test_data: Test features
    :return: (train_scaled, val_scaled, test_scaled)
    """
    # Lazily (re)create the scaler in case it was cleared.
    if self.scaler is None:
        self.scaler = StandardScaler()
    # Fit on the training split only, then reuse the fitted statistics.
    scaled_train = self.scaler.fit_transform(train_data)
    return (
        scaled_train,
        self.scaler.transform(val_data),
        self.scaler.transform(test_data),
    )
def _create_sliding_window(self, data, window_size=48, step_size=1):
    """
    Cut the data into overlapping fixed-length windows.
    :param data: Scaled data (sliceable sequence or array)
    :param window_size: Length of each window
    :param step_size: Offset between consecutive window starts
    :return: numpy array stacking every window
    """
    starts = range(0, len(data) - window_size + 1, step_size)
    return np.array([data[start : start + window_size] for start in starts])
def save_preprocessed_data(self, data, file_path):
    """
    Save preprocessed data under DATA_DIR/preprocessed_data as a .npy file.
    :param data: Preprocessed array-like data
    :param file_path: Target file name (not a full path)
    """
    dir_path = os.path.join(DATA_DIR, "preprocessed_data")
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists() guard if several writers run concurrently.
    os.makedirs(dir_path, exist_ok=True)
    # Separate name instead of shadowing the file_path parameter.
    full_path = os.path.join(dir_path, file_path)
    np.save(full_path, data)
    self.logger.info("%s has been saved successfully!", full_path)
def get_seq_length(self):
    """
    Return the sliding-window length of the last preprocessing run
    (None if preprocess_data has not been called yet).
    """
    return self.window_size