import os
import multiprocessing

import pandas as pd
import torch
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from imblearn.over_sampling import SMOTE


class Preprocessor(pl.LightningDataModule):
    def __init__(self, batch_size):
        super().__init__()
        self.dataset = pd.read_csv('dataset/simulated_electrical_grid.csv')
        self.batch_size = batch_size
        self.oversampling = SMOTE(random_state=42)

    def setup(self, stage=None):
        train_set, valid_set, test_set = self.preprocessor()
        # Lightning may call setup() with stage=None, "fit", "validate", or "test";
        # assign the splits required for each stage.
        if stage in (None, "fit", "validate"):
            self.train_set = train_set
            self.valid_set = valid_set
        if stage in (None, "test"):
            self.test_set = test_set

    def train_dataloader(self):
        return DataLoader(
            dataset=self.train_set,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=multiprocessing.cpu_count()
        )

    def val_dataloader(self):
        return DataLoader(
            dataset=self.valid_set,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=multiprocessing.cpu_count()
        )

    def test_dataloader(self):
        return DataLoader(
            dataset=self.test_set,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=multiprocessing.cpu_count()
        )

    def preprocessor(self):
        # Reuse cached splits if they exist; otherwise run the preprocessing pipeline.
        # Note: with PyTorch >= 2.6, torch.load may need weights_only=False to
        # unpickle TensorDataset objects.
        if (os.path.exists("dataset/train_set.pt")
                and os.path.exists("dataset/valid_set.pt")
                and os.path.exists("dataset/test_set.pt")):
            print("\nLoading Data...")
            train_set = torch.load("dataset/train_set.pt")
            valid_set = torch.load("dataset/valid_set.pt")
            test_set = torch.load("dataset/test_set.pt")
            print('[ Loading Completed ]\n')
        else:
            print("\nPreprocessing Data...")
            train_set, valid_set, test_set = self.preprocessing_data(self.dataset)
            print('[ Preprocessing Completed ]\n')

        return train_set, valid_set, test_set

    def preprocessing_data(self, dataset):
        X = dataset[['tau1', 'tau2', 'tau3', 'tau4',
                     'p1', 'p2', 'p3', 'p4',
                     'g1', 'g2', 'g3', 'g4']]
        y = dataset['stabf']

        # Caveat: SMOTE is applied to the full dataset before splitting, so synthetic
        # samples can leak across the train/validation/test splits. Resampling only
        # the training split would avoid this.
        X_res, y_res = self.oversampling.fit_resample(X, y)
        y_res = self.label_encoding(y_res)

        # 80/20 split into train+valid vs. test, then 90/10 into train vs. valid.
        X_train_valid, X_test, y_train_valid, y_test = train_test_split(
            X_res, y_res, test_size=0.2, random_state=42
        )
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train_valid, y_train_valid, test_size=0.1, random_state=42
        )

        X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train.values)
        X_valid_tensor = torch.tensor(X_valid.values, dtype=torch.float32)
        y_valid_tensor = torch.tensor(y_valid.values)
        X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test.values)

        train_set = TensorDataset(X_train_tensor, y_train_tensor)
        valid_set = TensorDataset(X_valid_tensor, y_valid_tensor)
        test_set = TensorDataset(X_test_tensor, y_test_tensor)

        # Cache the splits so subsequent runs can skip preprocessing.
        torch.save(train_set, "dataset/train_set.pt")
        torch.save(valid_set, "dataset/valid_set.pt")
        torch.save(test_set, "dataset/test_set.pt")

        return train_set, valid_set, test_set

    def label_encoding(self, y_train):
        # Map the string labels to integers: 'unstable' -> 0, 'stable' -> 1.
        encoder = {'unstable': 0, 'stable': 1}
        return y_train.astype('str').map(encoder)

    def get_feature_size(self):
        X = self.dataset[['tau1', 'tau2', 'tau3', 'tau4',
                          'p1', 'p2', 'p3', 'p4',
                          'g1', 'g2', 'g3', 'g4']]
        # Input-shape helper: a constant first dimension and the number of feature columns.
        return 1, len(X.columns.tolist())
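
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the DataModule itself). It
# assumes 'dataset/simulated_electrical_grid.csv' is available and pulls one
# training batch to sanity-check the preprocessing pipeline end to end.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    datamodule = Preprocessor(batch_size=64)
    datamodule.setup(stage="fit")

    # Fetch a single batch from the training DataLoader.
    features, labels = next(iter(datamodule.train_dataloader()))
    print(f"feature batch shape: {tuple(features.shape)}")
    print(f"label batch shape:   {tuple(labels.shape)}")
    print(f"feature size:        {datamodule.get_feature_size()}")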