import os
import torch
import multiprocessing
import pandas as pd
import pytorch_lightning as pl

from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from imblearn.over_sampling import SMOTE


class Preprocessor(pl.LightningDataModule):
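    """LightningDataModule for the simulated electrical grid stability dataset.

    Loads the CSV, balances the classes with SMOTE, splits the data into
    train/valid/test TensorDatasets, and caches the splits under dataset/
    as .pt files so later runs skip preprocessing.
    """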
    def __init__(self, batch_size):
        super().__init__()
        self.dataset = pd.read_csv('dataset/simulated_electrical_grid.csv')
        self.batch_size = batch_size
        self.oversampling = SMOTE(random_state=42)

    def setup(self, stage=None):
        train_set, valid_set, test_set = self.preprocessor()
        # stage is None when the Trainer requests every split at once; the
        # previous if/elif assigned nothing in that case.
        if stage in ("fit", None):
            self.train_set = train_set
            self.valid_set = valid_set
        if stage in ("test", None):
            self.test_set = test_set

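    # The loaders below share a worker pool sized to the machine's CPU count;
    # only the training loader shuffles, so validation and test order is stable.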
    def train_dataloader(self):
        return DataLoader(
            dataset=self.train_set,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=multiprocessing.cpu_count()
        )
    
    def val_dataloader(self):
        return DataLoader(
            dataset=self.valid_set,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=multiprocessing.cpu_count()
        )

    def test_dataloader(self):
        return DataLoader(
            dataset=self.test_set,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=multiprocessing.cpu_count()
        )

    def preprocessor(self):
        cache_paths = ["dataset/train_set.pt", "dataset/valid_set.pt", "dataset/test_set.pt"]

        if all(os.path.exists(path) for path in cache_paths):
            print("\nLoading Data...")
            # weights_only=False is required to unpickle full TensorDataset
            # objects, since PyTorch 2.6 changed the torch.load default to
            # weights_only=True (the kwarg needs PyTorch >= 1.13).
            train_set = torch.load(cache_paths[0], weights_only=False)
            valid_set = torch.load(cache_paths[1], weights_only=False)
            test_set = torch.load(cache_paths[2], weights_only=False)
            print('[ Loading Completed ]\n')

        else:
            print("\nPreprocessing Data...")
            train_set, valid_set, test_set = self.preprocessing_data(self.dataset)
            print('[ Preprocessing Completed ]\n')

        return train_set, valid_set, test_set
    
    def preprocessing_data(self, dataset):
        X = dataset[['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2', 'g3', 'g4']]
        y = self.label_encoding(dataset['stabf'])

        # Split before oversampling: running SMOTE on the full dataset would
        # leak synthetic neighbors of validation/test rows into training.
        X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.1, random_state=42)

        # Balance the classes on the training split only.
        X_train, y_train = self.oversampling.fit_resample(X_train, y_train)

        # Build tensors with explicit dtypes: Lightning models default to
        # float32, while .values would otherwise yield float64 features.
        X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)

        X_valid_tensor = torch.tensor(X_valid.values, dtype=torch.float32)
        y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.long)

        X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

        train_set = TensorDataset(X_train_tensor, y_train_tensor)
        valid_set = TensorDataset(X_valid_tensor, y_valid_tensor)
        test_set = TensorDataset(X_test_tensor, y_test_tensor)

        torch.save(train_set, "dataset/train_set.pt")
        torch.save(valid_set, "dataset/valid_set.pt")
        torch.save(test_set, "dataset/test_set.pt")

        return train_set, valid_set, test_set

    def label_encoding(self, labels):
        # Map the string targets to integer class ids: 'unstable' -> 0, 'stable' -> 1.
        encoder = {'unstable': 0, 'stable': 1}

        return labels.astype('str').map(encoder)

    def get_feature_size(self):
        X = self.dataset[['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2', 'g3', 'g4']]

        return 1, len(X.columns)
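

# Minimal usage sketch (an assumption, not part of the original module): it
# expects dataset/simulated_electrical_grid.csv to exist and smoke-tests the
# datamodule by pulling a single training batch.
if __name__ == "__main__":
    datamodule = Preprocessor(batch_size=64)
    datamodule.setup(stage="fit")

    features, targets = next(iter(datamodule.train_dataloader()))
    print(f"batch features: {tuple(features.shape)}, targets: {tuple(targets.shape)}")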