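"""Data loaders for energy-consumption and store-sales forecasting.

`Energy_DataLoader` prepares the hourly LD2011_2014 electricity data for the
Temporal Fusion Transformer (via `pytorch_forecasting`) and for Facebook
Prophet; `StoreDataLoader` does the same for daily store/item sales data.
"""
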
import numpy as np
import pandas as pd
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer




class Energy_DataLoader:
    """
    A class for loading and preparing energy consumption data for modeling.

    Parameters:
        path (str): The path to the data file.
        test_dataset_size (int): The size of the test dataset. Defaults to 24.
        max_prediction_length (int): The maximum prediction length. Defaults to 24.
        max_encoder_length (int): The maximum encoder length. Defaults to 168.

    Methods:
        load_data(): Loads the energy consumption data from a CSV file.
        data_transformation(data): Performs data transformation and preprocessing.
        lead(df, lead): Computes the lead of the power usage time series for each consumer.
        lag(df, lag): Computes the lag of the power usage time series for each consumer.
        select_chunk(data): Selects a subset of the data corresponding to the top 10 consumers.
        time_features(df): Extracts time-based features from the data.
        data_split(df): Splits the data into training and test datasets.
        tft_data(): Prepares the data for training with the Temporal Fusion Transformer (TFT) model.
        fb_data(): Prepares the data for training with the Facebook Prophet model.
    """
    def __init__(self,path:str,test_dataset_size:int=24,
                 max_prediction_length:int=24,
                 max_encoder_length:int=168):
        """
        Initialize the Energy_DataLoader class.

        Parameters:
            path (str): The path to the data file.
            test_dataset_size (int): The size of the test dataset. Defaults to 24.
            max_prediction_length (int): The maximum prediction length. Defaults to 24.
            max_encoder_length (int): The maximum encoder length. Defaults to 168.
        """
        self.path=path
        self.test_dataset_size=test_dataset_size
        self.max_prediction_length=max_prediction_length
        self.max_encoder_length=max_encoder_length

    def load_data(self):
        """
        Load the energy consumption data from a CSV file.

        Returns:
            data (pandas.DataFrame): The loaded data.
        """
        try:
            data = pd.read_csv(self.path, index_col=0, sep=';', decimal=',')
            print('Loaded the data successfully.')
            return data
        except (FileNotFoundError, pd.errors.ParserError) as err:
            print(f"Failed to load the data: {err}")
        
    def data_transformation(self,data:pd.DataFrame):
        """
        Perform data transformation and preprocessing.

        Parameters:
            data (pandas.DataFrame): The input data.

        Returns:
            data (pandas.DataFrame): The transformed data.
        """
        data.index = pd.to_datetime(data.index)
        data.sort_index(inplace=True)
        # resample to hourly means; treat exact zeros as missing readings
        data = data.resample('1h').mean().replace(0., np.nan)
        # drop rows from 2011, then restore the datetime index
        new_data = data.reset_index()
        new_data['year'] = new_data['index'].dt.year
        data1 = new_data.loc[new_data['year'] != 2011]
        data1 = data1.set_index('index')
        data1 = data1.drop(['year'], axis=1)
        return data1
    
    def lead(self,df:pd.DataFrame,lead:int=-1):
        """
        Compute the lead of the power usage time series for each consumer.

        Parameters:
            df (pandas.DataFrame): The input dataframe.
            lead (int): The lead time period. Defaults to -1.

        Returns:
            d_lead (pandas.Series): The lead time series.
        """
        d_lead=df.groupby('consumer_id')['power_usage'].shift(lead)
        return d_lead
    
    def lag(self,df:pd.DataFrame,lag:int=1):
        """
        Compute the lag of the power usage time series for each consumer.

        Parameters:
            df (pandas.DataFrame): The input dataframe.
            lag (int): The lag time period. Defaults to 1.

        Returns:
            d_lag (pandas.Series): The lag time series.
        """
        d_lag=df.groupby('consumer_id')['power_usage'].shift(lag)
        return d_lag
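
    # Illustrative (hypothetical values): for one consumer_id with power_usage
    # [10, 20, 30], lead (shift(-1)) gives [20, 30, NaN] and lag (shift(1))
    # gives [NaN, 10, 20]; grouping by consumer_id prevents values leaking
    # across consumers.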


    def select_chunk(self,data:pd.DataFrame):
        """
        Select a subset of the data corresponding to the top 10 consumers.

        Parameters:
            data (pandas.DataFrame): The input data.

        Returns:
            df (pandas.DataFrame): The selected chunk of data.
        """
        top_10_consumer = data.columns[:10]
        # work with an initial chunk: the first 10 consumer columns
        df = data[top_10_consumer]
        return df


    def time_features(self,df:pd.DataFrame):
        """
        Extract time-based features from the data.

        Parameters:
            df (pandas.DataFrame): The input data.

        Returns:
            time_df (pandas.DataFrame): The dataframe with time-based features.
            earliest_time (pandas.Timestamp): The earliest timestamp in the data.
        """
        earliest_time = df.index.min()
        print(earliest_time)
        df_list = []
        for label in df:
            ts = df[label]

            # restrict each series to its active range (first to last
            # non-missing reading), then fill remaining gaps with 0
            start_date = min(ts.ffill().dropna().index)
            end_date = max(ts.bfill().dropna().index)
            active_range = (ts.index >= start_date) & (ts.index <= end_date)
            ts = ts[active_range].fillna(0.)

            tmp = pd.DataFrame({'power_usage': ts})
            date = tmp.index

            tmp['hours_from_start'] = (date - earliest_time).seconds / 60 / 60 + (date - earliest_time).days * 24
            tmp['hours_from_start'] = tmp['hours_from_start'].astype('int')
        
            tmp['days_from_start'] = (date - earliest_time).days
            tmp['date'] = date
            tmp['consumer_id'] = label
            tmp['hour'] = date.hour
            tmp['day'] = date.day
            tmp['day_of_week'] = date.dayofweek
            tmp['month'] = date.month

            #stack all time series vertically
            df_list.append(tmp)

        time_df = pd.concat(df_list).reset_index(drop=True)

        # target lead and autoregressive lags; shifting introduces NaNs at
        # group edges, which dropna() removes
        time_df['Lead_1'] = self.lead(time_df)
        time_df['lag_1'] = self.lag(time_df, lag=1)
        time_df['lag_5'] = self.lag(time_df, lag=5)
        time_df = time_df.dropna()
        return time_df, earliest_time
    
    def data_split(self,df:pd.DataFrame):
        """
        Split the data into training and test datasets.

        Parameters:
            df (pandas.DataFrame): The input data.

        Returns:
            train_dataset (pandas.DataFrame): The training dataset.
            test_dataset (pandas.DataFrame): The test dataset.
            training (TimeSeriesDataSet): The training dataset for modeling.
            validation (TimeSeriesDataSet): The validation dataset for modeling.
        """
        ## Train dataset >> train + validation
        train_dataset = df.loc[df['date'] < df.date.unique()[-self.test_dataset_size:][0]]

        ## Test dataset: the last `test_dataset_size` unique dates
        test_dataset = df.loc[df['date'] >= df.date.unique()[-self.test_dataset_size:][0]]

        # training stops max_prediction_length steps before the series end
        training_cutoff = train_dataset["hours_from_start"].max() - self.max_prediction_length
        print('training cutoff ::', training_cutoff)
        training = TimeSeriesDataSet(
            train_dataset[lambda x: x.hours_from_start <= training_cutoff],
            time_idx="hours_from_start",
            target="Lead_1",
            group_ids=["consumer_id"],
            min_encoder_length=self.max_encoder_length // 2, 
            max_encoder_length=self.max_encoder_length,
            min_prediction_length=1,
            max_prediction_length=self.max_prediction_length,
            static_categoricals=["consumer_id"],
            time_varying_known_reals=['power_usage',"hours_from_start","day","day_of_week", 
                                      "month", 'hour','lag_1','lag_5'],
            time_varying_unknown_reals=['Lead_1'],
            target_normalizer=GroupNormalizer(
                # normalize the target per consumer; softplus is applied as
                # the inverse transformation, keeping outputs positive
                groups=["consumer_id"], transformation="softplus"
            ),
            # relative time index: ranges from -encoder_length to
            # prediction_length within each sampled sequence
            add_relative_time_idx=True,
            # add the center and scale of the unnormalized target as
            # static real features
            add_target_scales=True,
            # add encoder length as a static real variable (useful when
            # min_encoder_length != max_encoder_length)
            add_encoder_length=True,
            # lags={"power_usage": [12, 24]}
        )


        validation = TimeSeriesDataSet.from_dataset(training, train_dataset, predict=True, stop_randomization=True)

        # Dataloaders are not returned; callers can build them from the
        # returned datasets, e.g.
        #   training.to_dataloader(train=True, batch_size=32, num_workers=0)
        #   validation.to_dataloader(train=False, batch_size=320, num_workers=0)
        return train_dataset, test_dataset, training, validation
    
    def tft_data(self):
        """
        Prepare the data for training with the Temporal Fusion Transformer (TFT) model.

        Returns:
            train_dataset (pandas.DataFrame): The training dataset.
            test_dataset (pandas.DataFrame): The test dataset.
            training (TimeSeriesDataSet): The training dataset for modeling.
            validation (TimeSeriesDataSet): The validation dataset for modeling.
            earliest_time (pandas.Timestamp): The earliest timestamp in the data.
        """
        df=self.load_data()
        df=self.data_transformation(df)
        df=self.select_chunk(df)
        df,earliest_time=self.time_features(df)
        train_dataset,test_dataset,training,validation =self.data_split(df)
        return train_dataset,test_dataset,training,validation,earliest_time
    
    def fb_data(self):
        """
        Prepare the data for training with the Facebook Prophet model.

        Returns:
            train_data (pandas.DataFrame): The training dataset.
            test_data (pandas.DataFrame): The test dataset.
            consumer_dummy (pandas.Index): The one-hot consumer ID column names.
        """
        df=self.load_data()
        df=self.data_transformation(df)
        df=self.select_chunk(df)
        df,earliest_time=self.time_features(df)
        consumer_dummy = pd.get_dummies(df['consumer_id'])
        ## add the one-hot encoded columns into the main frame
        df[consumer_dummy.columns] = consumer_dummy
        updated_df = df.drop(['consumer_id', 'hours_from_start', 'days_from_start'], axis=1)
        # Prophet expects the time column as `ds` and the target as `y`
        updated_df = updated_df.rename({'date': 'ds', "Lead_1": 'y'}, axis=1)

        ## Train dataset >> train + validation
        train_data = updated_df.loc[updated_df['ds'] < updated_df.ds.unique()[-self.test_dataset_size:][0]]

        ## Test dataset
        test_data = updated_df.loc[updated_df['ds'] >= updated_df.ds.unique()[-self.test_dataset_size:][0]]

        return train_data, test_data, consumer_dummy.columns
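

# A minimal usage sketch for the Prophet path: a hypothetical helper, assuming
# an LD2011_2014-style semicolon-separated file; the default path and printed
# fields are illustrative only.
def _demo_energy_prophet(path: str = 'LD2011_2014.txt'):
    loader = Energy_DataLoader(path)
    fb_train, fb_test, consumer_cols = loader.fb_data()
    print(fb_train.shape, fb_test.shape, list(consumer_cols)[:3])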
    


#-------------------------------------------------------------------------------------    
class StoreDataLoader:
    """Loads and prepares daily store/item sales data for TFT and Prophet."""

    def __init__(self, path):
        self.path = path

    def load_data(self):
        try:
            data = pd.read_csv(self.path)
            data['date'] = pd.to_datetime(data['date'])
            # keep store 1 and items 1-10 only
            items = list(range(1, 11))
            data = data.loc[(data['store'] == 1) & (data['item'].isin(items))]
            print('Loaded the data successfully.')
            return data
        except (FileNotFoundError, pd.errors.ParserError) as err:
            print(f"Failed to load the data: {err}")

    def create_week_date_features(self, df, date_column):
        """Add calendar features derived from `date_column` to `df`."""
        dates = pd.to_datetime(df[date_column])

        df['Month'] = dates.dt.month
        df['Day'] = dates.dt.day
        df['Dayofweek'] = dates.dt.dayofweek
        df['DayOfyear'] = dates.dt.dayofyear
        # Series.dt.week is deprecated; isocalendar().week is its replacement
        df['Week'] = dates.dt.isocalendar().week.astype(int)
        df['Quarter'] = dates.dt.quarter

        # boundary flags: 1 when the date is the boundary day, else 0
        df['Is_month_start'] = np.where(dates.dt.is_month_start, 1, 0)
        df['Is_month_end'] = np.where(dates.dt.is_month_end, 1, 0)
        df['Is_quarter_start'] = np.where(dates.dt.is_quarter_start, 1, 0)
        df['Is_quarter_end'] = np.where(dates.dt.is_quarter_end, 1, 0)
        df['Is_year_start'] = np.where(dates.dt.is_year_start, 1, 0)
        df['Is_year_end'] = np.where(dates.dt.is_year_end, 1, 0)

        # semester follows the quarter; weekend/weekday follow the day of week
        df['Semester'] = np.where(df['Quarter'].isin([1, 2]), 1, 2)
        df['Is_weekend'] = np.where(df['Dayofweek'].isin([5, 6]), 1, 0)
        df['Is_weekday'] = np.where(df['Dayofweek'].isin([0, 1, 2, 3, 4]), 1, 0)

        df['Days_in_month'] = dates.dt.days_in_month

        return df

    def lead(self, df, lead=-1):
        # next-step sales (shift -1) within each (store, item) group
        d_lead = df.groupby(['store', 'item'])['sales'].shift(lead)
        return d_lead

    def lag(self, df, lag=1):
        # previous-step sales within each (store, item) group
        d_lag = df.groupby(['store', 'item'])['sales'].shift(lag)
        return d_lag
    
    def time_features(self, df):
        earliest_time = df['date'].min()
        print(earliest_time)

        df['hours_from_start'] = ((df['date'] - earliest_time).dt.seconds / 60 / 60
                                  + (df['date'] - earliest_time).dt.days * 24)
        df['hours_from_start'] = df['hours_from_start'].astype('int')
        df['days_from_start'] = (df['date'] - earliest_time).dt.days

        df = self.create_week_date_features(df, 'date')

        # cast group ids to strings so they are treated as categoricals;
        # the target must be float
        df['store'] = df['store'].astype('str')
        df['item'] = df['item'].astype('str')
        df['sales'] = df['sales'].astype('float')

        # derived demand features (the small epsilon guards log(0))
        df["log_sales"] = np.log(df.sales + 1e-8)
        df["avg_demand_by_store"] = df.groupby(["days_from_start", "store"], observed=True).sales.transform("mean")
        df["avg_demand_by_item"] = df.groupby(["days_from_start", "item"], observed=True).sales.transform("mean")

        # target lead and autoregressive lags; shifting introduces NaNs at
        # group edges, which dropna() removes
        df['Lead_1'] = self.lead(df)
        df['lag_1'] = self.lag(df, lag=1)
        df['lag_5'] = self.lag(df, lag=5)
        df = df.dropna()
        return df, earliest_time
    
    def split_data(self, df, test_dataset_size=30, max_prediction_length=30, max_encoder_length=120):
        ## Train dataset >> train + validation
        train_dataset = df.loc[df['date'] < df.date.unique()[-test_dataset_size:][0]]

        ## Test dataset: the last `test_dataset_size` unique dates
        test_dataset = df.loc[df['date'] >= df.date.unique()[-test_dataset_size:][0]]

        # training stops max_prediction_length steps before the series end
        training_cutoff = train_dataset["days_from_start"].max() - max_prediction_length
        print("Training cutoff point ::", training_cutoff)

        training = TimeSeriesDataSet(
            train_dataset[lambda x: x.days_from_start <= training_cutoff],
            time_idx="days_from_start",
            target="Lead_1", ## target use as lead
            group_ids=['store','item'],
            min_encoder_length=max_encoder_length // 2,
            max_encoder_length=max_encoder_length,
            min_prediction_length=1,
            max_prediction_length=max_prediction_length,
            static_categoricals=["store",'item'],
            static_reals=[],
            time_varying_known_categoricals=[],

            time_varying_known_reals=["days_from_start", "Day", "Month", "Dayofweek", "DayOfyear",
                                      "Days_in_month", 'Week', 'Quarter',
                                      'Is_month_start', 'Is_month_end', 'Is_quarter_start', 'Is_quarter_end',
                                      'Is_year_start', 'Is_year_end', 'Semester', 'Is_weekend', 'Is_weekday',
                                      'lag_1', 'lag_5', 'sales'],

            time_varying_unknown_reals=['Lead_1','log_sales','avg_demand_by_store','avg_demand_by_item'],

            target_normalizer=GroupNormalizer(
                groups=["store","item"], transformation="softplus"
            ),  # we normalize by group
            add_relative_time_idx=True,
            add_target_scales=True,
            add_encoder_length=True,
            allow_missing_timesteps=True,
        )


        validation = TimeSeriesDataSet.from_dataset(training, train_dataset, predict=True, stop_randomization=True)

        # Dataloaders are not returned; callers can build them from the
        # returned datasets via training.to_dataloader(...) /
        # validation.to_dataloader(...)
        return train_dataset, test_dataset, training, validation
    
    def tft_data(self):
        df = self.load_data()
        df, earliest_time = self.time_features(df)
        train_dataset, test_dataset, training, validation = self.split_data(df)
        return train_dataset, test_dataset, training, validation, earliest_time
    
    def fb_data(self, test_dataset_size=30):
        df = self.load_data()
        df, earliest_time = self.time_features(df)

        # one-hot encode the group identifiers as Prophet regressors
        store_dummy = pd.get_dummies(df['store'], prefix='store')
        item_dummy = pd.get_dummies(df['item'], prefix='item')
        df_encode = pd.concat([store_dummy, item_dummy], axis=1)
        ## add encoded columns into the main frame
        df[df_encode.columns] = df_encode

        df = df.drop(['store', 'item', 'log_sales', 'avg_demand_by_store', 'avg_demand_by_item'], axis=1)
        # Prophet expects the time column as `ds` and the target as `y`
        df = df.rename({'date': 'ds', "Lead_1": 'y'}, axis=1)

        # the split date is hardcoded; `test_dataset_size` is kept for
        # interface symmetry but unused by this split
        fb_train_data = df.loc[df['ds'] <= '2017-11-30']
        fb_test_data = df.loc[df['ds'] > '2017-11-30']

        return fb_train_data, fb_test_data, item_dummy, store_dummy
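

# A minimal usage sketch for the TFT path: a hypothetical helper, assuming a
# CSV with date/store/item/sales columns (e.g. a Kaggle-style store-item
# demand file); the default path is illustrative only.
def _demo_store_tft(path: str = 'train.csv'):
    loader = StoreDataLoader(path)
    train_df, test_df, training, validation, earliest_time = loader.tft_data()
    print(train_df.shape, test_df.shape, earliest_time)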
            
    
if __name__ == '__main__':
    obj = Energy_DataLoader(r'D:\Ai Practices\Transformer Based Forecasting\stremlit app\LD2011_2014.txt')
    obj.load_data()
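
    # Hedged follow-up (uncomment to run; assumes the path above exists):
    # train_df, test_df, training, validation, earliest_time = obj.tft_data()
    # fb_train, fb_test, consumer_cols = obj.fb_data()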