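"""Tebakaja Model Trainer: trains a recurrent model (LSTM, GRU, or a hybrid of
both) on the 'Close' column of every CSV under ./datasets, then saves the
trained model, the fitted scalers, and the post-training series for later use."""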
import os
import json
import joblib
import argparse
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from warnings import filterwarnings
filterwarnings('ignore')


class DataProcessor:
    """Discovers the CSV datasets and provides preprocessing helpers."""

    def __init__(self, datasets_path):
        self.datasets_path = datasets_path
        self.datasets = self._get_datasets()

    def _get_datasets(self):
        # List the CSV files in the datasets directory, sorted by name so the
        # training order is deterministic.
        return sorted([
            item for item in os.listdir(self.datasets_path)
            if os.path.isfile(os.path.join(self.datasets_path, item)) and item.endswith('.csv')
        ])

    @staticmethod
    def create_sequences(df, sequence_length):
        # Slide a window of `sequence_length` rows over the frame: each window
        # becomes one input sequence, and the first column of the row that
        # follows it becomes the label.
        labels, sequences = [], []
        for i in range(len(df) - sequence_length):
            seq = df.iloc[i:i + sequence_length].values
            label = df.iloc[i + sequence_length].values[0]
            sequences.append(seq)
            labels.append(label)
        return np.array(sequences), np.array(labels)
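
    # For a single-column frame with N rows and window length L, this yields
    # sequences of shape (N - L, L, 1) and labels of shape (N - L,); e.g. 100
    # rows with L = 60 produce 40 windows.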

    @staticmethod
    def preprocess_data(dataframe):
        # Impute missing values: column mode for categorical columns, column
        # mean for numeric ones.
        for col in dataframe.columns:
            if dataframe[col].isnull().any():
                if dataframe[col].dtype == 'object':
                    dataframe[col] = dataframe[col].fillna(dataframe[col].mode()[0])
                else:
                    dataframe[col] = dataframe[col].fillna(dataframe[col].mean())
        return dataframe

    @staticmethod
    def scale_data(dataframe, scaler_cls):
        # Fit the given scaler on the 'Close' column and return the fitted
        # scaler with the frame so predictions can be inverse-transformed.
        scaler = scaler_cls()
        dataframe['Close'] = scaler.fit_transform(dataframe[['Close']])
        return scaler, dataframe


class ModelBuilder:
    """Factory methods for the recurrent network architectures."""

    @staticmethod
    def gru_model(input_shape):
        """GRU (Gated Recurrent Units) model."""
        model = Sequential([
            GRU(50, return_sequences=True, input_shape=input_shape),
            Dropout(0.2),

            GRU(50, return_sequences=True),
            Dropout(0.2),

            GRU(50, return_sequences=True),
            Dropout(0.2),

            GRU(50, return_sequences=False),
            Dropout(0.2),

            Dense(units=1)
        ])
        model.compile(optimizer='nadam', loss='mean_squared_error')
        return model

    @staticmethod
    def lstm_model(input_shape):
        """LSTM (Long Short-Term Memory) model."""
        model = Sequential([
            LSTM(50, return_sequences=True, input_shape=input_shape),
            Dropout(0.2),

            LSTM(50, return_sequences=True),
            Dropout(0.2),

            LSTM(50, return_sequences=True),
            Dropout(0.2),

            LSTM(50, return_sequences=False),
            Dropout(0.2),

            Dense(units=1)
        ])
        model.compile(optimizer='nadam', loss='mean_squared_error')
        return model

    @staticmethod
    def lstm_gru_model(input_shape):
        """Hybrid model interleaving LSTM (Long Short-Term Memory) and
        GRU (Gated Recurrent Units) layers."""
        model = Sequential([
            LSTM(50, return_sequences=True, input_shape=input_shape),
            Dropout(0.2),

            GRU(50, return_sequences=True),
            Dropout(0.2),

            LSTM(50, return_sequences=True),
            Dropout(0.2),

            GRU(50, return_sequences=False),
            Dropout(0.2),

            Dense(units=1)
        ])
        model.compile(optimizer='nadam', loss='mean_squared_error')
        return model
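

# Each builder expects input_shape == (sequence_length, n_features); in this
# pipeline n_features is 1, since only the 'Close' column is fed to the model.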


class Trainer:
    """Fits a model, checkpointing the best weights to `model_file`."""

    def __init__(self, model, model_file, sequence_length, epochs, batch_size):
        self.model = model
        self.model_file = model_file
        self.sequence_length = sequence_length
        self.epochs = epochs
        self.batch_size = batch_size

    def train(self, X_train, y_train, X_test, y_test):
        # Stop when validation loss has not improved for 5 consecutive epochs.
        early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

        # Keep only the best model (lowest validation loss) on disk.
        model_checkpoint = ModelCheckpoint(
            filepath=self.model_file,
            save_best_only=True,
            monitor='val_loss',
            mode='min'
        )

        history = self.model.fit(
            X_train, y_train,
            epochs=self.epochs,
            batch_size=self.batch_size,
            validation_data=(X_test, y_test),
            callbacks=[early_stopping, model_checkpoint]
        )

        return history


class PostProcessor:
    @staticmethod
    def inverse_transform(scaler, data):
        return scaler.inverse_transform(data)

    @staticmethod
    def save_json(filename, data):
        with open(filename, 'w') as f:
            json.dump(data, f)


def main(algorithm: str, sequence_length: int, epochs: int, batch_size: int):
    datasets_path = './datasets'
    models_path = './models'
    posttrained = './posttrained'
    pickle_file = './pickles'

    # Make sure the output directories exist before training starts.
    for path in (models_path, posttrained, pickle_file):
        os.makedirs(path, exist_ok=True)

    data_processor = DataProcessor(datasets_path)

    for dataset in data_processor.datasets:
        print(f"[TRAINING] {dataset.replace('.csv', '')}")

        dataframe = pd.read_csv(os.path.join(datasets_path, dataset), index_col='Date')[['Close']]
        model_file = os.path.join(models_path, f"{dataset.replace('.csv', '')}.keras")

        dataframe.dropna(inplace=True)

        # 'Close' is standardized and then min-max scaled; both fitted scalers
        # are saved below, so an inverse transform must undo them in reverse
        # order (MinMaxScaler first, then StandardScaler).
        standard_scaler, dataframe = data_processor.scale_data(dataframe, StandardScaler)
        minmax_scaler, dataframe = data_processor.scale_data(dataframe, MinMaxScaler)

        sequences, labels = data_processor.create_sequences(dataframe, sequence_length)
        input_shape = (sequences.shape[1], sequences.shape[2])

        if algorithm == "GRU":
            model = ModelBuilder.gru_model(input_shape)
        elif algorithm == "LSTM":
            model = ModelBuilder.lstm_model(input_shape)
        elif algorithm == "LSTM_GRU":
            model = ModelBuilder.lstm_gru_model(input_shape)
        else:
            # Fall back to LSTM for unrecognized algorithm names.
            model = ModelBuilder.lstm_model(input_shape)

        # Chronological 80/20 split; shuffling would leak future prices into
        # the training set.
        train_size = int(len(sequences) * 0.8)
        X_train, X_test = sequences[:train_size], sequences[train_size:]
        y_train, y_test = labels[:train_size], labels[train_size:]

        trainer = Trainer(model, model_file, sequence_length, epochs, batch_size)
        trainer.train(X_train, y_train, X_test, y_test)

        dataframe_json = {'Date': dataframe.index.tolist(), 'Close': dataframe['Close'].tolist()}

        PostProcessor.save_json(
            os.path.join(posttrained, f'{dataset.replace(".csv", "")}-posttrained.json'),
            dataframe_json
        )

        joblib.dump(minmax_scaler, os.path.join(pickle_file, f'{dataset.replace(".csv", "")}_minmax_scaler.pickle'))
        joblib.dump(standard_scaler, os.path.join(pickle_file, f'{dataset.replace(".csv", "")}_standard_scaler.pickle'))

        # Reload the best checkpointed weights before the final save, since
        # the last training epoch is not necessarily the best one.
        model.load_weights(model_file)
        model.save(model_file)

        print("\n\n")
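
# A minimal inference sketch (hypothetical dataset name 'BTC-USD'; adjust to
# your files): load the saved artifacts, predict, then undo the two scalers
# in reverse order.
#
#   model = tensorflow.keras.models.load_model('./models/BTC-USD.keras')
#   minmax_scaler = joblib.load('./pickles/BTC-USD_minmax_scaler.pickle')
#   standard_scaler = joblib.load('./pickles/BTC-USD_standard_scaler.pickle')
#   prediction = model.predict(last_window)                     # scaled space
#   prediction = minmax_scaler.inverse_transform(prediction)    # undo MinMax
#   prediction = standard_scaler.inverse_transform(prediction)  # undo Standard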


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Tebakaja Model Trainer")

    parser.add_argument('-a', '--algorithm', type=str, required=True,
                        help='algorithm to train (LSTM, GRU, or LSTM_GRU)')

    parser.add_argument('-e', '--epochs', type=int, required=True, help='number of epochs')
    parser.add_argument('-b', '--batchs', type=int, required=True, help='batch size')
    parser.add_argument('-s', '--sequences', type=int, required=True, help='sequence length')

    args = parser.parse_args()

    main(
        epochs=args.epochs,
        batch_size=args.batchs,
        algorithm=args.algorithm,
        sequence_length=args.sequences
    )
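
# Example invocation (script filename assumed; adjust to the actual file):
#   python train.py --algorithm LSTM --epochs 100 --batchs 32 --sequences 60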