import time
from typing import Text

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    cohen_kappa_score,
    roc_curve,
    classification_report,
)
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from configs.constants import SUPPORT_MODEL, DEFAULT_MODEL
from ml.data_prepare import data_preparing, create_dataset


def plot_roc_cur(fper, tper):
    """
    Plot the ROC curve.

    :param fper: false positive rates
    :param tper: true positive rates
    """
    plt.plot(fper, tper, color="orange", label="ROC")
    plt.plot([0, 1], [0, 1], color="darkblue", linestyle="--")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC) Curve")
    plt.legend()
    plt.show()


class MLModel:
    """
    WC predictor model.
    """

    def __init__(self, model_type: Text):
        assert (
            model_type in SUPPORT_MODEL
        ), "Unsupported model type. Please choose one of {}".format(SUPPORT_MODEL)
        self.model_type = model_type
        if self.model_type == "LogisticRegression":
            self.model = self.get_logistic_regression_model()
        elif self.model_type == "DecisionTreeClassifier":
            self.model = self.get_decision_tree_model()
        elif self.model_type == "MLPClassifier":
            self.model = self.get_neural_network_model()
        elif self.model_type == "RandomForestClassifier":
            self.model = self.get_random_forest_model()
        elif self.model_type == "GradientBoostingClassifier":
            self.model = self.get_gradient_boosting_model()
        elif self.model_type == "LGBMClassifier":
            self.model = self.get_light_gbm_model()
        elif self.model_type == "XGBClassifier":
            self.model = self.get_xgboost_model()

    def predict_proba(self, x):
        """
        Call predict_proba on the estimator with the best found parameters.

        :param x: feature matrix to score
        :return: class probabilities
        """
        return self.model.predict_proba(x)
    @staticmethod
    def __run_model(model, x_train, y_train, x_test, y_test, verbose=True):
        t0 = time.time()
        if verbose is False:
            # Some estimators (e.g. LightGBM) accept a ``verbose`` fit parameter.
            model.fit(x_train.values, np.ravel(y_train), verbose=0)
        else:
            model.fit(x_train.values, np.ravel(y_train))
        model = model.best_estimator_
        y_pred = model.predict(x_test.values)
        accuracy = accuracy_score(y_test.values, y_pred)
        roc_auc = roc_auc_score(y_test, model.predict_proba(x_test.values)[:, 1])
        coh_kap = cohen_kappa_score(y_test, y_pred)
        time_taken = time.time() - t0
        print("Accuracy : {}".format(accuracy))
        print("ROC Area under Curve : {}".format(roc_auc))
        print("Cohen's Kappa : {}".format(coh_kap))
        print("Time taken : {}".format(time_taken))
        print(classification_report(y_test, y_pred, digits=5))
        return model, accuracy, roc_auc, coh_kap, time_taken

    @staticmethod
    def get_logistic_regression_model(**params_lr):
        """
        Return a logistic regression model.

        :return: GridSearchCV wrapping a LogisticRegression
        """
        if not all(params_lr.values()):
            params_lr = {
                "C": np.logspace(-3, 3, 7),
                "penalty": ["l1", "l2"],
                # GridSearchCV expects a list of candidate values, not a bare string.
                "solver": ["liblinear"],
            }
        model_lr = LogisticRegression()
        model_lr = GridSearchCV(
            model_lr, params_lr, cv=3, verbose=0, scoring="roc_auc", refit=True
        )
        return model_lr

    @staticmethod
    def get_decision_tree_model(**params):
        """
        Return a decision tree model.

        :return: GridSearchCV wrapping a DecisionTreeClassifier
        """
        if not all(params.values()):
            params = {
                # "auto" was removed for trees in scikit-learn 1.3.
                "max_features": ["sqrt", "log2"],
                "ccp_alpha": [0.1, 0.01, 0.001],
                "max_depth": [5, 6, 7, 8, 9],
                "criterion": ["gini", "entropy"],
            }
        model = DecisionTreeClassifier()
        model = GridSearchCV(
            estimator=model,
            param_grid=params,
            cv=3,
            verbose=0,
            scoring="roc_auc",
            refit=True,
        )
        return model

    @staticmethod
    def get_neural_network_model(**params_nn):
        """
        Return a neural network model.

        :return: GridSearchCV wrapping an MLPClassifier
        """
        if not all(params_nn.values()):
            params_nn = {
                "solver": ["lbfgs"],
                "max_iter": list(range(1000, 2100, 100)),
                "alpha": 10.0 ** -np.arange(1, 10),
                "hidden_layer_sizes": np.arange(10, 15),
                "random_state": list(range(10)),
            }
        model_nn = MLPClassifier()
        model_nn = GridSearchCV(
            model_nn, params_nn, n_jobs=-1, scoring="roc_auc", refit=True, verbose=0
        )
        return model_nn

    @staticmethod
    def get_random_forest_model(**params_rf):
        """
        Return a random forest model.

        :return: GridSearchCV wrapping a RandomForestClassifier
        """
        if not all(params_rf.values()):
            params_rf = {
                "max_depth": [20],
                "min_samples_split": [10],
                "max_leaf_nodes": [175],
                "min_samples_leaf": [5],
                "n_estimators": [250],
                "max_features": ["sqrt"],
            }
        model_rf = RandomForestClassifier()
        model_rf = GridSearchCV(
            model_rf,
            params_rf,
            cv=3,
            n_jobs=-1,
            verbose=0,
            scoring="roc_auc",
            refit=True,
        )
        return model_rf

    @staticmethod
    def get_light_gbm_model(**params_lgb):
        """
        Return a LightGBM model.

        :return: GridSearchCV wrapping an LGBMClassifier
        """
        if not all(params_lgb.values()):
            params_lgb = {
                "learning_rate": [0.005, 0.01],
                "n_estimators": [8, 16, 24],
                # A large num_leaves improves accuracy but may lead to over-fitting.
                "num_leaves": [6, 8, 12, 16],
                # For better accuracy, try "dart".
                "boosting_type": ["gbdt", "dart"],
                "objective": ["binary"],
                # A large max_bin improves accuracy but may slow down training.
                "max_bin": [255, 510],
                "random_state": [500],
                "colsample_bytree": [0.64, 0.65, 0.66],
                "subsample": [0.7, 0.75],
                "reg_alpha": [1, 1.2],
                "reg_lambda": [1, 1.2, 1.4],
            }
        model = lgb.LGBMClassifier()
        model = GridSearchCV(
            model,
            params_lgb,
            verbose=0,
            cv=3,
            n_jobs=-1,
            scoring="roc_auc",
            refit=True,
        )
        return model
    @staticmethod
    def get_xgboost_model(**params_xgb):
        """
        Return an XGBoost model.

        :return: GridSearchCV wrapping an XGBClassifier
        """
        if not all(params_xgb.values()):
            params_xgb = {
                # With hyper-threading, more threads may make XGBoost slower.
                "n_jobs": [4],
                "objective": ["binary:logistic"],
                "learning_rate": [0.05],  # the so-called `eta` value
                "max_depth": [6],
                "min_child_weight": [11],
                # `silent` was removed from XGBoost; `verbosity` replaces it.
                "verbosity": [0],
                "subsample": [0.8],
                "colsample_bytree": [0.7],
                # Number of trees; raise to 1000 for better results.
                "n_estimators": [100],
                "missing": [-999],
                "random_state": [1337],
            }
        model = GridSearchCV(
            xgb.XGBClassifier(),
            params_xgb,
            n_jobs=-1,
            cv=3,
            scoring="roc_auc",
            refit=True,
        )
        return model

    def fit_and_eval_model(self, x_train, x_test, y_train, y_test):
        """
        Fit the model on the training set and evaluate it on the test set.

        :param x_train: training features
        :param x_test: test features
        :param y_train: training labels
        :param y_test: test labels
        :return: fitted estimator, accuracy, ROC AUC, Cohen's kappa, time taken
        """
        model, accuracy, roc_auc, coh_kap, time_taken = self.__run_model(
            self.model, x_train, y_train, x_test, y_test
        )
        return model, accuracy, roc_auc, coh_kap, time_taken

    @staticmethod
    def get_gradient_boosting_model(**params):
        """
        Return a gradient boosting model.

        :param params: optional parameter grid overriding the default
        :return: GridSearchCV wrapping a GradientBoostingClassifier
        """
        if not all(params.values()):
            params = {
                "learning_rate": [0.01, 0.02, 0.03],
                "min_samples_split": [5, 10],
                "min_samples_leaf": [3, 5],
                "max_depth": [3, 5, 10],
                "max_features": ["sqrt"],
                "n_estimators": [100, 200],
            }
        model = GradientBoostingClassifier(random_state=100)
        return GridSearchCV(
            model, params, cv=3, n_jobs=-1, scoring="roc_auc", refit=True
        )


if __name__ == "__main__":
    base_df, data_df = data_preparing()
    x_train, x_test, y_train, y_test = create_dataset(data_df)
    ml_model = MLModel(DEFAULT_MODEL)
    ml_model.fit_and_eval_model(x_train, x_test, y_train, y_test)
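
    # Optional follow-up, as a sketch: plot the ROC curve for the fitted model
    # using the plot_roc_cur helper above. This assumes the DEFAULT_MODEL chosen
    # exposes predict_proba (all classifiers wired up here do, and GridSearchCV
    # forwards it to the refitted best estimator) and that y_test holds binary
    # labels, as the rest of the module already expects.
    fper, tper, _ = roc_curve(
        np.ravel(y_test), ml_model.predict_proba(x_test.values)[:, 1]
    )
    plot_roc_cur(fper, tper)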