|
"""Train and compile the model.""" |
|
|
|
import shutil |
|
import numpy |
|
import pandas |
|
import pickle |
|
|
|
from sklearn.model_selection import train_test_split |
|
from sklearn.metrics import accuracy_score |
|
from imblearn.over_sampling import SMOTE |
|
|
|
from settings import DEPLOYMENT_PATH, RANDOM_STATE, DATA_PATH, INPUT_SLICES, PRE_PROCESSOR_USER_PATH, PRE_PROCESSOR_THIRD_PARTY_PATH |
|
from utils.client_server_interface import MultiInputsFHEModelDev |
|
from utils.model import MultiInputXGBClassifier |
|
from utils.pre_processing import get_pre_processors, select_and_pop_features |
|
|
|
|
|
def get_processed_multi_inputs(data):
    """Split a pre-processed 2D array into one column block per party.

    Args:
        data: Array supporting ``data[:, columns]`` indexing, whose columns
            are selected through the entries of ``INPUT_SLICES``.

    Returns:
        A 3-tuple ``(user, bank, third_party)`` holding the columns selected
        by ``INPUT_SLICES`` for each party, in that order.
    """
    parties = ("user", "bank", "third_party")
    return tuple(data[:, INPUT_SLICES[party]] for party in parties)
|
|
|
print("Load and pre-process the data")

# Load the data set from the configured CSV file.
data = pandas.read_csv(DATA_PATH, encoding="utf-8")

# Separate the labels (the "Target" column) from the input features.
data_y = data.pop("Target").copy()
data_x = data.copy()

# Distribute the features between the three parties. The helper presumably
# removes the selected columns from `data_x` in place (as its name suggests),
# so that whatever is left belongs to the user - verify in utils.pre_processing.
data_third_party = select_and_pop_features(data_x, ["Years_employed", "Unemployed"])
data_bank = select_and_pop_features(data_x, ["Account_length"])
data_user = data_x.copy()

# Fit the user and third-party pre-processors. The bank's feature is used as
# is, without any fitted transformation.
pre_processor_user, pre_processor_third_party = get_pre_processors()

preprocessed_data_user = pre_processor_user.fit_transform(data_user)
preprocessed_data_bank = data_bank.to_numpy()
preprocessed_data_third_party = pre_processor_third_party.fit_transform(data_third_party)

# Column order is user, bank, third party. `get_processed_multi_inputs` later
# slices this array back per party through INPUT_SLICES, which presumably
# relies on this exact order - keep both in sync.
preprocessed_data_x = numpy.concatenate(
    (preprocessed_data_user, preprocessed_data_bank, preprocessed_data_third_party),
    axis=1,
)

# Over-sample with SMOTE. The sampler is seeded with RANDOM_STATE, like the
# split below, so that the generated samples (and therefore the trained model
# and the reported scores) are reproducible from one run to the next.
# NOTE(review): resampling happens before the train/test split, so synthetic
# samples interpolated from rows that end up in the test set can land in the
# training set; the accuracies printed later may be optimistic. Confirm this
# is acceptable for this use case.
x, y = SMOTE(random_state=RANDOM_STATE).fit_resample(preprocessed_data_x, data_y)

# Stratified 70/30 split between training and testing data.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.3, random_state=RANDOM_STATE
)
|
|
|
|
|
print("\nTrain and compile the model")

# Build the multi-input classifier and fit it. `fit_benchmark` returns a pair:
# the model that gets compiled below and a second model, kept as
# `sklearn_model`, that is only used for the accuracy comparison.
model, sklearn_model = MultiInputXGBClassifier(
    n_estimators=40,
    max_depth=3,
).fit_benchmark(X_train, y_train)

# Compile on the training data split per party, with every one of the three
# inputs marked as encrypted.
multi_inputs_train = get_processed_multi_inputs(X_train)
model.compile(*multi_inputs_train, inputs_encryption_status=["encrypted"] * 3)

# Remove any existing deployment directory, together with its content.
if DEPLOYMENT_PATH.is_dir():
    shutil.rmtree(DEPLOYMENT_PATH)
|
|
|
|
|
print("\nEvaluate the models")

# Score the benchmark model on the test set, as a percentage.
y_pred_sklearn = sklearn_model.predict(X_test)
sklearn_accuracy = accuracy_score(y_test, y_pred_sklearn) * 100

print(f"Sklearn accuracy score : {sklearn_accuracy:.2f}%")

# Score the compiled model on the same test set, split per party and run with
# `simulate=True`.
multi_inputs_test = get_processed_multi_inputs(X_test)
y_pred_simulated = model.predict_multi_inputs(*multi_inputs_test, simulate=True)
simulated_accuracy = accuracy_score(y_test, y_pred_simulated) * 100

print(f"Concrete ML accuracy score (simulated) : {simulated_accuracy:.2f}%")
|
|
|
|
|
print("\nSave deployment files")

# Write the compiled model's deployment files to DEPLOYMENT_PATH.
fhe_dev = MultiInputsFHEModelDev(DEPLOYMENT_PATH, model)
fhe_dev.save(via_mlir=True)

# Pickle the user pre-processor first, then the third-party one, each to its
# own configured path.
pre_processors_to_save = (
    (PRE_PROCESSOR_USER_PATH, pre_processor_user),
    (PRE_PROCESSOR_THIRD_PARTY_PATH, pre_processor_third_party),
)
for pre_processor_path, pre_processor in pre_processors_to_save:
    with pre_processor_path.open("wb") as file:
        pickle.dump(pre_processor, file)

print("\nDone !")
|
|