# encrypted_credit_scoring / development.py
# romanbredehoft-zama's picture
# Enable cross-platform deployment
# ec21179
# raw
# history blame
# 3.15 kB
"""Train and compile the model."""
import shutil
import numpy
import pandas
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from settings import DEPLOYMENT_PATH, RANDOM_STATE, DATA_PATH, INPUT_SLICES, PRE_PROCESSOR_USER_PATH, PRE_PROCESSOR_THIRD_PARTY_PATH
from utils.client_server_interface import MultiInputsFHEModelDev
from utils.model import MultiInputXGBClassifier
from utils.pre_processing import get_pre_processors, select_and_pop_features
def get_processed_multi_inputs(data):
    """Split a pre-processed array into one column block per party.

    The array is indexed as ``data[:, INPUT_SLICES[party]]`` for each party,
    so it must support two-dimensional indexing.

    Args:
        data: The pre-processed input values, all parties side by side.

    Returns:
        A 3-tuple ``(user, bank, third_party)`` holding the columns selected
        by the matching ``INPUT_SLICES`` entries, in that order.
    """
    parties = ("user", "bank", "third_party")
    return tuple(data[:, INPUT_SLICES[party]] for party in parties)
# ---------------------------------------------------------------------------
# Load the data-set and split its columns between the three parties
# ---------------------------------------------------------------------------
print("Load and pre-process the data")

data = pandas.read_csv(DATA_PATH, encoding="utf-8")

# Define input and target data (the pop removes "Target" from `data`)
data_y = data.pop("Target").copy()
data_x = data.copy()

# Get data from all parties.
# NOTE(review): select_and_pop_features presumably removes the listed columns
# from data_x, so that what is left afterwards is the user's data - confirm in
# utils.pre_processing.
data_third_party = select_and_pop_features(data_x, ["Years_employed", "Unemployed"])
data_bank = select_and_pop_features(data_x, ["Account_length"])
data_user = data_x.copy()

# Feature engineer the data. The bank's data is used as is, only the user and
# third party data go through a fitted pre-processor.
pre_processor_user, pre_processor_third_party = get_pre_processors()

preprocessed_data_user = pre_processor_user.fit_transform(data_user)
preprocessed_data_bank = data_bank.to_numpy()
preprocessed_data_third_party = pre_processor_third_party.fit_transform(data_third_party)

# Columns are laid out as user | bank | third party.
# NOTE(review): this order presumably has to agree with INPUT_SLICES, which
# get_processed_multi_inputs uses to split the array again - confirm in settings.
preprocessed_data_x = numpy.concatenate(
    (preprocessed_data_user, preprocessed_data_bank, preprocessed_data_third_party),
    axis=1,
)

# The initial data-set is very imbalanced: use SMOTE to get better results.
# The sampler is seeded like the split below so that two runs of this script
# produce the same data, the same model and the same scores.
# NOTE(review): the resampling is done before the train/test split, so
# synthetic training samples may be built from points that end up in the test
# set and the scores printed below may be optimistic - consider resampling the
# training split only.
x, y = SMOTE(random_state=RANDOM_STATE).fit_resample(preprocessed_data_x, data_y)

# Retrieve the training and testing data
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.3,
    random_state=RANDOM_STATE,
)

# ---------------------------------------------------------------------------
# Train the model and compile it with all three inputs encrypted
# ---------------------------------------------------------------------------
print("\nTrain and compile the model")

model = MultiInputXGBClassifier(max_depth=3, n_estimators=40)

# fit_benchmark gives back two fitted models: the first one replaces `model`
# and is the one compiled below, the second one is only used as a reference
# when evaluating.
model, sklearn_model = model.fit_benchmark(X_train, y_train)

multi_inputs_train = get_processed_multi_inputs(X_train)

model.compile(
    *multi_inputs_train,
    inputs_encryption_status=["encrypted", "encrypted", "encrypted"],
)

# Delete the deployment folder and its content if it already exists
if DEPLOYMENT_PATH.is_dir():
    shutil.rmtree(DEPLOYMENT_PATH)

# ---------------------------------------------------------------------------
# Compare the reference model with the simulated compiled model
# ---------------------------------------------------------------------------
print("\nEvaluate the models")

y_pred_sklearn = sklearn_model.predict(X_test)

print(f"Sklearn accuracy score : {accuracy_score(y_test, y_pred_sklearn)*100:.2f}%")

multi_inputs_test = get_processed_multi_inputs(X_test)

y_pred_simulated = model.predict_multi_inputs(*multi_inputs_test, simulate=True)

print(f"Concrete ML accuracy score (simulated) : {accuracy_score(y_test, y_pred_simulated)*100:.2f}%")

# ---------------------------------------------------------------------------
# Write everything the deployment needs to disk
# ---------------------------------------------------------------------------
print("\nSave deployment files")

# Save files needed for deployment (and enable cross-platform deployment)
fhe_dev = MultiInputsFHEModelDev(DEPLOYMENT_PATH, model)
fhe_dev.save(via_mlir=True)

# Save pre-processors (the fitted objects, pickled as is)
with PRE_PROCESSOR_USER_PATH.open('wb') as file:
    pickle.dump(pre_processor_user, file)

with PRE_PROCESSOR_THIRD_PARTY_PATH.open('wb') as file:
    pickle.dump(pre_processor_third_party, file)

print("\nDone !")