"""Train and compile the model."""

import pickle
import shutil

import numpy
import pandas

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

from settings import (
    DEPLOYMENT_PATH,
    RANDOM_STATE,
    DATA_PATH,
    INPUT_SLICES,
    PRE_PROCESSOR_USER_PATH,
    PRE_PROCESSOR_THIRD_PARTY_PATH,
    USER_COLUMNS,
    BANK_COLUMNS,
    THIRD_PARTY_COLUMNS,
)
from utils.client_server_interface import MultiInputsFHEModelDev
from utils.model import MultiInputXGBClassifier
from utils.pre_processing import get_pre_processors


def get_processed_multi_inputs(data):
    """Split the pre-processed feature matrix into the three parties' inputs."""
    return (
        data[:, INPUT_SLICES["user"]],
        data[:, INPUT_SLICES["bank"]],
        data[:, INPUT_SLICES["third_party"]],
    )

print("Load and pre-process the data")

# The original data set can be found here:
# https://www.kaggle.com/datasets/rikdifos/credit-card-approval-prediction/data
# It has then been cleaned using the following notebook:
# https://www.kaggle.com/code/samuelcortinhas/credit-cards-data-cleaning
# A few additional pre-processing steps have been applied to this data set as well:
# - the "ID" column has been removed
# - "Total_income" values have been multiplied by 0.14 so that their median matches
#   France's 2023 median annual salary (22,050 euros)
data = pandas.read_csv(DATA_PATH, encoding="utf-8")

# Define input and target data
data_x = data.copy()
data_y = data_x.pop("Target").copy()

# Get data from all parties
data_user = data_x[USER_COLUMNS].copy()
data_bank = data_x[BANK_COLUMNS].copy()
data_third_party = data_x[THIRD_PARTY_COLUMNS].copy()

# Feature engineer the data
pre_processor_user, pre_processor_third_party = get_pre_processors()

preprocessed_data_user = pre_processor_user.fit_transform(data_user)

# No dedicated pre-processor is defined for the bank's features, so they are used as-is
preprocessed_data_bank = data_bank.to_numpy()

preprocessed_data_third_party = pre_processor_third_party.fit_transform(data_third_party)

preprocessed_data_x = numpy.concatenate(
    (preprocessed_data_user, preprocessed_data_bank, preprocessed_data_third_party), axis=1
)

# The initial data set is very imbalanced, so use SMOTE to balance the classes and get
# better results
x, y = SMOTE().fit_resample(preprocessed_data_x, data_y)
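
# Sanity check (illustrative addition, not part of the original pipeline): SMOTE
# oversamples the minority class, so the resampled target should now be balanced
print("Class distribution after SMOTE:\n", pandas.Series(y).value_counts())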

# Retrieve the training and testing data
X_train, X_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.3, random_state=RANDOM_STATE
)
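
# Illustrative consistency check (assumes INPUT_SLICES values are plain slice objects
# with integer bounds): the three per-party slices should together cover every
# pre-processed feature column produced by the concatenation above
assert sum(s.stop - s.start for s in INPUT_SLICES.values()) == X_train.shape[1]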


print("\nTrain and compile the model")

model = MultiInputXGBClassifier(max_depth=3, n_estimators=40)

model, sklearn_model = model.fit_benchmark(X_train, y_train)
 
multi_inputs_train = get_processed_multi_inputs(X_train)

# Compile the model on representative inputs, marking all three parties' inputs as encrypted
model.compile(*multi_inputs_train, inputs_encryption_status=["encrypted", "encrypted", "encrypted"])

# Delete the deployment folder and its contents if it already exists
if DEPLOYMENT_PATH.is_dir():
    shutil.rmtree(DEPLOYMENT_PATH)


print("\nEvaluate the models")

y_pred_sklearn = sklearn_model.predict(X_test)

print(f"Sklearn accuracy score: {accuracy_score(y_test, y_pred_sklearn) * 100:.2f}%")

multi_inputs_test = get_processed_multi_inputs(X_test)

y_pred_simulated = model.predict_multi_inputs(*multi_inputs_test, simulate=True)

print(f"Concrete ML accuracy score (simulated): {accuracy_score(y_test, y_pred_simulated) * 100:.2f}%")
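
# Note (illustrative addition): the simulated run above executes in the clear and only
# emulates FHE behavior. Assuming predict_multi_inputs also accepts simulate=False, the
# same call would run under actual FHE, at a much higher computational cost:
# y_pred_fhe = model.predict_multi_inputs(*multi_inputs_test, simulate=False)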


print("\nSave deployment files")

# Save files needed for deployment (and enable cross-platform deployment)
fhe_dev = MultiInputsFHEModelDev(DEPLOYMENT_PATH, model)
fhe_dev.save(via_mlir=True)

# Save pre-processors
with PRE_PROCESSOR_USER_PATH.open("wb") as file:
    pickle.dump(pre_processor_user, file)

with PRE_PROCESSOR_THIRD_PARTY_PATH.open("wb") as file:
    pickle.dump(pre_processor_third_party, file)
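
# Sanity check (illustrative addition): make sure the saved pre-processors can be
# reloaded, as the client/server code will do at inference time
with PRE_PROCESSOR_USER_PATH.open("rb") as file:
    assert pickle.load(file) is not None
with PRE_PROCESSOR_THIRD_PARTY_PATH.open("rb") as file:
    assert pickle.load(file) is not None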

print("\nDone !")