"""Data pre-processing functions.

The pre-processing steps are heavily inspired by the following notebook:
https://www.kaggle.com/code/rikdifos/credit-card-approval-prediction-using-ml

Additional steps, mostly renaming some values or features, were added for a better user
experience.
"""

import numpy
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, KBinsDiscretizer


def _get_pipeline_replace_one_hot(func, value):
    """Build a pipeline that rewrites values with ``func`` and then one-hot encodes them.

    Args:
        func: Callable wrapped in a ``FunctionTransformer``. It is called with the data selected
            for this pipeline and receives ``value`` as a keyword argument.
        value: Object forwarded to ``func`` through ``kw_args={"value": value}``.

    Returns:
        Pipeline: An unfitted pipeline made of a "replace" step (the ``FunctionTransformer``)
        followed by a "one_hot" step (a ``OneHotEncoder`` with default settings).
    """
    replace_step = FunctionTransformer(
        func,
        kw_args={"value": value},
        # The replacement keeps one output column per input column, so feature names are reused.
        feature_names_out="one-to-one",
    )

    return Pipeline(
        [
            ("replace", replace_step),
            ("one_hot", OneHotEncoder()),
        ]
    )


def _replace_values_geq(column, value):
    """Replace every entry greater than or equal to ``value`` with ``"<value>_or_more"``.

    Args:
        column: Array-like of values supporting ``column >= value`` element-wise.
        value: Threshold; entries at or above it are replaced by the label
            ``f"{value}_or_more"``.

    Returns:
        numpy.ndarray: Array with the same shape as ``column``. Entries below the threshold are
        kept, but since they are combined with a string label NumPy stores the result in a
        common dtype (for a numeric input, the kept numbers end up as strings such as ``"1"``).
    """
    return numpy.where(column >= value, f"{value}_or_more", column)


def _replace_values_eq(column, value):
    """Replace groups of values in ``column`` by a single label per group.

    Args:
        column: Array-like of values to rewrite.
        value: Mapping ``{desired_value: values_to_replace}``. Every entry of ``column`` found in
            ``values_to_replace`` becomes ``desired_value``.

    Returns:
        The rewritten array (a ``numpy.ndarray`` as soon as one rule was applied). With an empty
        mapping, ``column`` is returned untouched.

    Note:
        Rules are applied one after the other, in the mapping's iteration order, each on the
        output of the previous one. A label written by an earlier rule can therefore be replaced
        again by a later rule that lists it.
    """
    for desired_value, values_to_replace in value.items():
        is_replaced = numpy.isin(column, values_to_replace)
        column = numpy.where(is_replaced, desired_value, column)
    return column


def get_pre_processors():
    """Build the two pre-processing ``ColumnTransformer`` objects.

    The first transformer handles the columns ``Num_children``, ``Num_family``, ``Income_type``,
    ``Education_type``, ``Occupation_type``, ``Housing_type``, ``Family_status``,
    ``Total_income`` and ``Age``. The second one handles ``Years_employed``. Both pass any other
    column through unchanged and use non-verbose output feature names.

    Returns:
        tuple: ``(pre_processor_user, pre_processor_third_party)``, two unfitted
        ``ColumnTransformer`` instances.
    """
    # Occupations are merged into three coarse groups before being one-hot encoded.
    occupation_groups = {
        "Labor_work": [
            "Cleaning staff",
            "Cooking staff",
            "Drivers",
            "Laborers",
            "Low-skill Laborers",
            "Security staff",
            "Waiters/barmen staff",
        ],
        "Office_work": [
            "Accountants",
            "Core staff",
            "HR staff",
            "Medicine staff",
            "Private service staff",
            "Realty agents",
            "Sales staff",
            "Secretaries",
        ],
        "High_tech_work": ["Managers", "High skill tech staff", "IT staff"],
    }

    pre_processor_user = ColumnTransformer(
        transformers=[
            # Counts at or above the threshold are collapsed into a single "<n>_or_more" category.
            (
                "replace_num_children",
                _get_pipeline_replace_one_hot(_replace_values_geq, 2),
                ["Num_children"],
            ),
            (
                "replace_num_family",
                _get_pipeline_replace_one_hot(_replace_values_geq, 3),
                ["Num_family"],
            ),
            # Some categories are merged into another existing category before encoding.
            (
                "replace_income_type",
                _get_pipeline_replace_one_hot(
                    _replace_values_eq,
                    {"State servant": ["Pensioner", "Student"]},
                ),
                ["Income_type"],
            ),
            (
                "replace_education_type",
                _get_pipeline_replace_one_hot(
                    _replace_values_eq,
                    {"Higher education": ["Academic degree"]},
                ),
                ["Education_type"],
            ),
            (
                "replace_occupation_type_labor",
                _get_pipeline_replace_one_hot(_replace_values_eq, occupation_groups),
                ["Occupation_type"],
            ),
            (
                "one_hot_housing_fam_status",
                OneHotEncoder(),
                ["Housing_type", "Family_status"],
            ),
            # Income is split into 3 quantile bins, age into 5 equal-width bins.
            (
                "qbin_total_income",
                KBinsDiscretizer(n_bins=3, strategy="quantile", encode="onehot"),
                ["Total_income"],
            ),
            (
                "bin_age",
                KBinsDiscretizer(n_bins=5, strategy="uniform", encode="onehot"),
                ["Age"],
            ),
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
    )

    pre_processor_third_party = ColumnTransformer(
        transformers=[
            (
                "bin_years_employed",
                KBinsDiscretizer(n_bins=5, strategy="uniform", encode="onehot"),
                ["Years_employed"],
            ),
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
    )

    return pre_processor_user, pre_processor_third_party


def select_and_pop_features(data, columns):
    """Extract ``columns`` from ``data`` and remove them from ``data`` in place.

    Args:
        data: Table to split; expected to behave like a pandas ``DataFrame`` (column selection
            with ``data[columns]``, ``.copy()`` and ``.drop(..., axis=1, inplace=True)``).
            It is modified in place: the selected columns are dropped from it.
        columns: Column label(s) to extract.

    Returns:
        An independent copy of ``data[columns]``, taken before the columns are dropped.
    """
    # Copy first so the returned object does not depend on ``data`` once the columns are dropped.
    new_data = data[columns].copy()
    data.drop(columns, axis=1, inplace=True)
    return new_data