Spaces:

zama-fhe
/

encrypted_credit_scoring

Running

App Files Files Community

encrypted_credit_scoring / utils /pre_processing.py

romanbredehoft-zama

Add descriptions and fix comments

747c295 about 1 year ago

raw

history blame

3.55 kB

	"""Data pre-processing functions.

	The pre-processing steps are heavily inspired by the following notebook:
	https://www.kaggle.com/code/rikdifos/credit-card-approval-prediction-using-ml

	Additional steps, mostly renaming some values or features, were added for a better user
	experience.
	"""

	import numpy
	from sklearn.compose import ColumnTransformer
	from sklearn.pipeline import Pipeline
	from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, KBinsDiscretizer


	def _get_pipeline_replace_one_hot(func, value):
	    """Build a pipeline that rewrites values and then one-hot encodes them.

	    Args:
	        func: Callable wrapped in a ``FunctionTransformer``; it receives
	            ``value`` through the ``value`` keyword argument.
	        value: Object forwarded to ``func`` as its ``value`` keyword argument.

	    Returns:
	        A ``Pipeline`` made of a "replace" step followed by a "one_hot" step.
	    """
	    # 'one-to-one' keeps the input feature names unchanged through the
	    # replacement step, so the encoder sees the original column names.
	    replace_step = FunctionTransformer(
	        func,
	        kw_args={"value": value},
	        feature_names_out='one-to-one',
	    )
	    encode_step = OneHotEncoder()

	    steps = [
	        ("replace", replace_step),
	        ("one_hot", encode_step),
	    ]
	    return Pipeline(steps)


	def _replace_values_geq(column, value):
	return numpy.where(column >= value, f"{value}_or_more", column)

	def _replace_values_eq(column, value):
	for desired_value, values_to_replace in value.items():
	column = numpy.where(numpy.isin(column, values_to_replace), desired_value, column)
	return column

	def get_pre_processors():
	    """Create the two column pre-processors used on the input features.

	    The first transformer handles the 'Num_children', 'Num_family',
	    'Income_type', 'Education_type', 'Occupation_type', 'Housing_type',
	    'Family_status', 'Total_income' and 'Age' columns; the second one handles
	    'Years_employed'. In both, every other column is passed through untouched
	    and output feature names are not prefixed with the transformer name.

	    Returns:
	        A ``(pre_processor_user, pre_processor_third_party)`` tuple of
	        unfitted ``ColumnTransformer`` objects.
	    """
	    # Counts at or above the threshold are merged into one "<n>_or_more" category.
	    children_pipeline = _get_pipeline_replace_one_hot(_replace_values_geq, 2)
	    family_pipeline = _get_pipeline_replace_one_hot(_replace_values_geq, 3)

	    # Categories listed on the right are renamed to the key on the left.
	    income_pipeline = _get_pipeline_replace_one_hot(
	        _replace_values_eq,
	        {"State servant": ["Pensioner", "Student"]},
	    )
	    education_pipeline = _get_pipeline_replace_one_hot(
	        _replace_values_eq,
	        {"Higher education": ["Academic degree"]},
	    )

	    # Individual occupations are grouped into three broader families.
	    occupation_groups = {
	        "Labor_work": [
	            "Cleaning staff",
	            "Cooking staff",
	            "Drivers",
	            "Laborers",
	            "Low-skill Laborers",
	            "Security staff",
	            "Waiters/barmen staff",
	        ],
	        "Office_work": [
	            "Accountants",
	            "Core staff",
	            "HR staff",
	            "Medicine staff",
	            "Private service staff",
	            "Realty agents",
	            "Sales staff",
	            "Secretaries",
	        ],
	        "High_tech_work": [
	            "Managers",
	            "High skill tech staff",
	            "IT staff",
	        ],
	    }
	    occupation_pipeline = _get_pipeline_replace_one_hot(_replace_values_eq, occupation_groups)

	    # Order matters: it fixes the order of the output columns.
	    user_transformers = [
	        ("replace_num_children", children_pipeline, ['Num_children']),
	        ("replace_num_family", family_pipeline, ['Num_family']),
	        ("replace_income_type", income_pipeline, ['Income_type']),
	        ("replace_education_type", education_pipeline, ['Education_type']),
	        ("replace_occupation_type_labor", occupation_pipeline, ['Occupation_type']),
	        (
	            'one_hot_housing_fam_status',
	            OneHotEncoder(),
	            ['Housing_type', 'Family_status'],
	        ),
	        (
	            'qbin_total_income',
	            KBinsDiscretizer(n_bins=3, strategy='quantile', encode="onehot"),
	            ['Total_income'],
	        ),
	        (
	            'bin_age',
	            KBinsDiscretizer(n_bins=5, strategy='uniform', encode="onehot"),
	            ['Age'],
	        ),
	    ]
	    pre_processor_user = ColumnTransformer(
	        transformers=user_transformers,
	        remainder='passthrough',
	        verbose_feature_names_out=False,
	    )

	    third_party_transformers = [
	        (
	            'bin_years_employed',
	            KBinsDiscretizer(n_bins=5, strategy='uniform', encode="onehot"),
	            ['Years_employed'],
	        ),
	    ]
	    pre_processor_third_party = ColumnTransformer(
	        transformers=third_party_transformers,
	        remainder='passthrough',
	        verbose_feature_names_out=False,
	    )

	    return pre_processor_user, pre_processor_third_party


	def select_and_pop_features(data, columns):
	    """Extract the given columns from ``data`` and remove them from it.

	    Args:
	        data: Data frame to take the columns from; it is modified in place.
	        columns: Labels of the columns to extract.

	    Returns:
	        A copy of ``data[columns]`` taken before the columns are dropped.
	    """
	    # Copy first so the returned frame does not depend on `data` afterwards.
	    extracted = data[columns].copy()
	    data.drop(columns=columns, inplace=True)
	    return extracted