File size: 2,322 Bytes
18ba8c1 9a997e4 18ba8c1 9a997e4 18ba8c1 9a997e4 18ba8c1 9a997e4 b47829b 9a997e4 18ba8c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
"""Data pre-processing functions."""
import numpy
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
def _get_pipeline_replace_one_hot(func, value):
    """Build a two-step pipeline: value replacement, then one-hot encoding.

    Args:
        func: Callable wrapped in a ``FunctionTransformer``; it is invoked
            with the data plus the keyword argument ``value``.
        value: Object forwarded to ``func`` as its ``value`` keyword argument.

    Returns:
        An unfitted ``Pipeline`` with the steps ``"replace"`` and ``"one_hot"``.
    """
    # 'one-to-one' declares that the replace step keeps the input feature names.
    replace_step = FunctionTransformer(
        func,
        kw_args={"value": value},
        feature_names_out='one-to-one',
    )
    encode_step = OneHotEncoder()
    return Pipeline(steps=[("replace", replace_step), ("one_hot", encode_step)])
def _replace_values_eq(column, value):
for desired_value, values_to_replace in value.items():
column = numpy.where(numpy.isin(column, values_to_replace), desired_value, column)
return column
def get_pre_processors():
    """Build the three unfitted ``ColumnTransformer`` pre-processors.

    Returns:
        A tuple ``(pre_processor_user, pre_processor_bank,
        pre_processor_third_party)``. Every transformer passes columns it does
        not list through unchanged and emits feature names without the
        transformer-name prefix.

        * user: groups ``Occupation_type`` values into three labels and
          one-hot encodes the result, one-hot encodes four further
          categorical columns, and standard-scales four numeric columns.
        * bank: standard-scales ``Account_age``.
        * third party: has no transformers, so every column is passed through.
    """
    # Group label -> raw occupation values that are replaced by that label.
    occupation_groups = {
        "Labor_work": [
            "Cooking Staff", "Carpenter", "Plumber", "Factory Worker", "Bus Driver"
        ],
        "Office_work": [
            "Business Owners", "Office Worker", "Accountant", "Entrepreneur", "Salesperson"
        ],
        "High_tech_work": ["Engineer", "Manager", "Consultant", "Software Developer"],
    }
    one_hot_columns = ['Housing_type', 'Family_status', 'Education_type', 'Income_type']
    scaled_user_columns = ['Num_children', 'Household_size', 'Total_income', 'Age']

    # Keyword options shared by all three column transformers.
    common_options = {
        "remainder": 'passthrough',
        "verbose_feature_names_out": False,
    }

    occupation_pipeline = _get_pipeline_replace_one_hot(
        _replace_values_eq,
        occupation_groups,
    )
    user_transformers = [
        ("replace_occupation_type_labor", occupation_pipeline, ['Occupation_type']),
        ('one_hot_others', OneHotEncoder(), one_hot_columns),
        ('standard_scaler', StandardScaler(), scaled_user_columns),
    ]
    bank_transformers = [
        ('standard_scaler', StandardScaler(), ['Account_age']),
    ]

    pre_processor_user = ColumnTransformer(transformers=user_transformers, **common_options)
    pre_processor_bank = ColumnTransformer(transformers=bank_transformers, **common_options)
    pre_processor_third_party = ColumnTransformer(transformers=[], **common_options)

    return pre_processor_user, pre_processor_bank, pre_processor_third_party