File size: 3,285 Bytes
9a997e4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
"""Data pre-processing functions."""
import numpy
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, KBinsDiscretizer
def _get_pipeline_replace_one_hot(func, value):
return Pipeline([
("replace", FunctionTransformer(
func,
kw_args={"value": value},
feature_names_out='one-to-one',
)),
("one_hot", OneHotEncoder(),),
])
def _replace_values_geq(column, value):
return numpy.where(column >= value, f"{value}_or_more", column)
def _replace_values_eq(column, value):
for desired_value, values_to_replace in value.items():
column = numpy.where(numpy.isin(column, values_to_replace), desired_value, column)
return column
def get_pre_processors():
pre_processor_user = ColumnTransformer(
transformers=[
(
"replace_num_children",
_get_pipeline_replace_one_hot(_replace_values_geq, 2),
['Num_children']
),
(
"replace_num_family",
_get_pipeline_replace_one_hot(_replace_values_geq, 3),
['Num_family']
),
(
"replace_income_type",
_get_pipeline_replace_one_hot(_replace_values_eq, {"State servant": ["Pensioner", "Student"]}),
['Income_type']
),
(
"replace_education_type",
_get_pipeline_replace_one_hot(_replace_values_eq, {"Higher education": ["Academic degree"]}),
['Education_type']
),
(
"replace_occupation_type_labor",
_get_pipeline_replace_one_hot(
_replace_values_eq,
{
"Labor_work": ["Cleaning staff", "Cooking staff", "Drivers", "Laborers", "Low-skill Laborers", "Security staff", "Waiters/barmen staff"],
"Office_work": ["Accountants", "Core staff", "HR staff", "Medicine staff", "Private service staff", "Realty agents", "Sales staff", "Secretaries"],
"High_tech_work": ["Managers", "High skill tech staff", "IT staff"],
},
),
['Occupation_type']
),
('one_hot_housing_fam_status', OneHotEncoder(), ['Housing_type', 'Family_status']),
('qbin_total_income', KBinsDiscretizer(n_bins=3, strategy='quantile', encode="onehot"), ['Total_income']),
('bin_age', KBinsDiscretizer(n_bins=5, strategy='uniform', encode="onehot"), ['Age']),
],
remainder='passthrough',
verbose_feature_names_out=False,
)
pre_processor_third_party = ColumnTransformer(
transformers=[
('bin_years_employed', KBinsDiscretizer(n_bins=5, strategy='uniform', encode="onehot"), ['Years_employed'])
],
remainder='passthrough',
verbose_feature_names_out=False,
)
return pre_processor_user, pre_processor_third_party
def select_and_pop_features(data, columns):
new_data = data[columns].copy()
data.drop(columns, axis=1, inplace=True)
return new_data |