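# NOTE(review): the next line appears to be web-page header residue captured
# with this file, not part of the module:
# erasmopurif's picture / First commit / d2a8669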
from collections import namedtuple
import warnings
import numpy as np
import pandas as pd
from pandas.api.types import is_list_like, is_numeric_dtype
# Lightweight return containers: features and target, plus an optional third
# field carrying per-sample weights. Fields are reachable by position or name.
Dataset = namedtuple('Dataset', 'X y')
WeightedDataset = namedtuple('WeightedDataset', 'X y sample_weight')
class NumericConversionWarning(UserWarning):
    """Warn that a protected attribute or target stayed non-numeric.

    Used if a protected attribute or target is unable to be converted
    automatically to a numeric type.
    """
def _as_label_list(cols):
    """Normalize a column selector; return None when nothing was requested.

    ``None`` and empty list-likes mean "no selection". A scalar label
    (including falsy labels such as ``0``) is wrapped in a list so that the
    result of a column selection is always a DataFrame.
    """
    if cols is None:
        return None
    if not is_list_like(cols):
        return [cols]
    if not hasattr(cols, '__len__'):
        cols = list(cols)  # one-shot iterables have no length to test
    # len() instead of truthiness: NumPy arrays / Index objects raise on bool()
    return cols if len(cols) > 0 else None


def standardize_dataset(df, *, prot_attr, target, sample_weight=None,
                        usecols=None, dropcols=None, numeric_only=False,
                        dropna=True):
    """Separate data, targets, and possibly sample weights and populate
    protected attributes as sample properties.

    The input DataFrame is never modified; all conversions are applied to a
    copy.

    Args:
        df (pandas.DataFrame): DataFrame with features and, optionally, target.
        prot_attr (label or array-like or list of labels/arrays): Label, array
            of the same length as `df`, or a list containing any combination of
            the two corresponding to protected attribute columns. Even if these
            are dropped from the features, they remain in the index. Column(s)
            indicated by label will be copied from `df`, not dropped. Column(s)
            passed explicitly as arrays will not be added to features.
        target (label or array-like or list of labels/arrays): Label, array of
            the same length as `df`, or a list containing any combination of the
            two corresponding to the target (outcome) variable. Column(s)
            indicated by label will be dropped from features.
        sample_weight (single label or array-like, optional): Name of the column
            containing sample weights or an array of sample weights of the same
            length as `df`. If a label is passed, the column is dropped from
            features. Note: the index of a passed Series will be ignored.
        usecols (list-like, optional): Column(s) to keep. All others are
            dropped. A single label is also accepted.
        dropcols (list-like, optional): Column(s) to drop. Missing labels are
            ignored. A single label is also accepted.
        numeric_only (bool): Drop all non-numeric, non-binary feature columns.
            Ordered categorical columns are first replaced by their integer
            codes (missing values become NaN). A `NumericConversionWarning` is
            issued if a protected attribute or target is still non-numeric.
        dropna (bool): Drop rows with NAs in the features, target(s),
            protected attribute(s) or sample weights.

    Returns:
        collections.namedtuple:

            A tuple-like object where items can be accessed by index or name.
            Contains the following attributes:

            * **X** (`pandas.DataFrame`) -- Feature array.

            * **y** (`pandas.DataFrame` or `pandas.Series`) -- Target array;
              a Series for a single target, a DataFrame for several.

            * **sample_weight** (`pandas.Series`, optional) -- Sample weights.

    Raises:
        KeyError: If a label given for `prot_attr`, `target`, `sample_weight`
            or `usecols` is not a column of `df`.

    Note:
        The order of execution for the dropping parameters is: usecols ->
        dropcols -> numeric_only -> dropna.

    Examples:
        >>> import pandas as pd
        >>> from sklearn.linear_model import LinearRegression

        >>> df = pd.DataFrame([[0.5, 1, 1, 0.75], [-0.5, 0, 0, 0.25]],
        ...                   columns=['X', 'y', 'Z', 'w'])
        >>> train = standardize_dataset(df, prot_attr='Z', target='y',
        ...                             sample_weight='w')
        >>> reg = LinearRegression().fit(**train._asdict())

        >>> import numpy as np
        >>> from sklearn.datasets import make_classification
        >>> from sklearn.model_selection import train_test_split
        >>> df = pd.DataFrame(np.hstack(make_classification(n_features=5)))
        >>> X, y = standardize_dataset(df, prot_attr=0, target=5)
        >>> X_tr, X_te, y_tr, y_te = train_test_split(X, y)
    """
    if numeric_only:
        ordered_cols = [col for col in df.select_dtypes('category')
                        if df[col].cat.ordered]
        if ordered_cols:
            # Copy first so the caller's DataFrame keeps its categoricals.
            df = df.copy()
            for col in ordered_cols:
                df[col] = df[col].factorize(sort=True)[0]
                # factorize marks missing values with -1
                df[col] = df[col].replace(-1, np.nan)

    # protected attribute(s): kept as columns and reused as the row index
    df = df.set_index(prot_attr, drop=False)
    pa = df.index

    # target(s)
    df = df.set_index(target, drop=True)  # utilize set_index logic for mixed types
    # Squeeze the column axis only: a bare squeeze() would also collapse a
    # single-row frame into a scalar / wrongly-indexed Series.
    y = df.index.to_frame().squeeze(axis=1)
    df.index = y.index = pa

    # sample weight
    if sample_weight is not None:
        if is_list_like(sample_weight):
            sw = pd.Series(sample_weight)
        else:
            sw = df.pop(sample_weight)
        sw.index = pa

    # Column-wise drops
    usecols = _as_label_list(usecols)
    if usecols is not None:
        df = df.loc[:, usecols]
    dropcols = _as_label_list(dropcols)
    if dropcols is not None:
        df = df.drop(columns=dropcols, errors='ignore')
    if numeric_only:
        df = df.select_dtypes(['number', 'bool'])
        # warn if nonnumeric prot_attr or target but proceed
        pa_dtypes = pa.to_frame().dtypes
        if any(not is_numeric_dtype(dt) for dt in pa_dtypes):
            warnings.warn(f"index contains non-numeric:\n{pa_dtypes}",
                          category=NumericConversionWarning)
        # y is already a DataFrame when several targets were given
        y_dtypes = (y.dtypes if isinstance(y, pd.DataFrame)
                    else y.to_frame().dtypes)
        if any(not is_numeric_dtype(dt) for dt in y_dtypes):
            warnings.warn(f"y contains non-numeric column:\n{y_dtypes}",
                          category=NumericConversionWarning)

    # Index-wise drops
    if dropna:
        y_notna = y.notna()
        if isinstance(y, pd.DataFrame):
            # reduce to one flag per row; Series & DataFrame would align on
            # columns instead of rows
            y_notna = y_notna.all(axis=1)
        notna = (df.notna().all(axis=1) & y_notna
                 & pa.to_frame().notna().all(axis=1))
        if sample_weight is not None:
            notna &= sw.notna()
            sw = sw.loc[notna]
        df = df.loc[notna]
        y = y.loc[notna]

    # Categories that no longer occur after the drops are removed.
    for col in df.select_dtypes('category'):
        df[col] = df[col].cat.remove_unused_categories()

    if sample_weight is None:
        return Dataset(df, y)
    return WeightedDataset(df, y, sw)