Spaces:
Runtime error
Runtime error
from collections import namedtuple | |
import warnings | |
import numpy as np | |
import pandas as pd | |
from pandas.api.types import is_list_like, is_numeric_dtype | |
# Lightweight return containers. Field names match the keyword arguments of
# scikit-learn style ``fit`` methods so a result can be splatted directly,
# e.g. ``estimator.fit(**result._asdict())``.
Dataset = namedtuple('Dataset', ('X', 'y'))
# Same as ``Dataset`` with an additional per-row weight entry.
WeightedDataset = namedtuple('WeightedDataset', ('X', 'y', 'sample_weight'))
class NumericConversionWarning(UserWarning):
    """Signal that a protected attribute or target is still non-numeric.

    Emitted when a protected attribute or target column could not be
    converted to a numeric type automatically. Processing continues after
    the warning is issued.
    """
def _labels_given(cols):
    """Return True if `cols` names at least one column label."""
    if cols is None:
        return False
    if is_list_like(cols):
        # Use len() rather than truthiness: bool() of a numpy array or a
        # pandas Index with more than one element raises ValueError.
        try:
            return len(cols) > 0
        except TypeError:
            # Unsized iterable; hand it to pandas unchanged.
            return True
    # A scalar label -- including falsy ones such as the integer label 0.
    return True


def standardize_dataset(df, *, prot_attr, target, sample_weight=None,
                        usecols=None, dropcols=None, numeric_only=False,
                        dropna=True):
    """Separate data, targets, and possibly sample weights and populate
    protected attributes as sample properties.

    The input DataFrame is never modified; all results are new objects.

    Args:
        df (pandas.DataFrame): DataFrame with features and, optionally, target.
        prot_attr (label or array-like or list of labels/arrays): Label, array
            of the same length as `df`, or a list containing any combination of
            the two corresponding to protected attribute columns. Even if these
            are dropped from the features, they remain in the index. Column(s)
            indicated by label will be copied from `df`, not dropped. Column(s)
            passed explicitly as arrays will not be added to features.
        target (label or array-like or list of labels/arrays): Label, array of
            the same length as `df`, or a list containing any combination of the
            two corresponding to the target (outcome) variable. Column(s)
            indicated by label will be dropped from features.
        sample_weight (single label or array-like, optional): Name of the column
            containing sample weights or an array of sample weights of the same
            length as `df`. If a label is passed, the column is dropped from
            features. Note: the index of a passed Series will be ignored.
        usecols (label or list-like, optional): Column(s) to keep. All others
            are dropped. ``None`` or an empty list-like keeps every column.
        dropcols (label or list-like, optional): Column(s) to drop. Missing
            labels are ignored.
        numeric_only (bool): Drop all non-numeric, non-binary feature columns.
            Ordered categorical columns are first replaced by integer codes
            (missing values become NaN) so that they are retained.
        dropna (bool): Drop rows with NAs in the features, target, protected
            attributes or sample weights.

    Returns:
        collections.namedtuple:

            A tuple-like object where items can be accessed by index or name.
            Contains the following attributes:

            * **X** (`pandas.DataFrame`) -- Feature array.
            * **y** (`pandas.DataFrame` or `pandas.Series`) -- Target array.
              A Series for a single target column, a DataFrame otherwise.
            * **sample_weight** (`pandas.Series`, optional) -- Sample weights.
              Only present if `sample_weight` was given.

    Warns:
        NumericConversionWarning: If `numeric_only` is set and a protected
            attribute or target column is not numeric.

    Note:
        The order of execution for the dropping parameters is: usecols ->
        dropcols -> numeric_only -> dropna.

    Examples:
        >>> import pandas as pd
        >>> from sklearn.linear_model import LinearRegression

        >>> df = pd.DataFrame([[0.5, 1, 1, 0.75], [-0.5, 0, 0, 0.25]],
        ...                   columns=['X', 'y', 'Z', 'w'])
        >>> train = standardize_dataset(df, prot_attr='Z', target='y',
        ...                             sample_weight='w')
        >>> reg = LinearRegression().fit(**train._asdict())

        >>> import numpy as np
        >>> from sklearn.datasets import make_classification
        >>> from sklearn.model_selection import train_test_split
        >>> df = pd.DataFrame(np.hstack(make_classification(n_features=5)))
        >>> X, y = standardize_dataset(df, prot_attr=0, target=5)
        >>> X_tr, X_te, y_tr, y_te = train_test_split(X, y)
    """
    if numeric_only:
        ordered_cols = [col for col in df.select_dtypes('category')
                        if df[col].cat.ordered]
        if ordered_cols:
            # Work on a private copy so the caller's frame is left untouched.
            df = df.copy()
        for col in ordered_cols:
            # factorize marks missing values with -1; turn those back into NaN
            df[col] = df[col].factorize(sort=True)[0]
            df[col] = df[col].replace(-1, np.nan)

    # protected attribute(s)
    df = df.set_index(prot_attr, drop=False)
    pa = df.index

    # target(s)
    df = df.set_index(target, drop=True)  # utilize set_index logic for mixed types
    y_frame = df.index.to_frame()
    y_frame.index = pa
    df.index = pa
    # Squeeze columns only: a single-row frame must not collapse to a scalar.
    y = y_frame.squeeze(axis=1)

    # sample weight
    if sample_weight is not None:
        if is_list_like(sample_weight):
            # Shallow copy so that re-indexing below can never touch a Series
            # object owned by the caller.
            sw = pd.Series(sample_weight).copy(deep=False)
        else:
            sw = df.pop(sample_weight)
        sw.index = pa

    # Column-wise drops
    if _labels_given(usecols):
        if not is_list_like(usecols):
            usecols = [usecols]  # ensure output is DataFrame, not Series
        df = df.loc[:, usecols]
    if _labels_given(dropcols):
        df = df.drop(columns=dropcols, errors='ignore')
    if numeric_only:
        df = df.select_dtypes(['number', 'bool'])
        # warn if nonnumeric prot_attr or target but proceed
        pa_dtypes = pa.to_frame().dtypes
        if any(not is_numeric_dtype(dt) for dt in pa_dtypes):
            warnings.warn(f"index contains non-numeric:\n{pa_dtypes}",
                          category=NumericConversionWarning)
        y_dtypes = y_frame.dtypes
        if any(not is_numeric_dtype(dt) for dt in y_dtypes):
            warnings.warn(f"y contains non-numeric column:\n{y_dtypes}",
                          category=NumericConversionWarning)

    # Index-wise drops
    if dropna:
        # Combine the masks positionally (plain boolean arrays) so that the
        # result does not depend on label alignment of the shared index.
        keep = (df.notna().all(axis=1).to_numpy()
                & y_frame.notna().all(axis=1).to_numpy()
                & pa.to_frame().notna().all(axis=1).to_numpy())
        if sample_weight is not None:
            keep &= sw.notna().to_numpy()
            sw = sw.loc[keep]
        df = df.loc[keep]
        y = y.loc[keep]
        # Dropping rows may leave categories with no remaining observations.
        for col in df.select_dtypes('category'):
            df[col] = df[col].cat.remove_unused_categories()

    if sample_weight is None:
        return Dataset(df, y)
    return WeightedDataset(df, y, sw)