File size: 5,456 Bytes
d2a8669
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from collections import namedtuple
import warnings

import numpy as np
import pandas as pd
from pandas.api.types import is_list_like, is_numeric_dtype


# (X, y) container returned by `standardize_dataset` when no sample weights
# are requested.
Dataset = namedtuple('Dataset', ('X', 'y'))
# Same as `Dataset` with a trailing `sample_weight` field; returned when
# sample weights are requested.
WeightedDataset = namedtuple('WeightedDataset',
                             Dataset._fields + ('sample_weight',))

class NumericConversionWarning(UserWarning):
    """Signal that a protected attribute or target could not be converted
    to a numeric type automatically.
    """

def _has_labels(labels):
    """Return True if `labels` designates at least one column label.

    ``None`` and empty sized list-likes mean "nothing requested". Any other
    value counts, including falsy scalar labels such as ``0`` and array-likes
    whose truth value is ambiguous (numpy arrays, pandas Index/Series).
    """
    if labels is None:
        return False
    if is_list_like(labels) and hasattr(labels, '__len__'):
        return len(labels) > 0
    return True  # scalar label or unsized iterable


def standardize_dataset(df, *, prot_attr, target, sample_weight=None,
        usecols=None, dropcols=None, numeric_only=False, dropna=True):
    """Separate data, targets, and possibly sample weights and populate
    protected attributes as sample properties.

    The input DataFrame is never modified; all work is done on copies.

    Args:
        df (pandas.DataFrame): DataFrame with features and, optionally, target.
        prot_attr (label or array-like or list of labels/arrays): Label, array
            of the same length as `df`, or a list containing any combination of
            the two corresponding to protected attribute columns. Even if these
            are dropped from the features, they remain in the index. Column(s)
            indicated by label will be copied from `df`, not dropped. Column(s)
            passed explicitly as arrays will not be added to features.
        target (label or array-like or list of labels/arrays): Label, array of
            the same length as `df`, or a list containing any combination of the
            two corresponding to the target (outcome) variable. Column(s)
            indicated by label will be dropped from features.
        sample_weight (single label or array-like, optional): Name of the column
            containing sample weights or an array of sample weights of the same
            length as `df`. If a label is passed, the column is dropped from
            features. Note: the index of a passed Series will be ignored.
        usecols (list-like, optional): Column(s) to keep. All others are
            dropped. ``None`` or an empty list-like keeps every column.
        dropcols (list-like, optional): Column(s) to drop. Missing labels are
            ignored.
        numeric_only (bool): Drop all non-numeric, non-binary feature columns.
            Ordered categorical columns are first replaced by their sorted
            integer codes (missing values become NaN) so they are retained.
        dropna (bool): Drop rows with NAs in the features, target(s),
            protected attribute(s) or sample weights.

    Returns:
        collections.namedtuple:

            A tuple-like object where items can be accessed by index or name.
            Contains the following attributes:

            * **X** (`pandas.DataFrame`) -- Feature array.

            * **y** (`pandas.DataFrame` or `pandas.Series`) -- Target array.
              A Series for a single target, a DataFrame for several.

            * **sample_weight** (`pandas.Series`, optional) -- Sample weights.

    Warns:
        NumericConversionWarning: If `numeric_only` is set and a protected
            attribute or target column is not of a numeric dtype.

    Note:
        The order of execution for the dropping parameters is: usecols ->
        dropcols -> numeric_only -> dropna.

    Examples:
        >>> import pandas as pd
        >>> from sklearn.linear_model import LinearRegression

        >>> df = pd.DataFrame([[0.5, 1, 1, 0.75], [-0.5, 0, 0, 0.25]],
        ...                   columns=['X', 'y', 'Z', 'w'])
        >>> train = standardize_dataset(df, prot_attr='Z', target='y',
        ...                             sample_weight='w')
        >>> reg = LinearRegression().fit(**train._asdict())

        >>> import numpy as np
        >>> from sklearn.datasets import make_classification
        >>> from sklearn.model_selection import train_test_split
        >>> df = pd.DataFrame(np.hstack(make_classification(n_features=5)))
        >>> X, y = standardize_dataset(df, prot_attr=0, target=5)
        >>> X_tr, X_te, y_tr, y_te = train_test_split(X, y)
    """
    if numeric_only:
        ordered_cols = [col for col in df.select_dtypes('category')
                        if df[col].cat.ordered]
        if ordered_cols:
            # copy first: the encoding below must not leak into the caller's df
            df = df.copy()
            for col in ordered_cols:
                df[col] = df[col].factorize(sort=True)[0]
                # factorize marks missing values with -1
                df[col] = df[col].replace(-1, np.nan)

    # protected attribute(s)
    df = df.set_index(prot_attr, drop=False)
    pa = df.index

    # target(s)
    df = df.set_index(target, drop=True)  # utilize set_index logic for mixed types
    # squeeze the column axis only so a single-row frame still yields a
    # Series (one target) or DataFrame (several targets), never a scalar
    y = df.index.to_frame().squeeze(axis='columns')
    df.index = y.index = pa

    # sample weight
    if sample_weight is not None:
        if is_list_like(sample_weight):
            # shallow copy so re-indexing never touches a Series owned by
            # the caller
            sw = pd.Series(sample_weight).copy(deep=False)
        else:
            sw = df.pop(sample_weight)
        sw.index = pa

    # Column-wise drops
    if _has_labels(usecols):
        if not is_list_like(usecols):
            usecols = [usecols]  # ensure output is DataFrame, not Series
        df = df.loc[:, usecols]
    if _has_labels(dropcols):
        df = df.drop(columns=dropcols, errors='ignore')
    if numeric_only:
        df = df.select_dtypes(['number', 'bool'])
        # warn if nonnumeric prot_attr or target but proceed
        pa_dtypes = pa.to_frame().dtypes
        if any(not is_numeric_dtype(dt) for dt in pa_dtypes):
            warnings.warn(f"index contains non-numeric:\n{pa_dtypes}",
                          category=NumericConversionWarning)
        # y is already a DataFrame when several targets were given
        y_dtypes = (y if isinstance(y, pd.DataFrame) else y.to_frame()).dtypes
        if any(not is_numeric_dtype(dt) for dt in y_dtypes):
            warnings.warn(f"y contains non-numeric column:\n{y_dtypes}",
                          category=NumericConversionWarning)

    # Index-wise drops
    if dropna:
        y_frame = y if isinstance(y, pd.DataFrame) else y.to_frame()
        # combine row masks positionally; every object shares the index `pa`,
        # which commonly holds duplicate values
        keep = (df.notna().all(axis=1).to_numpy()
                & y_frame.notna().all(axis=1).to_numpy()
                & pa.to_frame().notna().all(axis=1).to_numpy())
        if sample_weight is not None:
            keep &= sw.notna().to_numpy()
            sw = sw.loc[keep]
        df = df.loc[keep]
        y = y.loc[keep]

    # categories that only occurred in dropped rows are no longer needed
    for col in df.select_dtypes('category'):
        df[col] = df[col].cat.remove_unused_categories()

    return Dataset(df, y) if sample_weight is None else WeightedDataset(df, y, sw)