# FairUP/src/aif360/datasets/regression_dataset.py
# Repository page header: erasmopurif -- "First commit" (d2a8669)
from logging import warning
import numpy as np
import pandas as pd
from aif360.datasets import StructuredDataset
from sklearn.preprocessing import MinMaxScaler
class RegressionDataset(StructuredDataset):
    """Base class for regression datasets."""

    def __init__(self, df, dep_var_name, protected_attribute_names,
                 privileged_classes, instance_weights_name='',
                 categorical_features=[], na_values=[],
                 custom_preprocessing=None, metadata=None):
        """
        Subclasses of RegressionDataset should perform the following before
        calling `super().__init__`:

            1. Load the dataframe from a raw file.

        Then, this class will go through a standard preprocessing routine which:

            2. (optional) Performs some dataset-specific preprocessing (e.g.
               renaming columns/values, handling missing data).

            3. Drops rows with NA values.

            4. Creates a one-hot encoding of the categorical variables.

            5. Maps protected attributes to binary privileged/unprivileged
               values (1/0).

            6. Normalizes df values (min-max scaling of every column) and
               expresses the privileged/unprivileged group values in the same
               scaled units, so they keep matching the stored column values.

        Args:
            df (pandas.DataFrame): DataFrame on which to perform standard
                processing.
            dep_var_name: Name of the dependent variable column in `df`.
            protected_attribute_names (list): List of names corresponding to
                protected attribute columns in `df`.
            privileged_classes (list(list or function)): Each element is
                a list of values which are considered privileged or a boolean
                function which return `True` if privileged for the corresponding
                column in `protected_attribute_names`. All others are
                unprivileged. Values are mapped to 1 (privileged) and 0
                (unprivileged) if they are not already numerical.
            instance_weights_name (optional): Name of the instance weights
                column in `df`.
            categorical_features (optional, list): List of column names in the
                DataFrame which are to be expanded into one-hot vectors.
            na_values (optional): Additional strings to recognize as NA. See
                :func:`pandas.read_csv` for details. NOTE: accepted for
                interface compatibility but not read by this constructor;
                subclasses are expected to apply it when loading the raw file.
            custom_preprocessing (function): A function object which
                acts on and returns a DataFrame (f: DataFrame -> DataFrame). If
                `None`, no extra preprocessing is applied.
            metadata (optional): Additional metadata to append.
        """
        # NOTE: the list defaults in the signature are never mutated here, so
        # sharing them across calls is harmless; they are kept for
        # signature compatibility.

        # 2. Perform dataset-specific preprocessing
        if custom_preprocessing:
            df = custom_preprocessing(df)

        # 3. Remove any rows that have missing data.
        dropped = df.dropna()
        count = df.shape[0] - dropped.shape[0]
        if count > 0:
            warning("Missing Data: {} rows removed from {}.".format(count,
                    type(self).__name__))
        df = dropped

        # 4. Create a one-hot encoding of the categorical variables.
        df = pd.get_dummies(df, columns=categorical_features, prefix_sep='=')

        # 5. Map protected attributes to privileged/unprivileged.
        # The group values are collected in the columns' *raw* units here and
        # converted to scaled units after step 6.
        raw_privileged = []
        raw_unprivileged = []
        for attr, vals in zip(protected_attribute_names, privileged_classes):
            privileged_values = [1.]
            unprivileged_values = [0.]
            if callable(vals):
                df[attr] = df[attr].apply(vals)
            elif np.issubdtype(df[attr].dtype, np.number):
                # this attribute is numeric; no remapping needed
                privileged_values = vals
                unprivileged_values = list(set(df[attr]).difference(vals))
            else:
                # find all instances which match any of the attribute values
                priv = np.logical_or.reduce(
                    np.equal.outer(vals, df[attr].to_numpy()))
                df.loc[priv, attr] = privileged_values[0]
                df.loc[~priv, attr] = unprivileged_values[0]

            raw_privileged.append(privileged_values)
            raw_unprivileged.append(unprivileged_values)

        # 6. Normalize df values
        columns = list(df)
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(df.values)
        df = pd.DataFrame(scaled, columns=columns, index=df.index)

        # The scaler rewrites every column as `x * scale_ + min_`, protected
        # attributes included. Apply the same per-column transform to the
        # group values; otherwise a numeric attribute coded e.g. 1/2, or a
        # column where every row belongs to one group (which collapses to 0),
        # would no longer match the values recorded for its groups. For
        # columns that already span exactly 0..1 this is a no-op.
        privileged_protected_attributes = []
        unprivileged_protected_attributes = []
        for attr, priv_vals, unpriv_vals in zip(protected_attribute_names,
                                                raw_privileged,
                                                raw_unprivileged):
            col = columns.index(attr)
            col_scale = scaler.scale_[col]
            col_shift = scaler.min_[col]
            # Compute in the scaler's output dtype so the results are
            # bit-identical to the stored column values, then widen.
            priv_scaled = (np.array(priv_vals, dtype=scaled.dtype)
                           * col_scale + col_shift)
            unpriv_scaled = (np.array(unpriv_vals, dtype=scaled.dtype)
                             * col_scale + col_shift)
            privileged_protected_attributes.append(
                priv_scaled.astype(np.float64))
            unprivileged_protected_attributes.append(
                unpriv_scaled.astype(np.float64))

        super(RegressionDataset, self).__init__(
            df=df,
            label_names=[dep_var_name],
            protected_attribute_names=protected_attribute_names,
            privileged_protected_attributes=privileged_protected_attributes,
            unprivileged_protected_attributes=unprivileged_protected_attributes,
            instance_weights_name=instance_weights_name,
            scores_names=[],
            metadata=metadata)