Spaces:
Runtime error
Runtime error
from logging import warning | |
import numpy as np | |
import pandas as pd | |
from aif360.datasets import StructuredDataset | |
from sklearn.preprocessing import MinMaxScaler | |
class RegressionDataset(StructuredDataset):
    """Base class for regression datasets."""

    def __init__(self, df, dep_var_name, protected_attribute_names,
                 privileged_classes, instance_weights_name='',
                 categorical_features=None, na_values=None,
                 custom_preprocessing=None, metadata=None):
        """
        Subclasses of RegressionDataset should perform the following before
        calling `super().__init__`:

            1. Load the dataframe from a raw file.

        Then, this class will go through a standard preprocessing routine
        which:

            2. (optional) Performs some dataset-specific preprocessing (e.g.
               renaming columns/values, handling missing data).
            3. Drops rows with NA values.
            4. Creates a one-hot encoding of the categorical variables.
            5. Maps non-numeric protected attributes to binary
               privileged/unprivileged values (1/0).
            6. Normalizes every column of the frame with a min-max scaler.

        Args:
            df (pandas.DataFrame): DataFrame on which to perform standard
                processing.
            dep_var_name: Name of the dependent variable column in `df`.
            protected_attribute_names (list): List of names corresponding to
                protected attribute columns in `df`.
            privileged_classes (list(list or function)): Each element is
                a list of values which are considered privileged or a boolean
                function which return `True` if privileged for the
                corresponding column in `protected_attribute_names`. All
                others are unprivileged. Values are mapped to 1 (privileged)
                and 0 (unprivileged) if they are not already numerical.
            instance_weights_name (optional): Name of the instance weights
                column in `df`.
            categorical_features (optional, list): List of column names in
                the DataFrame which are to be expanded into one-hot vectors.
                `None` (the default) means no column is expanded.
            na_values (optional): Additional strings to recognize as NA. See
                :func:`pandas.read_csv` for details. This constructor does
                not read the argument itself; presumably subclasses apply it
                while loading the raw file.
            custom_preprocessing (function): A function object which
                acts on and returns a DataFrame (f: DataFrame -> DataFrame).
                If `None`, no extra preprocessing is applied.
            metadata (optional): Additional metadata to append.
        """
        # `None` replaces the former mutable `[]` default. It must be turned
        # back into an empty list: `pd.get_dummies(columns=None)` would encode
        # every object/category column instead of none of them.
        if categorical_features is None:
            categorical_features = []

        # 2. Perform dataset-specific preprocessing
        if custom_preprocessing:
            df = custom_preprocessing(df)

        # 3. Remove any rows that have missing data.
        dropped = df.dropna()
        count = df.shape[0] - dropped.shape[0]
        if count > 0:
            warning("Missing Data: {} rows removed from {}.".format(count,
                    type(self).__name__))
        df = dropped

        # 4. Create a one-hot encoding of the categorical variables.
        df = pd.get_dummies(df, columns=categorical_features, prefix_sep='=')

        # 5. Map protected attributes to privileged/unprivileged
        privileged_protected_attributes = []
        unprivileged_protected_attributes = []
        for attr, vals in zip(protected_attribute_names, privileged_classes):
            privileged_values = [1.]
            unprivileged_values = [0.]
            if callable(vals):
                df[attr] = df[attr].apply(vals)
            elif np.issubdtype(df[attr].dtype, np.number):
                # this attribute is numeric; no remapping needed
                privileged_values = vals
                unprivileged_values = list(set(df[attr]).difference(vals))
            else:
                # Rows matching any privileged value become 1.0, all others
                # 0.0. Replacing the whole column (rather than writing floats
                # into the existing object/string column) keeps the result a
                # clean float64 column, and an empty `vals` simply marks
                # every row as unprivileged.
                df[attr] = df[attr].isin(vals).astype(np.float64)

            privileged_protected_attributes.append(
                np.array(privileged_values, dtype=np.float64))
            unprivileged_protected_attributes.append(
                np.array(unprivileged_values, dtype=np.float64))

        # 6. Normalize df values
        # NOTE(review): numeric protected attributes are rescaled here too,
        # while the privileged/unprivileged values collected above stay in
        # the original units. They only agree when such a column already
        # spans exactly [0, 1] -- confirm callers rely on that.
        df = pd.DataFrame(MinMaxScaler().fit_transform(df.values),
                          columns=list(df), index=df.index)

        super(RegressionDataset, self).__init__(
            df=df,
            label_names=[dep_var_name],
            protected_attribute_names=protected_attribute_names,
            privileged_protected_attributes=privileged_protected_attributes,
            unprivileged_protected_attributes=unprivileged_protected_attributes,
            instance_weights_name=instance_weights_name,
            scores_names=[],
            metadata=metadata)