# erasmopurif's picture
# First commit
# d2a8669
# Copyright 2019 Seth V. Neel, Michael J. Kearns, Aaron L. Roth, Zhiwei Steven Wu
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
"""Functions for manipulating and loading input data."""
import argparse
import numpy as np
import pandas as pd
def setup():
parser = argparse.ArgumentParser(description='Fairness Data Cleaning')
parser.add_argument(
'-n',
'--name',
type=str,
help='name of the to store the new datasets (Required)')
parser.add_argument('-d',
'--dataset',
type=str,
help='name of the original dataset file (Required)')
parser.add_argument(
'-a',
'--attributes',
type=str,
help=
'name of the file representing which attributes are protected (unprotected = 0, protected = 1, label = 2) (Required)'
)
parser.add_argument(
'-c',
'--centered',
default=False,
action='store_true',
required=False,
help='Include this flag to determine whether data should be centered')
args = parser.parse_args()
return [args.name, args.dataset, args.attributes, args.centered]
def clean_dataset(dataset, attributes, centered):
    """Clean a dataset, given the filename for the dataset and the filename for the attributes.
    Args:
        :param dataset: Filename for dataset. The dataset should be formatted such that
            categorical variables use one-hot encoding and the label should be 0/1
        :param attributes: Filename for the attributes of the dataset. The file should have each
            column name in a list, and under this list should have 0 for an unprotected
            attribute, 1 for a protected attribute, and 2 for the attribute of the label.
        :param centered: boolean flag that determines whether to center the input covariates.
        :return X, X_prime, y: pandas dataframes of attributes, sensitive attributes, labels
    """
    df = pd.read_csv(dataset)
    sens_df = pd.read_csv(attributes)

    # The attributes file marks exactly one column with 2: the label.
    y_col = [str(col) for col in sens_df.columns if sens_df[col][0] == 2]
    print('label feature: {}'.format(y_col))
    if len(y_col) > 1:
        raise ValueError('More than 1 label column used')
    if len(y_col) < 1:
        raise ValueError('No label column used')
    label_name = y_col[0]
    y = df[label_name]

    # Features are everything except the label and the csv index artifact.
    X = df.loc[:, df.columns != label_name]
    X = X.loc[:, X.columns != 'Unnamed: 0']

    # Columns marked 1 in the attributes file are the protected ones.
    sens_cols = [str(col) for col in sens_df.columns if sens_df[col][0] == 1]
    print('sensitive features: {}'.format(sens_cols))
    sens_dict = {col: int(col in sens_cols) for col in df.columns}

    # One-hot coding may split a protected column into several derivative
    # columns; sens_dict tracks which resulting columns are protected.
    X, sens_dict = one_hot_code(X, sens_dict)
    sens_names = [name for name, flag in sens_dict.items() if flag == 1]
    print(
        'there are {} sensitive features including derivative features'.format(
            len(sens_names)))
    X_prime = X[sens_names]

    if centered:
        X = center(X)
        X_prime = center(X_prime)
    return X, X_prime, y
def center(X):
    """Return a copy of *X* with every column shifted to zero mean.

    The previous implementation mutated its argument column-by-column, which
    silently changed the caller's frame; this version is side-effect free
    (call sites reassign the result, so behavior at those sites is unchanged).

    :param X: numeric pandas DataFrame.
    :return: new DataFrame with per-column mean subtracted; *X* is untouched.
    """
    # DataFrame.mean() is per-column, so this is a vectorized equivalent of
    # the old per-column loop.
    return X - X.mean()
def array_to_tuple(x):
    """Convert a numpy column vector into a hashable tuple.

    get_baseline() needs hashable keys, and ndarrays are unhashable; anything
    that is not an ndarray is returned unchanged.

    :param x: candidate value, possibly an (n, 1) numpy array.
    :return: tuple of the first entry of each row if *x* is an ndarray,
        otherwise *x* itself.
    """
    # isinstance is the robust check; comparing __class__.__name__ to the
    # string 'ndarray' breaks for subclasses and is fragile to renaming.
    if isinstance(x, np.ndarray):
        # assumes a 2-D column vector — each row contributes its first entry
        return tuple(el[0] for el in x)
    return x
def one_hot_code(df1, sens_dict):
    """One-hot encode every string-valued column of *df1*.

    Columns with more than two distinct values are replaced by indicator
    columns named '<col>.<i>'; columns with at most two values are mapped to
    0/1 in place. *sens_dict* (column name -> protected flag) is kept in
    sync: derivative columns inherit the flag of the column they replace.

    :param df1: DataFrame whose string columns should be encoded.
    :param sens_dict: dict mapping each column of *df1* to 0/1.
    :return: (encoded DataFrame, updated sens_dict).
    """
    for c in list(df1.columns):
        # .iloc avoids requiring a 0 index label; the length guard makes an
        # empty frame a no-op instead of an IndexError.
        if len(df1) == 0 or not isinstance(df1[c].iloc[0], str):
            continue
        column = df1[c]
        # drop(c, 1) used the positional `axis` argument, removed in pandas 2.0
        df1 = df1.drop(columns=c)
        # Preserve first-appearance order: list(set(column)) is nondeterministic
        # across runs (hash randomization), so derived columns could change
        # meaning between executions.
        unique_values = list(dict.fromkeys(column))
        n = len(unique_values)
        if n > 2:
            for i in range(n):
                col_name = '{}.{}'.format(c, i)
                df1[col_name] = [
                    1 if el == unique_values[i] else 0 for el in column
                ]
                sens_dict[col_name] = sens_dict[c]
            # the original column name no longer exists in the frame
            del sens_dict[c]
        else:
            # binary column: 1 marks the first value seen, 0 the other
            df1[c] = [1 if el == unique_values[0] else 0 for el in column]
    return df1, sens_dict
def extract_df_from_ds(dataset):
    """Extract data frames from Transformer Data set
    Args:
        :param dataset: aif360 dataset
    Returns:
        :return X, X_prime, y: pandas dataframes of attributes, sensitive attributes, labels
    """
    # convert_to_dataframe() returns (frame, metadata); only the frame is used
    frame = dataset.convert_to_dataframe()[0]
    # drop the label column(s) so X holds covariates only
    X = pd.DataFrame(frame).drop(columns=dataset.label_names)
    # the protected columns form the sensitive-feature frame
    X_prime = X[dataset.protected_attribute_names]
    # first column of the label matrix, as a hashable tuple
    y = tuple(dataset.labels[:, 0])
    return X, X_prime, y
def get_data(dataset):
    # Helper for main method
    """Given name of dataset, load in the three datasets associated from the clean.py file
    :param dataset:
    :return:
    """
    # All three files share the 'dataset/<name>' stem written by the cleaner.
    features = pd.read_csv('dataset/' + dataset + '_features.csv')
    protected = pd.read_csv('dataset/' + dataset + '_protectedfeatures.csv')
    # labels csv has no header row, hence the explicit column names
    labels = pd.read_csv('dataset/' + dataset + '_labels.csv',
                         names=['index', 'label'])
    return features, protected, labels['label']