# Copyright 2019 Seth V. Neel, Michael J. Kearns, Aaron L. Roth, Zhiwei Steven Wu
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
"""Functions for manipulating and loading input data.""" | |
import argparse | |
import numpy as np | |
import pandas as pd | |


def setup():
    """Parse command-line arguments for the data-cleaning script."""
    parser = argparse.ArgumentParser(description='Fairness Data Cleaning')
    parser.add_argument(
        '-n',
        '--name',
        type=str,
        help='name under which to store the new datasets (Required)')
    parser.add_argument('-d',
                        '--dataset',
                        type=str,
                        help='name of the original dataset file (Required)')
    parser.add_argument(
        '-a',
        '--attributes',
        type=str,
        help=
        'name of the file indicating which attributes are protected (unprotected = 0, protected = 1, label = 2) (Required)'
    )
    parser.add_argument(
        '-c',
        '--centered',
        default=False,
        action='store_true',
        required=False,
        help='include this flag to center the input covariates')
    args = parser.parse_args()
    return [args.name, args.dataset, args.attributes, args.centered]
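
# Example invocation (a sketch; 'clean.py', 'adult.csv', and 'adult_protected.csv'
# are hypothetical file names, not provided by this module):
#
#   python clean.py -n adult -d adult.csv -a adult_protected.csv -c
#
# With these flags, setup() returns ['adult', 'adult.csv', 'adult_protected.csv', True].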


def clean_dataset(dataset, attributes, centered):
    """Clean a dataset, given the filename for the dataset and the filename for the attributes.

    Args:
    :param dataset: Filename for the dataset. The dataset should be formatted such that
        categorical variables use one-hot encoding and the label is 0/1.
    :param attributes: Filename for the attributes of the dataset. The file should list each
        column name, and under each name have 0 for an unprotected attribute, 1 for a
        protected attribute, and 2 for the label attribute.
    :param centered: boolean flag that determines whether to center the input covariates.
    :return X, X_prime, y: pandas dataframes of attributes, sensitive attributes, labels
    """
    df = pd.read_csv(dataset)
    sens_df = pd.read_csv(attributes)

    ## Get and remove label y
    y_col = [str(c) for c in sens_df.columns if sens_df[c][0] == 2]
    print('label feature: {}'.format(y_col))
    if len(y_col) > 1:
        raise ValueError('More than 1 label column used')
    if len(y_col) < 1:
        raise ValueError('No label column used')
    y = df[y_col[0]]

    ## Do not use the label in the rest of the data
    X = df.loc[:, df.columns != y_col[0]]
    X = X.loc[:, X.columns != 'Unnamed: 0']

    ## Create X_prime by selecting the protected attributes
    sens_cols = [str(c) for c in sens_df.columns if sens_df[c][0] == 1]
    print('sensitive features: {}'.format(sens_cols))
    sens_dict = {c: 1 if c in sens_cols else 0 for c in df.columns}
    X, sens_dict = one_hot_code(X, sens_dict)
    sens_names = [key for key in sens_dict.keys() if sens_dict[key] == 1]
    print('there are {} sensitive features including derivative features'.format(
        len(sens_names)))
    X_prime = X[sens_names]

    if centered:
        X = center(X)
        X_prime = center(X_prime)

    return X, X_prime, y
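
# Example usage (a sketch; 'adult.csv' and 'adult_protected.csv' are hypothetical
# file names following the format described in the docstring above):
#
#   X, X_prime, y = clean_dataset('adult.csv', 'adult_protected.csv', centered=True)
#   print(X.shape, X_prime.shape, len(y))
#
# X holds all (one-hot-coded) covariates, X_prime only the protected columns,
# and y the 0/1 labels.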


def center(X):
    """Subtract the column mean from each column of X."""
    for col in X.columns:
        X.loc[:, col] = X.loc[:, col] - np.mean(X.loc[:, col])
    return X
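
# For example (made-up numbers): a column with values [1.0, 2.0, 3.0] has mean 2.0
# and is rewritten as [-1.0, 0.0, 1.0].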


def array_to_tuple(x):
    # Cast ndarray to a hashable type; needed in get_baseline()
    x = tuple([el[0] for el in x]) if isinstance(x, np.ndarray) else x
    return x


def one_hot_code(df1, sens_dict):
    """One-hot code any string-valued columns of df1, updating sens_dict to match.

    Columns derived from a protected column inherit its protected flag.
    """
    cols = df1.columns
    for c in cols:
        if isinstance(df1[c][0], str):
            column = df1[c]
            df1 = df1.drop(columns=c)
            unique_values = list(set(column))
            n = len(unique_values)
            if n > 2:
                # More than two categories: expand into one indicator column per value
                for i in range(n):
                    col_name = '{}.{}'.format(c, i)
                    col_i = [1 if el == unique_values[i] else 0 for el in column]
                    df1[col_name] = col_i
                    sens_dict[col_name] = sens_dict[c]
                del sens_dict[c]
            else:
                # Binary category: replace with a single 0/1 column under the same name
                col_name = c
                col = [1 if el == unique_values[0] else 0 for el in column]
                df1[col_name] = col
    return df1, sens_dict
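
# Worked example (a sketch with made-up data): a three-valued column 'color' becomes
# indicator columns 'color.0', 'color.1', 'color.2' (the value-to-index mapping depends
# on set ordering), each inheriting the protected flag of 'color'; a two-valued column
# such as 'sex' is replaced by a single 0/1 column with the same name.
#
#   df = pd.DataFrame({'color': ['red', 'blue', 'green'], 'sex': ['M', 'F', 'M']})
#   df, flags = one_hot_code(df, {'color': 1, 'sex': 0})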


def extract_df_from_ds(dataset):
    """Extract pandas data frames from an aif360 dataset.

    Args:
    :param dataset: aif360 dataset
    Returns:
    :return X, X_prime, y: pandas dataframes of attributes, sensitive attributes, labels
    """
    X = pd.DataFrame(dataset.convert_to_dataframe()[0])
    # Remove the label columns
    X = X.drop(columns=dataset.label_names)
    # Select the sensitive attributes
    X_prime = X[dataset.protected_attribute_names]
    y = tuple(dataset.labels[:, 0])
    return X, X_prime, y
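
# Example usage (a sketch, assuming the aif360 package and its bundled Adult data are
# installed; AdultDataset is aif360's loader, not part of this module):
#
#   from aif360.datasets import AdultDataset
#   X, X_prime, y = extract_df_from_ds(AdultDataset())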


def get_data(dataset):
    """Given the name of a dataset, load the three datasets produced by this clean.py file.

    Helper for the main method.

    :param dataset: name of the dataset, i.e. the filename prefix used under 'dataset/'.
    :return X, X_prime, y: pandas dataframes of attributes, sensitive attributes, labels.
    """
    X = pd.read_csv('dataset/' + dataset + '_features.csv')
    X_prime = pd.read_csv('dataset/' + dataset + '_protectedfeatures.csv')
    y = pd.read_csv('dataset/' + dataset + '_labels.csv',
                    names=['index', 'label'])
    y = y['label']
    return X, X_prime, y
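

# A minimal driver sketch tying the functions above together. This is an assumption
# about intended usage, not part of the original module: it writes the cleaned frames
# to the 'dataset/' paths that get_data() reads back.
if __name__ == '__main__':
    name, dataset, attributes, centered = setup()
    X, X_prime, y = clean_dataset(dataset, attributes, centered)
    X.to_csv('dataset/' + name + '_features.csv')
    X_prime.to_csv('dataset/' + name + '_protectedfeatures.csv')
    # header=False so get_data() can read the labels back with explicit column names
    y.to_csv('dataset/' + name + '_labels.csv', header=False)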