# erasmopurif's picture
# First commit
# d2a8669
# Copyright 2019 Seth V. Neel, Michael J. Kearns, Aaron L. Roth, Zhiwei Steven Wu
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
"""Functions for manipulating and loading input data."""
import argparse
import numpy as np
import pandas as pd
def setup():
parser = argparse.ArgumentParser(description='Fairness Data Cleaning')
parser.add_argument(
'-n',
'--name',
type=str,
help='name of the to store the new datasets (Required)')
parser.add_argument('-d',
'--dataset',
type=str,
help='name of the original dataset file (Required)')
parser.add_argument(
'-a',
'--attributes',
type=str,
help=
'name of the file representing which attributes are protected (unprotected = 0, protected = 1, label = 2) (Required)'
)
parser.add_argument(
'-c',
'--centered',
default=False,
action='store_true',
required=False,
help='Include this flag to determine whether data should be centered')
args = parser.parse_args()
return [args.name, args.dataset, args.attributes, args.centered]
def clean_dataset(dataset, attributes, centered):
    """Clean a dataset, given the filename for the dataset and the filename for the attributes.
    Args:
        :param dataset: Filename for dataset. The dataset should be formatted such that
            categorical variables use one-hot encoding and the label should be 0/1
        :param attributes: Filename for the attributes of the dataset. The file should have each
            column name in a list, and under this list should have 0 for an unprotected
            attribute, 1 for a protected attribute, and 2 for the attribute of the label.
        :param centered: boolean flag that determines whether to center the input covariates.
        :return X, X_prime, y: pandas dataframes of attributes, sensitive attributes, labels
    """
    df = pd.read_csv(dataset)
    sens_df = pd.read_csv(attributes)

    # The attributes file marks exactly one column with 2: the label.
    y_col = [str(col) for col in sens_df.columns if sens_df[col][0] == 2]
    print('label feature: {}'.format(y_col))
    if len(y_col) > 1:
        raise ValueError('More than 1 label column used')
    if len(y_col) < 1:
        raise ValueError('No label column used')
    label_name = y_col[0]
    y = df[label_name]

    # Features are everything except the label and the csv index artifact.
    X = df.loc[:, df.columns != label_name]
    X = X.loc[:, X.columns != 'Unnamed: 0']

    # Columns marked 1 in the attributes file are the protected ones.
    sens_cols = [str(col) for col in sens_df.columns if sens_df[col][0] == 1]
    print('sensitive features: {}'.format(sens_cols))
    sens_dict = {col: int(col in sens_cols) for col in df.columns}

    # One-hot coding may split a protected column into several derivative
    # columns; sens_dict tracks which resulting columns are protected.
    X, sens_dict = one_hot_code(X, sens_dict)
    sens_names = [name for name, flag in sens_dict.items() if flag == 1]
    print(
        'there are {} sensitive features including derivative features'.format(
            len(sens_names)))
    X_prime = X[sens_names]

    if centered:
        X = center(X)
        X_prime = center(X_prime)
    return X, X_prime, y
def center(X):
    """Return a copy of *X* with every column shifted to zero mean.

    The previous implementation mutated its argument column-by-column, which
    silently changed the caller's frame; this version is side-effect free
    (call sites reassign the result, so behavior at those sites is unchanged).

    :param X: numeric pandas DataFrame.
    :return: new DataFrame with per-column mean subtracted; *X* is untouched.
    """
    # DataFrame.mean() is per-column, so this is a vectorized equivalent of
    # the old per-column loop.
    return X - X.mean()
def array_to_tuple(x):
    """Convert a numpy column vector into a hashable tuple.

    get_baseline() needs hashable keys, and ndarrays are unhashable; anything
    that is not an ndarray is returned unchanged.

    :param x: candidate value, possibly an (n, 1) numpy array.
    :return: tuple of the first entry of each row if *x* is an ndarray,
        otherwise *x* itself.
    """
    # isinstance is the robust check; comparing __class__.__name__ to the
    # string 'ndarray' breaks for subclasses and is fragile to renaming.
    if isinstance(x, np.ndarray):
        # assumes a 2-D column vector — each row contributes its first entry
        return tuple(el[0] for el in x)
    return x
def one_hot_code(df1, sens_dict):
    """One-hot encode every string-valued column of *df1*.

    Columns with more than two distinct values are replaced by indicator
    columns named '<col>.<i>'; columns with at most two values are mapped to
    0/1 in place. *sens_dict* (column name -> protected flag) is kept in
    sync: derivative columns inherit the flag of the column they replace.

    :param df1: DataFrame whose string columns should be encoded.
    :param sens_dict: dict mapping each column of *df1* to 0/1.
    :return: (encoded DataFrame, updated sens_dict).
    """
    for c in list(df1.columns):
        # .iloc avoids requiring a 0 index label; the length guard makes an
        # empty frame a no-op instead of an IndexError.
        if len(df1) == 0 or not isinstance(df1[c].iloc[0], str):
            continue
        column = df1[c]
        # drop(c, 1) used the positional `axis` argument, removed in pandas 2.0
        df1 = df1.drop(columns=c)
        # Preserve first-appearance order: list(set(column)) is nondeterministic
        # across runs (hash randomization), so derived columns could change
        # meaning between executions.
        unique_values = list(dict.fromkeys(column))
        n = len(unique_values)
        if n > 2:
            for i in range(n):
                col_name = '{}.{}'.format(c, i)
                df1[col_name] = [
                    1 if el == unique_values[i] else 0 for el in column
                ]
                sens_dict[col_name] = sens_dict[c]
            # the original column name no longer exists in the frame
            del sens_dict[c]
        else:
            # binary column: 1 marks the first value seen, 0 the other
            df1[c] = [1 if el == unique_values[0] else 0 for el in column]
    return df1, sens_dict
def extract_df_from_ds(dataset):
    """Extract data frames from Transformer Data set
    Args:
        :param dataset: aif360 dataset
    Returns:
        :return X, X_prime, y: pandas dataframes of attributes, sensitive attributes, labels
    """
    # convert_to_dataframe() returns (frame, metadata); only the frame is used
    frame = dataset.convert_to_dataframe()[0]
    # drop the label column(s) so X holds covariates only
    X = pd.DataFrame(frame).drop(columns=dataset.label_names)
    # the protected columns form the sensitive-feature frame
    X_prime = X[dataset.protected_attribute_names]
    # first column of the label matrix, as a hashable tuple
    y = tuple(dataset.labels[:, 0])
    return X, X_prime, y
def get_data(dataset):
    # Helper for main method
    """Given name of dataset, load in the three datasets associated from the clean.py file
    :param dataset:
    :return:
    """
    # All three files share the 'dataset/<name>' stem written by the cleaner.
    features = pd.read_csv('dataset/' + dataset + '_features.csv')
    protected = pd.read_csv('dataset/' + dataset + '_protectedfeatures.csv')
    # labels csv has no header row, hence the explicit column names
    labels = pd.read_csv('dataset/' + dataset + '_labels.csv',
                         names=['index', 'label'])
    return features, protected, labels['label']