Spaces:

erasmopurif
/

FairUP

Runtime error

App Files Files Community

FairUP / src /aif360 /datasets /adult_dataset.py

erasmopurif

First commit

d2a8669 almost 2 years ago

raw

history blame contribute delete

5.47 kB

	import os

	import pandas as pd

	from aif360.datasets import StandardDataset


	# Default ``metadata`` for AdultDataset: human-readable names for the
	# float-encoded label values and protected-attribute values.  The
	# protected-attribute maps are listed in the same order as AdultDataset's
	# default protected attributes (race first, then sex).
	default_mappings = dict(
	    label_maps=[
	        {1.0: '>50K', 0.0: '<=50K'},
	    ],
	    protected_attribute_maps=[
	        {1.0: 'White', 0.0: 'Non-white'},
	        {1.0: 'Male', 0.0: 'Female'},
	    ],
	)

	class AdultDataset(StandardDataset):
	    """Adult Census Income Dataset.

	    See :file:`aif360/data/raw/adult/README.md`.
	    """

	    # NOTE(review): the list/dict defaults below are shared between calls;
	    # this is only safe if StandardDataset never mutates them -- confirm.
	    def __init__(self, label_name='income-per-year',
	                 favorable_classes=['>50K', '>50K.'],
	                 protected_attribute_names=['race', 'sex'],
	                 privileged_classes=[['White'], ['Male']],
	                 instance_weights_name=None,
	                 categorical_features=['workclass', 'education',
	                     'marital-status', 'occupation', 'relationship',
	                     'native-country'],
	                 features_to_keep=[], features_to_drop=['fnlwgt'],
	                 na_values=['?'], custom_preprocessing=None,
	                 metadata=default_mappings):
	        """See :obj:`StandardDataset` for a description of the arguments.

	        The raw ``adult.data`` and ``adult.test`` files are read from the
	        package's ``data/raw/adult`` folder, concatenated (test rows
	        first) and handed to :obj:`StandardDataset`. If either file cannot
	        be read, download instructions are printed and the process exits
	        with status 1.

	        Examples:
	            The following will instantiate a dataset which uses the `fnlwgt`
	            feature:

	            >>> import numpy as np
	            >>> from aif360.datasets import AdultDataset
	            >>> ad = AdultDataset(instance_weights_name='fnlwgt',
	            ... features_to_drop=[])
	            WARNING:root:Missing Data: 3620 rows removed from dataset.
	            >>> not np.all(ad.instance_weights == 1.)
	            True

	            To instantiate a dataset which utilizes only numerical features and
	            a single protected attribute, run:

	            >>> single_protected = ['sex']
	            >>> single_privileged = [['Male']]
	            >>> ad = AdultDataset(protected_attribute_names=single_protected,
	            ... privileged_classes=single_privileged,
	            ... categorical_features=[],
	            ... features_to_keep=['age', 'education-num'])
	            >>> print(ad.feature_names)
	            ['education-num', 'age', 'sex']
	            >>> print(ad.label_names)
	            ['income-per-year']

	            Note: the `protected_attribute_names` and `label_name` are kept even
	            if they are not explicitly given in `features_to_keep`.

	            In some cases, it may be useful to keep track of a mapping from
	            `float -> str` for protected attributes and/or labels. If our use
	            case differs from the default, we can modify the mapping stored in
	            `metadata` (same layout as the module-level `default_mappings`:
	            each entry is a list of dicts):

	            >>> label_maps = [{1.0: '>50K', 0.0: '<=50K'}]
	            >>> protected_attribute_maps = [{1.0: 'Male', 0.0: 'Female'}]
	            >>> ad = AdultDataset(protected_attribute_names=['sex'],
	            ... categorical_features=['workclass', 'education', 'marital-status',
	            ... 'occupation', 'relationship', 'native-country', 'race'],
	            ... privileged_classes=[['Male']], metadata={'label_maps': label_maps,
	            ... 'protected_attribute_maps': protected_attribute_maps})

	            Note that we are now adding `race` as a `categorical_features`.
	            Now this information will stay attached to the dataset and can be
	            used for more descriptive visualizations.
	        """

	        # Resolve the raw-data folder once so that the paths we read from and
	        # the folder named in the help message below cannot drift apart.
	        data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
	                                '..', 'data', 'raw', 'adult')
	        train_path = os.path.join(data_dir, 'adult.data')
	        test_path = os.path.join(data_dir, 'adult.test')
	        # as given by adult.names
	        column_names = ['age', 'workclass', 'fnlwgt', 'education',
	            'education-num', 'marital-status', 'occupation', 'relationship',
	            'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
	            'native-country', 'income-per-year']
	        try:
	            # adult.data is read without a header row.
	            train = pd.read_csv(train_path, header=None, names=column_names,
	                                skipinitialspace=True, na_values=na_values)
	            # header=0: the first line of adult.test is consumed as a header
	            # and replaced by `column_names`.
	            test = pd.read_csv(test_path, header=0, names=column_names,
	                               skipinitialspace=True, na_values=na_values)
	        except IOError as err:
	            # The raw files are missing or unreadable: tell the user where to
	            # get them and where to put them, then terminate the process.
	            print("IOError: {}".format(err))
	            print("To use this class, please download the following files:")
	            print("\n\thttps://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data")
	            print("\thttps://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test")
	            print("\thttps://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names")
	            print("\nand place them, as-is, in the folder:")
	            print("\n\t{}\n".format(os.path.abspath(data_dir)))
	            import sys
	            sys.exit(1)

	        # Test rows are placed before train rows; the index is renumbered.
	        df = pd.concat([test, train], ignore_index=True)

	        super(AdultDataset, self).__init__(df=df, label_name=label_name,
	            favorable_classes=favorable_classes,
	            protected_attribute_names=protected_attribute_names,
	            privileged_classes=privileged_classes,
	            instance_weights_name=instance_weights_name,
	            categorical_features=categorical_features,
	            features_to_keep=features_to_keep,
	            features_to_drop=features_to_drop, na_values=na_values,
	            custom_preprocessing=custom_preprocessing, metadata=metadata)