# Adult (Census Income) dataset wrapper for AIF360.
import os

import pandas as pd

from aif360.datasets import StandardDataset
# Default float -> string interpretations attached as dataset metadata.
# StandardDataset encodes labels and protected attributes as floats; these
# maps record what each encoded value means.
default_mappings = {
    # income-per-year label: 1.0 is the favorable ('>50K') outcome.
    'label_maps': [{1.0: '>50K', 0.0: '<=50K'}],
    # One map per protected attribute, in ('race', 'sex') order.
    'protected_attribute_maps': [
        {1.0: 'White', 0.0: 'Non-white'},
        {1.0: 'Male', 0.0: 'Female'},
    ],
}
class AdultDataset(StandardDataset):
    """Adult Census Income Dataset.

    See :file:`aif360/data/raw/adult/README.md`.
    """

    def __init__(self, label_name='income-per-year',
                 favorable_classes=['>50K', '>50K.'],
                 protected_attribute_names=['race', 'sex'],
                 privileged_classes=[['White'], ['Male']],
                 instance_weights_name=None,
                 categorical_features=['workclass', 'education',
                     'marital-status', 'occupation', 'relationship',
                     'native-country'],
                 features_to_keep=[], features_to_drop=['fnlwgt'],
                 na_values=['?'], custom_preprocessing=None,
                 metadata=default_mappings):
        """See :obj:`StandardDataset` for a description of the arguments.

        Examples:
            The following will instantiate a dataset which uses the `fnlwgt`
            feature:

            >>> from aif360.datasets import AdultDataset
            >>> ad = AdultDataset(instance_weights_name='fnlwgt',
            ... features_to_drop=[])
            WARNING:root:Missing Data: 3620 rows removed from dataset.
            >>> not np.all(ad.instance_weights == 1.)
            True

            To instantiate a dataset which utilizes only numerical features and
            a single protected attribute, run:

            >>> single_protected = ['sex']
            >>> single_privileged = [['Male']]
            >>> ad = AdultDataset(protected_attribute_names=single_protected,
            ... privileged_classes=single_privileged,
            ... categorical_features=[],
            ... features_to_keep=['age', 'education-num'])
            >>> print(ad.feature_names)
            ['education-num', 'age', 'sex']
            >>> print(ad.label_names)
            ['income-per-year']

            Note: the `protected_attribute_names` and `label_name` are kept even
            if they are not explicitly given in `features_to_keep`.

            In some cases, it may be useful to keep track of a mapping from
            `float -> str` for protected attributes and/or labels. If our use
            case differs from the default, we can modify the mapping stored in
            `metadata`:

            >>> label_map = {1.0: '>50K', 0.0: '<=50K'}
            >>> protected_attribute_maps = [{1.0: 'Male', 0.0: 'Female'}]
            >>> ad = AdultDataset(protected_attribute_names=['sex'],
            ... categorical_features=['workclass', 'education', 'marital-status',
            ... 'occupation', 'relationship', 'native-country', 'race'],
            ... privileged_classes=[['Male']], metadata={'label_map': label_map,
            ... 'protected_attribute_maps': protected_attribute_maps})

            Note that we are now adding `race` as a `categorical_features`.
            Now this information will stay attached to the dataset and can be
            used for more descriptive visualizations.
        """
        # NOTE(review): the list defaults above are shared mutable objects;
        # nothing in this method mutates them, but StandardDataset is assumed
        # not to either — kept as-is to preserve the public interface.

        # Raw files are expected in aif360/data/raw/adult, relative to this
        # module's location on disk.
        data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                '..', 'data', 'raw', 'adult')
        train_path = os.path.join(data_dir, 'adult.data')
        test_path = os.path.join(data_dir, 'adult.test')
        # Column order as given by adult.names.
        column_names = ['age', 'workclass', 'fnlwgt', 'education',
            'education-num', 'marital-status', 'occupation', 'relationship',
            'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
            'native-country', 'income-per-year']
        try:
            # adult.data has no header row (header=None); adult.test's first
            # line is not a data row, so header=0 discards it while `names`
            # supplies the real column labels. skipinitialspace strips the
            # space after each comma; '?' entries become NaN via na_values.
            train = pd.read_csv(train_path, header=None, names=column_names,
                skipinitialspace=True, na_values=na_values)
            test = pd.read_csv(test_path, header=0, names=column_names,
                skipinitialspace=True, na_values=na_values)
        except IOError as err:
            # Deliberate best-effort UX: tell the user how to fetch the data,
            # then exit rather than raise a long traceback.
            print("IOError: {}".format(err))
            print("To use this class, please download the following files:")
            print("\n\thttps://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data")
            print("\thttps://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test")
            print("\thttps://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names")
            print("\nand place them, as-is, in the folder:")
            print("\n\t{}\n".format(os.path.abspath(os.path.join(
                os.path.abspath(__file__), '..', '..', 'data', 'raw', 'adult'))))
            import sys
            sys.exit(1)

        # Test partition first, then train, matching the original row order.
        df = pd.concat([test, train], ignore_index=True)

        super().__init__(df=df, label_name=label_name,
            favorable_classes=favorable_classes,
            protected_attribute_names=protected_attribute_names,
            privileged_classes=privileged_classes,
            instance_weights_name=instance_weights_name,
            categorical_features=categorical_features,
            features_to_keep=features_to_keep,
            features_to_drop=features_to_drop, na_values=na_values,
            custom_preprocessing=custom_preprocessing, metadata=metadata)