FairUP / src /aif360 /datasets /meps_dataset_panel19_fy2015.py
erasmopurif's picture
First commit
d2a8669
import os
import pandas as pd
from aif360.datasets import StandardDataset
default_mappings = {
'label_maps': [{1.0: '>= 10 Visits', 0.0: '< 10 Visits'}],
'protected_attribute_maps': [{1.0: 'White', 0.0: 'Non-White'}]
}
def default_preprocessing(df):
"""
1.Create a new column, RACE that is 'White' if RACEV2X = 1 and HISPANX = 2 i.e. non Hispanic White
and 'non-White' otherwise
2. Restrict to Panel 19
3. RENAME all columns that are PANEL/ROUND SPECIFIC
4. Drop rows based on certain values of individual features that correspond to missing/unknown - generally < -1
5. Compute UTILIZATION, binarize it to 0 (< 10) and 1 (>= 10)
"""
def race(row):
if ((row['HISPANX'] == 2) and (row['RACEV2X'] == 1)): #non-Hispanic Whites are marked as WHITE; all others as NON-WHITE
return 'White'
return 'Non-White'
df['RACEV2X'] = df.apply(lambda row: race(row), axis=1)
df = df.rename(columns = {'RACEV2X' : 'RACE'})
df = df[df['PANEL'] == 19]
# RENAME COLUMNS
df = df.rename(columns = {'FTSTU53X' : 'FTSTU', 'ACTDTY53' : 'ACTDTY', 'HONRDC53' : 'HONRDC', 'RTHLTH53' : 'RTHLTH',
'MNHLTH53' : 'MNHLTH', 'CHBRON53' : 'CHBRON', 'JTPAIN53' : 'JTPAIN', 'PREGNT53' : 'PREGNT',
'WLKLIM53' : 'WLKLIM', 'ACTLIM53' : 'ACTLIM', 'SOCLIM53' : 'SOCLIM', 'COGLIM53' : 'COGLIM',
'EMPST53' : 'EMPST', 'REGION53' : 'REGION', 'MARRY53X' : 'MARRY', 'AGE53X' : 'AGE',
'POVCAT15' : 'POVCAT', 'INSCOV15' : 'INSCOV'})
df = df[df['REGION'] >= 0] # remove values -1
df = df[df['AGE'] >= 0] # remove values -1
df = df[df['MARRY'] >= 0] # remove values -1, -7, -8, -9
df = df[df['ASTHDX'] >= 0] # remove values -1, -7, -8, -9
df = df[(df[['FTSTU','ACTDTY','HONRDC','RTHLTH','MNHLTH','HIBPDX','CHDDX','ANGIDX','EDUCYR','HIDEG',
'MIDX','OHRTDX','STRKDX','EMPHDX','CHBRON','CHOLDX','CANCERDX','DIABDX',
'JTPAIN','ARTHDX','ARTHTYPE','ASTHDX','ADHDADDX','PREGNT','WLKLIM',
'ACTLIM','SOCLIM','COGLIM','DFHEAR42','DFSEE42','ADSMOK42',
'PHQ242','EMPST','POVCAT','INSCOV']] >= -1).all(1)] #for all other categorical features, remove values < -1
def utilization(row):
return row['OBTOTV15'] + row['OPTOTV15'] + row['ERTOT15'] + row['IPNGTD15'] + row['HHTOTD15']
df['TOTEXP15'] = df.apply(lambda row: utilization(row), axis=1)
lessE = df['TOTEXP15'] < 10.0
df.loc[lessE,'TOTEXP15'] = 0.0
moreE = df['TOTEXP15'] >= 10.0
df.loc[moreE,'TOTEXP15'] = 1.0
df = df.rename(columns = {'TOTEXP15' : 'UTILIZATION'})
return df
class MEPSDataset19(StandardDataset):
"""MEPS Dataset.
See :file:`aif360/data/raw/meps/README.md`.
"""
def __init__(self, label_name='UTILIZATION', favorable_classes=[1.0],
protected_attribute_names=['RACE'],
privileged_classes=[['White']],
instance_weights_name='PERWT15F',
categorical_features=['REGION','SEX','MARRY',
'FTSTU','ACTDTY','HONRDC','RTHLTH','MNHLTH','HIBPDX','CHDDX','ANGIDX',
'MIDX','OHRTDX','STRKDX','EMPHDX','CHBRON','CHOLDX','CANCERDX','DIABDX',
'JTPAIN','ARTHDX','ARTHTYPE','ASTHDX','ADHDADDX','PREGNT','WLKLIM',
'ACTLIM','SOCLIM','COGLIM','DFHEAR42','DFSEE42','ADSMOK42',
'PHQ242','EMPST','POVCAT','INSCOV'],
features_to_keep=['REGION','AGE','SEX','RACE','MARRY',
'FTSTU','ACTDTY','HONRDC','RTHLTH','MNHLTH','HIBPDX','CHDDX','ANGIDX',
'MIDX','OHRTDX','STRKDX','EMPHDX','CHBRON','CHOLDX','CANCERDX','DIABDX',
'JTPAIN','ARTHDX','ARTHTYPE','ASTHDX','ADHDADDX','PREGNT','WLKLIM',
'ACTLIM','SOCLIM','COGLIM','DFHEAR42','DFSEE42','ADSMOK42','PCS42',
'MCS42','K6SUM42','PHQ242','EMPST','POVCAT','INSCOV','UTILIZATION','PERWT15F'],
features_to_drop=[],
na_values=[], custom_preprocessing=default_preprocessing,
metadata=default_mappings):
filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'..', 'data', 'raw', 'meps', 'h181.csv')
try:
df = pd.read_csv(filepath, sep=',', na_values=na_values)
except IOError as err:
print("IOError: {}".format(err))
print("To use this class, please follow the instructions in:")
print("\n\t{}\n".format(os.path.abspath(os.path.join(
os.path.abspath(__file__), '..', '..', 'data', 'raw', 'meps', 'README.md'))))
print("\n to download and convert the 2015 data and place the final h181.csv file, as-is, in the folder:")
print("\n\t{}\n".format(os.path.abspath(os.path.join(
os.path.abspath(__file__), '..', '..', 'data', 'raw', 'meps'))))
import sys
sys.exit(1)
super(MEPSDataset19, self).__init__(df=df, label_name=label_name,
favorable_classes=favorable_classes,
protected_attribute_names=protected_attribute_names,
privileged_classes=privileged_classes,
instance_weights_name=instance_weights_name,
categorical_features=categorical_features,
features_to_keep=features_to_keep,
features_to_drop=features_to_drop, na_values=na_values,
custom_preprocessing=custom_preprocessing, metadata=metadata)