import os import pandas as pd from aif360.datasets import StandardDataset default_mappings = { 'label_maps': [{1.0: '>= 10 Visits', 0.0: '< 10 Visits'}], 'protected_attribute_maps': [{1.0: 'White', 0.0: 'Non-White'}] } def default_preprocessing(df): """ 1.Create a new column, RACE that is 'White' if RACEV2X = 1 and HISPANX = 2 i.e. non Hispanic White and 'non-White' otherwise 2. Restrict to Panel 20 3. RENAME all columns that are PANEL/ROUND SPECIFIC 4. Drop rows based on certain values of individual features that correspond to missing/unknown - generally < -1 5. Compute UTILIZATION, binarize it to 0 (< 10) and 1 (>= 10) """ def race(row): if ((row['HISPANX'] == 2) and (row['RACEV2X'] == 1)): #non-Hispanic Whites are marked as WHITE; all others as NON-WHITE return 'White' return 'Non-White' df['RACEV2X'] = df.apply(lambda row: race(row), axis=1) df = df.rename(columns = {'RACEV2X' : 'RACE'}) df = df[df['PANEL'] == 20] # RENAME COLUMNS df = df.rename(columns = {'FTSTU53X' : 'FTSTU', 'ACTDTY53' : 'ACTDTY', 'HONRDC53' : 'HONRDC', 'RTHLTH53' : 'RTHLTH', 'MNHLTH53' : 'MNHLTH', 'CHBRON53' : 'CHBRON', 'JTPAIN53' : 'JTPAIN', 'PREGNT53' : 'PREGNT', 'WLKLIM53' : 'WLKLIM', 'ACTLIM53' : 'ACTLIM', 'SOCLIM53' : 'SOCLIM', 'COGLIM53' : 'COGLIM', 'EMPST53' : 'EMPST', 'REGION53' : 'REGION', 'MARRY53X' : 'MARRY', 'AGE53X' : 'AGE', 'POVCAT15' : 'POVCAT', 'INSCOV15' : 'INSCOV'}) df = df[df['REGION'] >= 0] # remove values -1 df = df[df['AGE'] >= 0] # remove values -1 df = df[df['MARRY'] >= 0] # remove values -1, -7, -8, -9 df = df[df['ASTHDX'] >= 0] # remove values -1, -7, -8, -9 df = df[(df[['FTSTU','ACTDTY','HONRDC','RTHLTH','MNHLTH','HIBPDX','CHDDX','ANGIDX','EDUCYR','HIDEG', 'MIDX','OHRTDX','STRKDX','EMPHDX','CHBRON','CHOLDX','CANCERDX','DIABDX', 'JTPAIN','ARTHDX','ARTHTYPE','ASTHDX','ADHDADDX','PREGNT','WLKLIM', 'ACTLIM','SOCLIM','COGLIM','DFHEAR42','DFSEE42','ADSMOK42', 'PHQ242','EMPST','POVCAT','INSCOV']] >= -1).all(1)] #for all other categorical features, remove values < -1 def utilization(row): return row['OBTOTV15'] + row['OPTOTV15'] + row['ERTOT15'] + row['IPNGTD15'] + row['HHTOTD15'] df['TOTEXP15'] = df.apply(lambda row: utilization(row), axis=1) lessE = df['TOTEXP15'] < 10.0 df.loc[lessE,'TOTEXP15'] = 0.0 moreE = df['TOTEXP15'] >= 10.0 df.loc[moreE,'TOTEXP15'] = 1.0 df = df.rename(columns = {'TOTEXP15' : 'UTILIZATION'}) return df class MEPSDataset20(StandardDataset): """MEPS Dataset. See :file:`aif360/data/raw/meps/README.md`. """ def __init__(self, label_name='UTILIZATION', favorable_classes=[1.0], protected_attribute_names=['RACE'], privileged_classes=[['White']], instance_weights_name='PERWT15F', categorical_features=['REGION','SEX','MARRY', 'FTSTU','ACTDTY','HONRDC','RTHLTH','MNHLTH','HIBPDX','CHDDX','ANGIDX', 'MIDX','OHRTDX','STRKDX','EMPHDX','CHBRON','CHOLDX','CANCERDX','DIABDX', 'JTPAIN','ARTHDX','ARTHTYPE','ASTHDX','ADHDADDX','PREGNT','WLKLIM', 'ACTLIM','SOCLIM','COGLIM','DFHEAR42','DFSEE42', 'ADSMOK42', 'PHQ242', 'EMPST','POVCAT','INSCOV'], features_to_keep=['REGION','AGE','SEX','RACE','MARRY', 'FTSTU','ACTDTY','HONRDC','RTHLTH','MNHLTH','HIBPDX','CHDDX','ANGIDX', 'MIDX','OHRTDX','STRKDX','EMPHDX','CHBRON','CHOLDX','CANCERDX','DIABDX', 'JTPAIN','ARTHDX','ARTHTYPE','ASTHDX','ADHDADDX','PREGNT','WLKLIM', 'ACTLIM','SOCLIM','COGLIM','DFHEAR42','DFSEE42', 'ADSMOK42', 'PCS42', 'MCS42','K6SUM42','PHQ242','EMPST','POVCAT','INSCOV','UTILIZATION', 'PERWT15F'], features_to_drop=[], na_values=[], custom_preprocessing=default_preprocessing, metadata=default_mappings): filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'raw', 'meps', 'h181.csv') try: df = pd.read_csv(filepath, sep=',', na_values=na_values) except IOError as err: print("IOError: {}".format(err)) print("To use this class, please follow the instructions in:") print("\n\t{}\n".format(os.path.abspath(os.path.join( os.path.abspath(__file__), '..', '..', 'data', 'raw', 'meps', 'README.md')))) print("\n to download and convert the 2015 data and place the final h181.csv file, as-is, in the folder:") print("\n\t{}\n".format(os.path.abspath(os.path.join( os.path.abspath(__file__), '..', '..', 'data', 'raw', 'meps')))) import sys sys.exit(1) super(MEPSDataset20, self).__init__(df=df, label_name=label_name, favorable_classes=favorable_classes, protected_attribute_names=protected_attribute_names, privileged_classes=privileged_classes, instance_weights_name=instance_weights_name, categorical_features=categorical_features, features_to_keep=features_to_keep, features_to_drop=features_to_drop, na_values=na_values, custom_preprocessing=custom_preprocessing, metadata=metadata)