Spaces:
Runtime error
Runtime error
File size: 6,548 Bytes
d2a8669 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
from io import BytesIO
import os
from zipfile import ZipFile
import pandas as pd
import requests
from aif360.sklearn.datasets.utils import standardize_dataset
# cache location
DATA_HOME_DEFAULT = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'..', 'data', 'raw')
MEPS_URL = "https://meps.ahrq.gov/mepsweb/data_files/pufs"
PROMPT = """
By using this function you acknowledge the responsibility for reading and
abiding by any copyright/usage rules and restrictions as stated on the MEPS web
site (https://meps.ahrq.gov/data_stats/data_use.jsp).
Continue [y/n]? > """
def fetch_meps(panel, *, accept_terms=None, data_home=None, cache=True,
usecols=['REGION', 'AGE', 'SEX', 'RACE', 'MARRY', 'FTSTU',
'ACTDTY', 'HONRDC', 'RTHLTH', 'MNHLTH', 'HIBPDX',
'CHDDX', 'ANGIDX', 'MIDX', 'OHRTDX', 'STRKDX', 'EMPHDX',
'CHBRON', 'CHOLDX', 'CANCERDX', 'DIABDX', 'JTPAIN',
'ARTHDX', 'ARTHTYPE', 'ASTHDX', 'ADHDADDX', 'PREGNT',
'WLKLIM', 'ACTLIM', 'SOCLIM', 'COGLIM', 'DFHEAR42',
'DFSEE42', 'ADSMOK42', 'PCS42', 'MCS42', 'K6SUM42',
'PHQ242', 'EMPST', 'POVCAT', 'INSCOV'],
dropcols=None, numeric_only=False, dropna=True):
"""Load the Medical Expenditure Panel Survey (MEPS) dataset.
Note:
For descriptions of the dataset features, see the `data codebook
<https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H181>`_.
Args:
panel ({19, 20, 21}): Panel number (only 19, 20, and 21 are currently
supported).
accept_terms (bool, optional): Bypass terms prompt. Note: by setting
this to ``True``, you acknowledge responsibility for reading and
accepting the MEPS usage terms.
data_home (string, optional): Specify another download and cache folder
for the datasets. By default all AIF360 datasets are stored in
'aif360/sklearn/data/raw' subfolders.
cache (bool): Whether to cache downloaded datasets.
usecols (single label or list-like, optional): Feature column(s) to
keep. All others are dropped.
dropcols (single label or list-like, optional): Feature column(s) to
drop.
numeric_only (bool): Drop all non-numeric feature columns.
dropna (bool): Drop rows with NAs.
Returns:
namedtuple: Tuple containing X and y for the MEPS dataset accessible by
index or name.
"""
if panel not in {19, 20, 21}:
raise ValueError("only panels 19, 20, and 21 are currently supported.")
fname = 'h192' if panel == 21 else 'h181'
cache_path = os.path.join(data_home or DATA_HOME_DEFAULT, fname + '.csv')
if cache and os.path.isfile(cache_path):
df = pd.read_csv(cache_path)
else:
# skip prompt if user chooses
accept = accept_terms or input(PROMPT)
if accept != 'y' and accept != True:
raise PermissionError("Terms not agreed.")
rawz = requests.get(os.path.join(MEPS_URL, fname + 'ssp.zip')).content
with ZipFile(BytesIO(rawz)) as zf:
with zf.open(fname + '.ssp') as ssp:
df = pd.read_sas(ssp, format='xport')
# TODO: does this cause any differences?
# reduce storage size
df = df.apply(pd.to_numeric, errors='ignore', downcast='integer')
if cache:
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
df.to_csv(cache_path, index=None)
# restrict to correct panel
df = df[df['PANEL'] == panel]
# change all 15s to 16s if panel == 21
yr = 16 if panel == 21 else 15
# non-Hispanic Whites are marked as WHITE; all others as NON-WHITE
df['RACEV2X'] = (df['HISPANX'] == 2) & (df['RACEV2X'] == 1)
# rename all columns that are panel/round-specific
df = df.rename(columns={
'FTSTU53X': 'FTSTU', 'ACTDTY53': 'ACTDTY', 'HONRDC53': 'HONRDC',
'RTHLTH53': 'RTHLTH', 'MNHLTH53': 'MNHLTH', 'CHBRON53': 'CHBRON',
'JTPAIN53': 'JTPAIN', 'PREGNT53': 'PREGNT', 'WLKLIM53': 'WLKLIM',
'ACTLIM53': 'ACTLIM', 'SOCLIM53': 'SOCLIM', 'COGLIM53': 'COGLIM',
'EMPST53': 'EMPST', 'REGION53': 'REGION', 'MARRY53X': 'MARRY',
'AGE53X': 'AGE', f'POVCAT{yr}': 'POVCAT', f'INSCOV{yr}': 'INSCOV',
f'PERWT{yr}F': 'PERWT', 'RACEV2X': 'RACE'})
df.loc[df.AGE < 0, 'AGE'] = None # set invalid ages to NaN
cat_cols = ['REGION', 'SEX', 'RACE', 'MARRY', 'FTSTU', 'ACTDTY', 'HONRDC',
'RTHLTH', 'MNHLTH', 'HIBPDX', 'CHDDX', 'ANGIDX', 'MIDX',
'OHRTDX', 'STRKDX', 'EMPHDX', 'CHBRON', 'CHOLDX', 'CANCERDX',
'DIABDX', 'JTPAIN', 'ARTHDX', 'ARTHTYPE', 'ASTHDX', 'ADHDADDX',
'PREGNT', 'WLKLIM', 'ACTLIM', 'SOCLIM', 'COGLIM', 'DFHEAR42',
'DFSEE42', 'ADSMOK42', 'PHQ242', 'EMPST', 'POVCAT', 'INSCOV',
# NOTE: education tracking seems to have changed between panels. 'EDUYRDG'
# was used for panel 19, 'EDUCYR' and 'HIDEG' were used for panels 20 & 21.
# User may change usecols to include these manually.
'EDUCYR', 'HIDEG']
if panel == 19:
cat_cols += ['EDUYRDG']
for col in cat_cols:
df[col] = df[col].astype('category')
thresh = 0 if col in ['REGION', 'MARRY', 'ASTHDX'] else -1
na_cats = [c for c in df[col].cat.categories if c < thresh]
df[col] = df[col].cat.remove_categories(na_cats) # set NaN cols to NaN
df['SEX'] = df['SEX'].cat.rename_categories({1: 'Male', 2: 'Female'})
df['RACE'] = df['RACE'].cat.rename_categories({False: 'Non-White', True: 'White'})
df['RACE'] = df['RACE'].cat.reorder_categories(['Non-White', 'White'], ordered=True)
# Compute UTILIZATION, binarize it to 0 (< 10) and 1 (>= 10)
cols = [f'OBTOTV{yr}', f'OPTOTV{yr}', f'ERTOT{yr}', f'IPNGTD{yr}', f'HHTOTD{yr}']
util = df[cols].sum(axis=1)
df['UTILIZATION'] = pd.cut(util, [min(util)-1, 10, max(util)+1], right=False,
labels=['< 10 Visits', '>= 10 Visits'])#['low', 'high'])
return standardize_dataset(df, prot_attr='RACE', target='UTILIZATION',
sample_weight='PERWT', usecols=usecols,
dropcols=dropcols, numeric_only=numeric_only,
dropna=dropna)
|