File size: 6,548 Bytes
d2a8669
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from io import BytesIO
import os
from zipfile import ZipFile

import pandas as pd
import requests

from aif360.sklearn.datasets.utils import standardize_dataset


# Default cache location: the 'data/raw' folder one level above the directory
# that contains this module.
_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_HOME_DEFAULT = os.path.join(_MODULE_DIR, '..', 'data', 'raw')

# Base URL of the MEPS public use files; file names are appended to it.
MEPS_URL = "https://meps.ahrq.gov/mepsweb/data_files/pufs"

# Terms-of-use text shown to the user before any download takes place.
PROMPT = (
    "\n"
    "By using this function you acknowledge the responsibility for reading and\n"
    "abiding by any copyright/usage rules and restrictions as stated on the MEPS web\n"
    "site (https://meps.ahrq.gov/data_stats/data_use.jsp).\n"
    "\n"
    "Continue [y/n]? > "
)

def fetch_meps(panel, *, accept_terms=None, data_home=None, cache=True,
               usecols=['REGION', 'AGE', 'SEX', 'RACE', 'MARRY', 'FTSTU',
                        'ACTDTY', 'HONRDC', 'RTHLTH', 'MNHLTH', 'HIBPDX',
                        'CHDDX', 'ANGIDX', 'MIDX', 'OHRTDX', 'STRKDX', 'EMPHDX',
                        'CHBRON', 'CHOLDX', 'CANCERDX', 'DIABDX', 'JTPAIN',
                        'ARTHDX', 'ARTHTYPE', 'ASTHDX', 'ADHDADDX', 'PREGNT',
                        'WLKLIM', 'ACTLIM', 'SOCLIM', 'COGLIM', 'DFHEAR42',
                        'DFSEE42', 'ADSMOK42', 'PCS42', 'MCS42', 'K6SUM42',
                        'PHQ242', 'EMPST', 'POVCAT', 'INSCOV'],
               dropcols=None, numeric_only=False, dropna=True):
    """Load the Medical Expenditure Panel Survey (MEPS) dataset.

    The raw file is read from the local cache when available; otherwise the
    user must accept the MEPS usage terms and the file is downloaded. The
    data are then restricted to the requested panel, panel/round-specific
    columns are renamed, categorical features are encoded, and a binary
    ``UTILIZATION`` target is derived before handing everything to
    ``standardize_dataset`` with ``RACE`` as the protected attribute and
    ``PERWT`` as the sample weight.

    Note:
        For descriptions of the dataset features, see the `data codebook
        <https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H181>`_.

    Args:
        panel ({19, 20, 21}): Panel number (only 19, 20, and 21 are currently
            supported).
        accept_terms (bool, optional): Bypass terms prompt. Any falsy value
            (including the default ``None``) triggers an interactive prompt
            when a download is required. Note: by setting this to ``True``,
            you acknowledge responsibility for reading and accepting the MEPS
            usage terms.
        data_home (string, optional): Specify another download and cache folder
            for the datasets. By default all AIF360 datasets are stored in
            'aif360/sklearn/data/raw' subfolders.
        cache (bool): Whether to cache downloaded datasets.
        usecols (single label or list-like, optional): Feature column(s) to
            keep. All others are dropped.
        dropcols (single label or list-like, optional): Feature column(s) to
            drop.
        numeric_only (bool): Drop all non-numeric feature columns.
        dropna (bool): Drop rows with NAs.

    Returns:
        namedtuple: Tuple containing X and y for the MEPS dataset accessible by
        index or name. (A sample-weight column is also passed to
        ``standardize_dataset``, so the tuple presumably carries it as well --
        confirm against that helper.)

    Raises:
        ValueError: If `panel` is not one of 19, 20, or 21.
        PermissionError: If a download is required and the usage terms are not
            accepted.
        requests.HTTPError: If the download request returns an error status.
    """
    if panel not in {19, 20, 21}:
        raise ValueError("only panels 19, 20, and 21 are currently supported.")

    # panel 21 lives in file h192; panels 19 and 20 share file h181
    fname = 'h192' if panel == 21 else 'h181'
    cache_path = os.path.join(data_home or DATA_HOME_DEFAULT, fname + '.csv')
    if cache and os.path.isfile(cache_path):
        df = pd.read_csv(cache_path)
    else:
        # skip prompt if user chooses
        accept = accept_terms or input(PROMPT)
        if accept not in ('y', True):
            raise PermissionError("Terms not agreed.")
        df = _download_meps(fname)
        if cache:
            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
            df.to_csv(cache_path, index=False)

    # restrict to correct panel; copy so the column assignments below act on
    # an independent frame rather than on a slice of the full file
    df = df[df['PANEL'] == panel].copy()
    # two-digit suffix used in year-specific column names: 16 for panel 21,
    # 15 otherwise
    yr = 16 if panel == 21 else 15

    # non-Hispanic Whites are marked as WHITE; all others as NON-WHITE
    df['RACEV2X'] = (df['HISPANX'] == 2) & (df['RACEV2X'] == 1)

    # rename all columns that are panel/round-specific
    df = df.rename(columns={
        'FTSTU53X': 'FTSTU', 'ACTDTY53': 'ACTDTY', 'HONRDC53': 'HONRDC',
        'RTHLTH53': 'RTHLTH', 'MNHLTH53': 'MNHLTH', 'CHBRON53': 'CHBRON',
        'JTPAIN53': 'JTPAIN', 'PREGNT53': 'PREGNT', 'WLKLIM53': 'WLKLIM',
        'ACTLIM53': 'ACTLIM', 'SOCLIM53': 'SOCLIM', 'COGLIM53': 'COGLIM',
        'EMPST53': 'EMPST', 'REGION53': 'REGION', 'MARRY53X': 'MARRY',
        'AGE53X': 'AGE', f'POVCAT{yr}': 'POVCAT', f'INSCOV{yr}': 'INSCOV',
        f'PERWT{yr}F': 'PERWT', 'RACEV2X': 'RACE'})

    # set invalid (negative) ages to NaN; mask() upcasts an integer column
    # instead of assigning a missing value into it in place
    df['AGE'] = df['AGE'].mask(df['AGE'] < 0)

    # NOTE: education tracking seems to have changed between panels. 'EDUYRDG'
    # was used for panel 19, 'EDUCYR' and 'HIDEG' were used for panels 20 & 21.
    # User may change usecols to include these manually.
    cat_cols = ['REGION', 'SEX', 'RACE', 'MARRY', 'FTSTU', 'ACTDTY', 'HONRDC',
                'RTHLTH', 'MNHLTH', 'HIBPDX', 'CHDDX', 'ANGIDX', 'MIDX',
                'OHRTDX', 'STRKDX', 'EMPHDX', 'CHBRON', 'CHOLDX', 'CANCERDX',
                'DIABDX', 'JTPAIN', 'ARTHDX', 'ARTHTYPE', 'ASTHDX', 'ADHDADDX',
                'PREGNT', 'WLKLIM', 'ACTLIM', 'SOCLIM', 'COGLIM', 'DFHEAR42',
                'DFSEE42', 'ADSMOK42', 'PHQ242', 'EMPST', 'POVCAT', 'INSCOV',
                'EDUCYR', 'HIDEG']
    if panel == 19:
        cat_cols += ['EDUYRDG']

    # For these columns every negative code is dropped; for all other columns
    # only codes below -1 are dropped, so -1 stays a regular category.
    drop_all_negative = {'REGION', 'MARRY', 'ASTHDX'}
    for col in cat_cols:
        df[col] = df[col].astype('category')
        thresh = 0 if col in drop_all_negative else -1
        na_cats = [c for c in df[col].cat.categories if c < thresh]
        # values belonging to a removed category become NaN
        df[col] = df[col].cat.remove_categories(na_cats)

    df['SEX'] = df['SEX'].cat.rename_categories({1: 'Male', 2: 'Female'})
    df['RACE'] = df['RACE'].cat.rename_categories({False: 'Non-White', True: 'White'})
    df['RACE'] = df['RACE'].cat.reorder_categories(['Non-White', 'White'], ordered=True)

    # Compute UTILIZATION, binarize it to 0 (< 10) and 1 (>= 10)
    cols = [f'OBTOTV{yr}', f'OPTOTV{yr}', f'ERTOT{yr}', f'IPNGTD{yr}', f'HHTOTD{yr}']
    util = df[cols].sum(axis=1)
    # Open-ended outer edges keep the bins strictly increasing regardless of
    # the observed range (data-derived edges break when every total falls on
    # one side of 10) and also tolerate an empty frame.
    df['UTILIZATION'] = pd.cut(util, [float('-inf'), 10, float('inf')],
                               right=False,
                               labels=['< 10 Visits', '>= 10 Visits'])

    return standardize_dataset(df, prot_attr='RACE', target='UTILIZATION',
                               sample_weight='PERWT', usecols=usecols,
                               dropcols=dropcols, numeric_only=numeric_only,
                               dropna=dropna)


def _download_meps(fname):
    """Download the zipped SAS transport file `fname` and parse it to a DataFrame."""
    # URLs always use forward slashes; os.path.join would use the OS separator
    response = requests.get(f"{MEPS_URL}/{fname}ssp.zip")
    # fail with a clear HTTP error instead of an opaque zip error later on
    response.raise_for_status()
    with ZipFile(BytesIO(response.content)) as zf:
        with zf.open(fname + '.ssp') as ssp:
            df = pd.read_sas(ssp, format='xport')
    # TODO: does this cause any differences?
    # reduce storage size
    return df.apply(_downcast_numeric)


def _downcast_numeric(col):
    """Downcast `col` to the smallest integer dtype; return it unchanged if not numeric."""
    try:
        return pd.to_numeric(col, downcast='integer')
    except (ValueError, TypeError):
        return col