Spaces:
Runtime error
Runtime error
import os | |
import pandas as pd | |
from aif360.sklearn.datasets.utils import standardize_dataset | |
# Default on-disk cache directory, resolved relative to this module's own
# location: '<module dir>/../data/raw'.
DATA_HOME_DEFAULT = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    '..',
    'data',
    'raw',
)

# Source CSVs in the ProPublica compas-analysis repository. The long hex path
# component fixes the revision the files are fetched from.
COMPAS_URL = (
    'https://raw.githubusercontent.com/propublica/compas-analysis/'
    'bafff5da3f2e45eca6c2d5055faad269defd135a/'
    'compas-scores-two-years.csv'
)
COMPAS_VIOLENT_URL = (
    'https://raw.githubusercontent.com/propublica/compas-analysis/'
    'bafff5da3f2e45eca6c2d5055faad269defd135a/'
    'compas-scores-two-years-violent.csv'
)
def fetch_compas(subset='all', *, data_home=None, cache=True, binary_race=False,
                 usecols=['sex', 'age', 'age_cat', 'race', 'juv_fel_count',
                          'juv_misd_count', 'juv_other_count', 'priors_count',
                          'c_charge_degree', 'c_charge_desc'],
                 dropcols=None, numeric_only=False, dropna=True):
    """Load the COMPAS Recidivism Risk Scores dataset.

    Optionally binarizes 'race' to 'Caucasian' (privileged) or
    'African-American' (unprivileged). The other protected attribute is 'sex'
    ('Male' is *unprivileged* and 'Female' is *privileged*). The outcome
    variable is 'Survived' (favorable) if the person was not accused of a crime
    within two years or 'Recidivated' (unfavorable) if they were.

    Note:
        The values for the 'sex' variable if numeric_only is ``True`` are 1 for
        'Female' and 0 for 'Male' -- opposite the convention of other datasets.

    Args:
        subset ({'all' or 'violent'}): Use the violent recidivism or full
            version of the dataset. Note: 'violent' is not a strict subset of
            'all' -- there are four samples in 'violent' which do not show up in
            'all'.
        data_home (string, optional): Specify another download and cache folder
            for the datasets. By default all AIF360 datasets are stored in
            'aif360/sklearn/data/raw' subfolders.
        cache (bool): Whether to cache downloaded datasets. When ``False`` the
            CSV is always fetched from the URL and nothing is written to disk.
        binary_race (bool, optional): Filter only White and Black defendants.
            This function only restricts the 'race' categories to
            'African-American' and 'Caucasian', which turns every other value
            into NaN; the rows themselves are presumably removed downstream
            when ``dropna`` is ``True`` -- confirm against
            ``standardize_dataset``.
        usecols (single label or list-like, optional): Feature column(s) to
            keep. All others are dropped.
        dropcols (single label or list-like, optional): Feature column(s) to
            drop.
        numeric_only (bool): Drop all non-numeric feature columns.
        dropna (bool): Drop rows with NAs.

    Returns:
        namedtuple: Tuple containing X and y for the COMPAS dataset accessible
        by index or name.

    Raises:
        ValueError: If ``subset`` is neither 'all' nor 'violent'.
    """
    if subset not in {'violent', 'all'}:
        raise ValueError("subset must be either 'violent' or 'all'; cannot be "
                         f"{subset}")

    data_url = COMPAS_VIOLENT_URL if subset == 'violent' else COMPAS_URL
    cache_path = os.path.join(data_home or DATA_HOME_DEFAULT,
                              os.path.basename(data_url))
    if cache and os.path.isfile(cache_path):
        df = pd.read_csv(cache_path, index_col='id')
    else:
        df = pd.read_csv(data_url, index_col='id')
        if cache:
            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
            df.to_csv(cache_path)

    # Perform the same preprocessing as the original analysis:
    # https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb
    score_col = 'score_text' if subset == 'all' else 'v_score_text'
    # NOTE(review): pandas.read_csv treats the literal 'N/A' as a missing value
    # by default, so the score-text comparison below likely never excludes a
    # row (NaN != 'N/A' is True). Kept as-is to preserve the resulting sample
    # set -- confirm whether an explicit notna() filter is intended.
    keep = (df['days_b_screening_arrest'].between(-30, 30)
            & (df['is_recid'] != -1)
            & (df['c_charge_degree'] != 'O')
            & (df[score_col] != 'N/A'))
    # Take an explicit copy so the column assignments below operate on an
    # independent frame rather than on a slice of the raw data (avoids
    # chained-assignment warnings).
    df = df[keep].copy()

    for col in ['sex', 'age_cat', 'race', 'c_charge_degree', 'c_charge_desc']:
        df[col] = df[col].astype('category')

    # Misdemeanor < Felony
    df['c_charge_degree'] = df['c_charge_degree'].cat.reorder_categories(
        ['M', 'F'], ordered=True)
    # 'Less than 25' < '25 - 45' < 'Greater than 45'
    df['age_cat'] = df['age_cat'].cat.reorder_categories(
        ['Less than 25', '25 - 45', 'Greater than 45'], ordered=True)

    # 'Survived' < 'Recidivated'
    cats = ['Survived', 'Recidivated']
    target = df['two_year_recid'].replace([0, 1], cats).astype('category')
    df['two_year_recid'] = target.cat.set_categories(cats, ordered=True)

    if binary_race:
        # 'African-American' < 'Caucasian'; any other race becomes NaN.
        df['race'] = df['race'].cat.set_categories(
            ['African-American', 'Caucasian'], ordered=True)

    # 'Male' < 'Female' ('sex' is already categorical from the loop above).
    df['sex'] = df['sex'].cat.reorder_categories(['Male', 'Female'],
                                                 ordered=True)

    return standardize_dataset(df, prot_attr=['sex', 'race'],
                               target='two_year_recid', usecols=usecols,
                               dropcols=dropcols, numeric_only=numeric_only,
                               dropna=dropna)