kritsg's picture
testing embedding html vids
e572b7a
raw
history blame
7.32 kB
"""Routines for processing data."""
import numpy as np
import os
import pandas as pd
from PIL import Image
from skimage.segmentation import slic, mark_boundaries
import torch
from torchvision import datasets, transforms
# Number of segments requested from SLIC when segmenting an image
NSEGMENTS = 20

# Integer codes for the sensitive-attribute groups and for the model outcomes
PARAMS = dict(
    protected_class=1,
    unprotected_class=0,
    positive_outcome=1,
    negative_outcome=0,
)

# Imagenet dataset name -> class index used as the label of its images
# NOTE(review): in the usual ILSVRC-2012 1000-class ordering broccoli is
# index 937 (927 is "trifle") -- confirm that 927 is intended here.
IMAGENET_LABELS = dict(
    french_bulldog=245,
    scuba_diver=983,
    corn=987,
    broccoli=927,
)
def get_and_preprocess_compas_data():
    """Handle processing of COMPAS according to: https://github.com/propublica/compas-analysis

    The raw data is read from ``./data/compas-scores-two-years.csv``. The
    function takes no arguments; class and outcome codes come from the
    module-level ``PARAMS``.

    Returns
    ----------
    dict with the keys
        "X" : np.ndarray of the processed features,
        "y" : np.ndarray of outcomes (negative outcome where ``score_text``
              is 'High', positive outcome otherwise),
        "column_names" : list of the feature names, in the column order of X,
        "cat_indices" : list of the column indices of X that are categorical.
    """
    protected_class = PARAMS['protected_class']
    unprotected_class = PARAMS['unprotected_class']
    positive_outcome = PARAMS['positive_outcome']
    negative_outcome = PARAMS['negative_outcome']

    compas_df = pd.read_csv("./data/compas-scores-two-years.csv", index_col=0)

    # With read_csv's default NA handling an "NA" cell is parsed as NaN, and
    # NaN != "NA" is True, so the string comparison alone never drops missing
    # score texts; the notna() term does.
    keep = (
        (compas_df['days_b_screening_arrest'] <= 30)
        & (compas_df['days_b_screening_arrest'] >= -30)
        & (compas_df['is_recid'] != -1)
        & (compas_df['c_charge_degree'] != "O")
        & (compas_df['score_text'] != "NA")
        & compas_df['score_text'].notna()
    )
    # Copy so the new column is not written into a slice of the raw frame
    compas_df = compas_df.loc[keep].copy()

    jail_in = pd.to_datetime(compas_df['c_jail_in'])
    jail_out = pd.to_datetime(compas_df['c_jail_out'])
    compas_df['length_of_stay'] = (jail_out - jail_in).dt.days

    X = compas_df[['age', 'two_year_recid', 'c_charge_degree', 'race', 'sex',
                   'priors_count', 'length_of_stay']].copy()

    # if person has high score give them the _negative_ model outcome
    is_high = (compas_df['score_text'] == 'High').to_numpy()
    y = np.where(is_high, negative_outcome, positive_outcome)

    sens = X.pop('race')

    # Pin the dummy dtype: newer pandas defaults to bool columns, which would
    # turn X.values into an object array instead of a numeric one.
    X = pd.get_dummies(X, dtype=np.uint8)

    # assign African-American as the protected class; a direct comparison
    # also works when no African-American row survives the filtering
    is_protected = (sens == 'African-American').to_numpy()
    X['race'] = np.where(is_protected, protected_class, unprotected_class)

    cols = list(X.columns)
    # Everything except the continuous columns is categorical. With the usual
    # layout (age, two_year_recid, priors_count, length_of_stay,
    # c_charge_degree_F/M, sex_Female/Male, race) this is [1, 4, 5, 6, 7, 8].
    continuous = ('age', 'priors_count', 'length_of_stay')
    categorical_features = [i for i, col in enumerate(cols) if col not in continuous]

    return {
        "X": X.values,
        "y": y,
        "column_names": cols,
        "cat_indices": categorical_features,
    }
def get_and_preprocess_german():
    """Handle processing of German. We use a preprocessed version of German from Ustun et. al.
    https://arxiv.org/abs/1809.06514. Thanks Berk!

    The data is read from ``./data/german_processed.csv``. The function takes
    no arguments; outcome codes come from the module-level ``PARAMS``.

    Returns:
    ----------
    dict with the keys
        "X" : np.ndarray of the features (the "GoodCustomer" and
              "PurposeOfLoan" columns are dropped, "Gender" is encoded as
              1 for "Male" and 0 otherwise),
        "y" : np.ndarray with the positive outcome where "GoodCustomer" is 1
              and the negative outcome otherwise,
        "column_names" : list of the feature names, in the column order of X,
        "cat_indices" : list of the column indices of X that are categorical.
    """
    positive_outcome = PARAMS['positive_outcome']
    negative_outcome = PARAMS['negative_outcome']

    X = pd.read_csv("./data/german_processed.csv")
    good_customer = X["GoodCustomer"]
    X = X.drop(columns=["GoodCustomer", "PurposeOfLoan"])

    X['Gender'] = (X['Gender'] == "Male").astype(np.int64)
    y = np.where(good_customer.to_numpy() == 1, positive_outcome, negative_outcome)

    # Columns 0-2 and every column from index 9 onwards are categorical
    categorical_features = [0, 1, 2] + list(range(9, X.shape[1]))

    return {
        "X": X.values,
        "y": y,
        "column_names": list(X.columns),
        "cat_indices": categorical_features,
    }
def get_PIL_transf():
    """Build the PIL image transformation: resize to 256x256, then center-crop to 224."""
    steps = [
        transforms.Resize((256, 256)),
        transforms.CenterCrop(224),
    ]
    return transforms.Compose(steps)
def load_image(path):
    """Open the image file at ``path`` and return it converted to RGB."""
    absolute_path = os.path.abspath(path)
    with open(absolute_path, 'rb') as handle, Image.open(handle) as picture:
        rgb_picture = picture.convert('RGB')
    return rgb_picture
def get_imagenet(name, get_label=True):
    """Gets the imagenet data.

    Arguments:
        name: The name of the imagenet dataset. Every file found under
              ./data/<name> (recursively, ".DS_Store" excluded) is loaded
              as an image.
        get_label: If True, every image is labelled with
                   IMAGENET_LABELS[name]; otherwise every label is -1.
    Returns:
        dict with "X" (array of the transformed images), "y" (array with one
        label per image) and "segments" (array with the SLIC segmentation of
        each image).
    Raises:
        AssertionError: if get_label is True and name is not a key of
                        IMAGENET_LABELS.
    """
    # Check the label before any image is loaded and segmented. Raised
    # explicitly (same exception type and message as an assert) so the check
    # is not stripped when Python runs with -O.
    if get_label and name not in IMAGENET_LABELS:
        raise AssertionError("Get label set to True but name not in known imagenet labels")

    # Store all the paths of the images; os.walk yields entries in arbitrary
    # order, so sort to make the sample order reproducible
    data_dir = os.path.join("./data", name)
    images_paths = sorted(
        os.path.join(dirpath, fn)
        for dirpath, _, filenames in os.walk(data_dir)
        for fn in filenames
        if fn != ".DS_Store"
    )

    # Load & do transforms for the images
    pil_transf = get_PIL_transf()
    images, segs = [], []
    for img_path in images_paths:
        transformed_image = np.array(pil_transf(load_image(img_path)))
        segments = slic(transformed_image, n_segments=NSEGMENTS, compactness=100, sigma=1)
        images.append(transformed_image)
        segs.append(segments)
    images = np.array(images)

    label = IMAGENET_LABELS[name] if get_label else -1
    y = np.ones(images.shape[0]) * label

    return {
        "X": images,
        "y": y,
        "segments": np.array(segs),
    }
def get_mnist(num):
    """Gets the MNIST test images of a certain digit and their segmentations.

    Arguments:
        num: The mnist digit to get
    Returns:
        dict with "X" (array of the matching images), "y" (array filled with
        num, one entry per image) and "segments" (array with the SLIC
        segmentation of each image).
    """
    # Get the mnist test split, downloading it to ./data/mnist if needed
    preprocessing = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    mnist_test = datasets.MNIST('./data/mnist',
                                train=False,
                                download=True,
                                transform=preprocessing)
    loader = torch.utils.data.DataLoader(mnist_test, batch_size=1, shuffle=False)

    digits, digit_segments = [], []
    for batch, label in loader:
        # batch_size is 1, so label[0] is the label of the only image in the batch
        if label[0] != num:
            continue
        # Drop the batch axis, then segment the 28x28 image
        image = np.squeeze(batch.numpy().astype('double'), axis=0)
        superpixels = slic(image.reshape(28, 28, 1), n_segments=NSEGMENTS, compactness=1, sigma=0.1)
        digits.append(image)
        digit_segments.append(superpixels.reshape(1, 28, 28))

    X = np.array(digits)
    return {
        "X": X,
        "y": np.ones(X.shape[0]) * num,
        "segments": np.array(digit_segments),
    }
def get_dataset_by_name(name, get_label=True):
    """Loads a dataset by name.

    Arguments:
        name: "compas", "german", a name containing "mnist" whose last
              character is the digit to load, or a name containing
              "imagenet" (its first 9 characters are stripped and the rest
              is passed to get_imagenet as the dataset name).
        get_label: Forwarded to get_imagenet; unused for the other datasets.
    Returns:
        The dict produced by the matching loader, with an extra "name" key
        holding the requested name.
    Raises:
        NameError: if name matches none of the known datasets.
    """
    if name == "compas":
        d = get_and_preprocess_compas_data()
    elif name == "german":
        d = get_and_preprocess_german()
    elif "mnist" in name:
        d = get_mnist(int(name[-1]))
    elif "imagenet" in name:
        d = get_imagenet(name[9:], get_label=get_label)
    else:
        # Interpolate the name into the message; passing it as a second
        # exception argument would leave the "%s" placeholder unformatted
        raise NameError("Unknown dataset %s" % (name,))
    d['name'] = name
    return d