# FairUP/src/aif360/sklearn/inprocessing/adversarial_debiasing.py
import numpy as np
import scipy.special
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import check_random_state
from sklearn.utils.validation import check_is_fitted
try:
import tensorflow.compat.v1 as tf
except ImportError as error:
from logging import warning
warning("{}: AdversarialDebiasing will be unavailable. To install, run:\n"
"pip install 'aif360[AdversarialDebiasing]'".format(error))

from aif360.sklearn.utils import check_inputs, check_groups

class AdversarialDebiasing(BaseEstimator, ClassifierMixin):
    """Debiasing with adversarial learning.

    Adversarial debiasing is an in-processing technique that learns a
    classifier to maximize prediction accuracy and simultaneously reduce an
    adversary's ability to determine the protected attribute from the
    predictions [#zhang18]_. This approach leads to a fair classifier as the
    predictions cannot carry any group discrimination information that the
    adversary can exploit.

    References:
        .. [#zhang18] `B. H. Zhang, B. Lemoine, and M. Mitchell, "Mitigating
           Unwanted Biases with Adversarial Learning," AAAI/ACM Conference on
           Artificial Intelligence, Ethics, and Society, 2018.
           <https://dl.acm.org/citation.cfm?id=3278779>`_

    Attributes:
        prot_attr_ (str or list(str)): Protected attribute(s) used for
            debiasing.
        groups_ (array, shape (n_groups,)): A list of group labels known to the
            classifier.
        classes_ (array, shape (n_classes,)): A list of class labels known to
            the classifier.
        sess_ (tensorflow.Session): The TensorFlow Session used for the
            computations. Note: this can be manually closed to free up
            resources with ``self.sess_.close()``.
        classifier_logits_ (tensorflow.Tensor): Tensor containing output logits
            from the classifier.
        adversary_logits_ (tensorflow.Tensor): Tensor containing output logits
            from the adversary.
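
    Examples:
        A minimal usage sketch, assuming (per the ``aif360.sklearn``
        convention) that ``X_train`` and ``X_test`` are pandas DataFrames
        whose index contains the protected attribute ``'sex'`` (the data
        names here are illustrative only)::

            >>> import tensorflow.compat.v1 as tf
            >>> tf.disable_eager_execution()
            >>> clf = AdversarialDebiasing(prot_attr='sex', random_state=1234)
            >>> clf.fit(X_train, y_train)
            >>> y_pred = clf.predict(X_test)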
"""

    def __init__(self, prot_attr=None, scope_name='classifier',
adversary_loss_weight=0.1, num_epochs=50, batch_size=128,
classifier_num_hidden_units=200, debias=True, verbose=False,
random_state=None):
r"""
Args:
prot_attr (single label or list-like, optional): Protected
attribute(s) to use in the debiasing process. If more than one
attribute, all combinations of values (intersections) are
considered. Default is ``None`` meaning all protected attributes
from the dataset are used.
scope_name (str, optional): TensorFlow "variable_scope" name for the
entire model (classifier and adversary).
adversary_loss_weight (float or ``None``, optional): If ``None``,
this will use the suggestion from the paper:
:math:`\alpha = \sqrt{global\_step}` with inverse time decay on
the learning rate. Otherwise, it uses the provided coefficient
with exponential learning rate decay.
num_epochs (int, optional): Number of epochs for which to train.
batch_size (int, optional): Size of mini-batch for training.
classifier_num_hidden_units (int, optional): Number of hidden units
in the classifier.
debias (bool, optional): If ``False``, learn a classifier without an
adversary.
verbose (bool, optional): If ``True``, print losses every 200 steps.
random_state (int or numpy.RandomState, optional): Seed of pseudo-
random number generator for shuffling data and seeding weights.
"""
self.prot_attr = prot_attr
self.scope_name = scope_name
self.adversary_loss_weight = adversary_loss_weight
self.num_epochs = num_epochs
self.batch_size = batch_size
self.classifier_num_hidden_units = classifier_num_hidden_units
self.debias = debias
self.verbose = verbose
self.random_state = random_state

    def fit(self, X, y):
        """Train the classifier and adversary (if ``debias == True``) with the
        given training data.

        Args:
            X (pandas.DataFrame): Training samples.
            y (array-like): Training labels.

        Returns:
            self
        """
if tf.executing_eagerly():
raise RuntimeError("AdversarialDebiasing does not work in eager "
"execution mode. To fix, add `tf.disable_eager_execution()`"
" to the top of the calling script.")
X, y, _ = check_inputs(X, y)
rng = check_random_state(self.random_state)
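        # draw four int32 seeds: classifier weight inits (s1, s3), dropout
        # (s2), and the adversary weight init (s4)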
ii32 = np.iinfo(np.int32)
s1, s2, s3, s4 = rng.randint(ii32.min, ii32.max, size=4)
tf.reset_default_graph()
self.sess_ = tf.Session()
groups, self.prot_attr_ = check_groups(X, self.prot_attr)
le = LabelEncoder()
y = le.fit_transform(y)
self.classes_ = le.classes_
        # map groups to str first: LabelEncoder returns an ndarray, which
        # would otherwise drop the tuple formatting of intersectional groups
groups = groups.map(str)
groups = le.fit_transform(groups)
self.groups_ = le.classes_
n_classes = len(self.classes_)
n_groups = len(self.groups_)
        # in the binary case, use a single logit with a sigmoid instead of a
        # 2-way softmax
if n_classes == 2:
n_classes = 1
if n_groups == 2:
n_groups = 1
n_samples, n_features = X.shape
with tf.variable_scope(self.scope_name):
# Setup placeholders
self.input_ph = tf.placeholder(tf.float32, shape=[None, n_features])
self.prot_attr_ph = tf.placeholder(tf.float32, shape=[None, 1])
self.true_labels_ph = tf.placeholder(tf.float32, shape=[None, 1])
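            # dropout keep probability: fed 0.8 during training, 1.0 at
            # prediction time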
self.keep_prob = tf.placeholder(tf.float32)
# Create classifier
with tf.variable_scope('classifier_model'):
W1 = tf.get_variable(
'W1', [n_features, self.classifier_num_hidden_units],
initializer=tf.initializers.glorot_uniform(seed=s1))
b1 = tf.Variable(tf.zeros(
shape=[self.classifier_num_hidden_units]), name='b1')
h1 = tf.nn.relu(tf.matmul(self.input_ph, W1) + b1)
h1 = tf.nn.dropout(h1, rate=1-self.keep_prob, seed=s2)
W2 = tf.get_variable(
'W2', [self.classifier_num_hidden_units, n_classes],
initializer=tf.initializers.glorot_uniform(seed=s3))
b2 = tf.Variable(tf.zeros(shape=[n_classes]), name='b2')
self.classifier_logits_ = tf.matmul(h1, W2) + b2
# Obtain classifier loss
if self.classifier_logits_.shape[1] == 1:
clf_loss = tf.reduce_mean(
tf.nn.sigmoid_cross_entropy_with_logits(
labels=self.true_labels_ph,
logits=self.classifier_logits_))
else:
clf_loss = tf.reduce_mean(
tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=tf.squeeze(tf.cast(self.true_labels_ph,
tf.int32)),
logits=self.classifier_logits_))
if self.debias:
# Create adversary
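                # Following Zhang et al. (2018), the adversary sees the scaled
                # classifier output s = sigmoid((1 + |c|) * logits) together
                # with its interactions with the true label, [s, s*y, s*(1-y)],
                # so debiasing targets equalized odds; c is a learned scale.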
with tf.variable_scope("adversary_model"):
c = tf.get_variable('c', initializer=tf.constant(1.0))
s = tf.sigmoid((1 + tf.abs(c)) * self.classifier_logits_)
W2 = tf.get_variable('W2', [3, n_groups],
initializer=tf.initializers.glorot_uniform(seed=s4))
b2 = tf.Variable(tf.zeros(shape=[n_groups]), name='b2')
self.adversary_logits_ = tf.matmul(
tf.concat([s, s * self.true_labels_ph,
s * (1. - self.true_labels_ph)], axis=1),
W2) + b2
# Obtain adversary loss
if self.adversary_logits_.shape[1] == 1:
adv_loss = tf.reduce_mean(
tf.nn.sigmoid_cross_entropy_with_logits(
labels=self.prot_attr_ph,
logits=self.adversary_logits_))
else:
adv_loss = tf.reduce_mean(
tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=tf.squeeze(tf.cast(self.prot_attr_ph,
tf.int32)),
logits=self.adversary_logits_))
global_step = tf.Variable(0., trainable=False)
init_learning_rate = 0.001
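            # a fixed adversary weight pairs with exponential LR decay;
            # adversary_loss_weight=None instead uses the paper's
            # alpha = sqrt(global_step) with inverse time decay on the LR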
if self.adversary_loss_weight is not None:
learning_rate = tf.train.exponential_decay(init_learning_rate,
global_step, 1000, 0.96, staircase=True)
else:
learning_rate = tf.train.inverse_time_decay(init_learning_rate,
global_step, 1000, 0.1, staircase=True)
# Setup optimizers
clf_opt = tf.train.AdamOptimizer(learning_rate)
if self.debias:
adv_opt = tf.train.AdamOptimizer(learning_rate)
clf_vars = [var for var in tf.trainable_variables()
if 'classifier_model' in var.name]
if self.debias:
adv_vars = [var for var in tf.trainable_variables()
if 'adversary_model' in var.name]
# Compute grad wrt classifier parameters
adv_grads = {var: grad for (grad, var) in
adv_opt.compute_gradients(adv_loss, var_list=clf_vars)}
normalize = lambda x: x / (tf.norm(x) + np.finfo(np.float32).tiny)
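            # Update rule from Zhang et al. (2018): subtract from the
            # classifier gradient both its projection onto the adversary
            # gradient and alpha times the adversary gradient itself, so a
            # classifier step never helps the adversary.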
clf_grads = []
for (grad, var) in clf_opt.compute_gradients(clf_loss,
var_list=clf_vars):
if self.debias:
unit_adv_grad = normalize(adv_grads[var])
# proj_{adv_grad} clf_grad:
grad -= tf.reduce_sum(grad * unit_adv_grad) * unit_adv_grad
if self.adversary_loss_weight is not None:
grad -= self.adversary_loss_weight * adv_grads[var]
else:
grad -= tf.sqrt(global_step) * adv_grads[var]
clf_grads.append((grad, var))
clf_min = clf_opt.apply_gradients(clf_grads,
global_step=global_step)
if self.debias:
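            # run the adversary update only after the classifier update in
            # each training step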
with tf.control_dependencies([clf_min]):
adv_min = adv_opt.minimize(adv_loss, var_list=adv_vars)
self.sess_.run(tf.global_variables_initializer())
# Begin training
for epoch in range(self.num_epochs):
shuffled_ids = rng.permutation(n_samples)
for i in range(n_samples // self.batch_size):
batch_ids = shuffled_ids[self.batch_size * i:
self.batch_size * (i+1)]
batch_features = X.iloc[batch_ids]
batch_labels = y[batch_ids][:, np.newaxis]
batch_prot_attr = groups[batch_ids][:, np.newaxis]
batch_feed_dict = {self.input_ph: batch_features,
self.true_labels_ph: batch_labels,
self.prot_attr_ph: batch_prot_attr,
self.keep_prob: 0.8}
if self.debias:
_, _, clf_loss_val, adv_loss_val = self.sess_.run(
[clf_min, adv_min, clf_loss, adv_loss],
feed_dict=batch_feed_dict)
if i % 200 == 0 and self.verbose:
print("epoch {:>3d}; iter: {:>4d}; batch classifier"
" loss: {:.4f}; batch adversarial loss: "
"{:.4f}".format(epoch, i, clf_loss_val,
adv_loss_val))
else:
_, clf_loss_val = self.sess_.run([clf_min, clf_loss],
feed_dict=batch_feed_dict)
if i % 200 == 0 and self.verbose:
print("epoch {:>3d}; iter: {:>4d}; batch classifier"
" loss: {:.4f}".format(epoch, i,
clf_loss_val))
return self

    def decision_function(self, X):
        """Soft prediction scores.

        Args:
            X (pandas.DataFrame): Test samples.

        Returns:
            numpy.ndarray: Confidence scores per (sample, class) combination.
                In the binary case, confidence score for ``self.classes_[1]``
                where >0 means this class would be predicted.
        """
check_is_fitted(self, ['classes_', 'input_ph', 'keep_prob',
'classifier_logits_'])
n_samples = X.shape[0]
n_classes = len(self.classes_)
if n_classes == 2:
n_classes = 1
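        # score the samples in mini-batches to bound memory use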
samples_covered = 0
scores = np.empty((n_samples, n_classes))
while samples_covered < n_samples:
start = samples_covered
end = samples_covered + self.batch_size
if end > n_samples:
end = n_samples
batch_ids = np.arange(start, end)
batch_features = X.iloc[batch_ids]
batch_feed_dict = {self.input_ph: batch_features,
self.keep_prob: 1.0}
scores[batch_ids] = self.sess_.run(self.classifier_logits_,
feed_dict=batch_feed_dict)
samples_covered += len(batch_features)
return scores.ravel() if scores.shape[1] == 1 else scores

    def predict_proba(self, X):
        """Probability estimates.

        The returned estimates for all classes are ordered by the label of
        classes.

        Args:
            X (pandas.DataFrame): Test samples.

        Returns:
            numpy.ndarray: Returns the probability of the sample for each class
                in the model, where classes are ordered as they are in
                ``self.classes_``.
        """
decision = self.decision_function(X)
if decision.ndim == 1:
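            # binary case: softmax over [0, z] equals [1 - sigmoid(z), sigmoid(z)]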
decision_2d = np.c_[np.zeros_like(decision), decision]
else:
decision_2d = decision
return scipy.special.softmax(decision_2d, axis=1)

    def predict(self, X):
        """Predict class labels for the given samples.

        Args:
            X (pandas.DataFrame): Test samples.

        Returns:
            numpy.ndarray: Predicted class label per sample.
        """
scores = self.decision_function(X)
if scores.ndim == 1:
indices = (scores > 0).astype(int)
else:
indices = scores.argmax(axis=1)
return self.classes_[indices]