# Source: FairUP/src/aif360/algorithms/inprocessing/adversarial_debiasing.py
# Commit d2a8669 ("First commit") by erasmopurif
import numpy as np
try:
import tensorflow.compat.v1 as tf
except ImportError as error:
from logging import warning
warning("{}: AdversarialDebiasing will be unavailable. To install, run:\n"
"pip install 'aif360[AdversarialDebiasing]'".format(error))
from aif360.algorithms import Transformer
class AdversarialDebiasing(Transformer):
    """Adversarial debiasing is an in-processing technique that learns a
    classifier to maximize prediction accuracy and simultaneously reduce an
    adversary's ability to determine the protected attribute from the
    predictions [5]_. This approach leads to a fair classifier as the
    predictions cannot carry any group discrimination information that the
    adversary can exploit.

    References:
        .. [5] B. H. Zhang, B. Lemoine, and M. Mitchell, "Mitigating Unwanted
           Biases with Adversarial Learning," AAAI/ACM Conference on Artificial
           Intelligence, Ethics, and Society, 2018.
    """

    def __init__(self,
                 unprivileged_groups,
                 privileged_groups,
                 scope_name,
                 sess,
                 seed=None,
                 adversary_loss_weight=0.1,
                 num_epochs=50,
                 batch_size=128,
                 classifier_num_hidden_units=200,
                 debias=True):
        """
        Args:
            unprivileged_groups (tuple): Representation for unprivileged groups
            privileged_groups (tuple): Representation for privileged groups
            scope_name (str): scope name for the tensorflow variables
            sess (tf.Session): tensorflow session
            seed (int, optional): Seed to make `predict` repeatable.
            adversary_loss_weight (float, optional): Hyperparameter that chooses
                the strength of the adversarial loss.
            num_epochs (int, optional): Number of training epochs.
            batch_size (int, optional): Batch size.
            classifier_num_hidden_units (int, optional): Number of hidden units
                in the classifier model.
            debias (bool, optional): Learn a classifier with or without
                debiasing.

        Raises:
            ValueError: If `unprivileged_groups` or `privileged_groups` does
                not contain exactly one group.
        """
        super(AdversarialDebiasing, self).__init__(
            unprivileged_groups=unprivileged_groups,
            privileged_groups=privileged_groups)

        self.scope_name = scope_name
        self.seed = seed

        self.unprivileged_groups = unprivileged_groups
        self.privileged_groups = privileged_groups
        # Exactly one group per side is required: more than one is not
        # supported, and an empty sequence would otherwise surface as a bare
        # IndexError on the `[0]` lookup below.
        if len(self.unprivileged_groups) != 1 or len(self.privileged_groups) != 1:
            raise ValueError("Only one unprivileged_group or privileged_group supported.")
        # The protected attribute is the first key of the single
        # unprivileged group mapping.
        self.protected_attribute_name = list(self.unprivileged_groups[0].keys())[0]

        self.sess = sess
        self.adversary_loss_weight = adversary_loss_weight
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.classifier_num_hidden_units = classifier_num_hidden_units
        self.debias = debias

        # Graph handles; populated by `fit`.
        self.features_dim = None
        self.features_ph = None
        self.protected_attributes_ph = None
        self.true_labels_ph = None
        self.keep_prob = None
        self.pred_labels = None

    def _classifier_model(self, features, features_dim, keep_prob):
        """Compute the classifier predictions for the outcome variable.

        Builds a network with one ReLU hidden layer followed by dropout and a
        single-unit output layer. Reads `self.seed1`..`self.seed3`, which are
        set by `fit`.

        Args:
            features: Tensor fed as input to the first matmul.
            features_dim (int): Number of input columns (rows of `W1`).
            keep_prob: Keep probability passed to `tf.nn.dropout`.

        Returns:
            tuple: `(pred_label, pred_logit)` where `pred_label` is the
            sigmoid of `pred_logit`.
        """
        with tf.variable_scope("classifier_model"):
            W1 = tf.get_variable(
                'W1', [features_dim, self.classifier_num_hidden_units],
                initializer=tf.initializers.glorot_uniform(seed=self.seed1))
            b1 = tf.Variable(tf.zeros(shape=[self.classifier_num_hidden_units]), name='b1')

            h1 = tf.nn.relu(tf.matmul(features, W1) + b1)
            h1 = tf.nn.dropout(h1, keep_prob=keep_prob, seed=self.seed2)

            W2 = tf.get_variable(
                'W2', [self.classifier_num_hidden_units, 1],
                initializer=tf.initializers.glorot_uniform(seed=self.seed3))
            b2 = tf.Variable(tf.zeros(shape=[1]), name='b2')

            pred_logit = tf.matmul(h1, W2) + b2
            pred_label = tf.sigmoid(pred_logit)

        return pred_label, pred_logit

    def _adversary_model(self, pred_logits, true_labels):
        """Compute the adversary predictions for the protected attribute.

        The adversary is a single linear layer over three inputs derived from
        the classifier logits: `s`, `s * y` and `s * (1 - y)`, where
        `s = sigmoid((1 + |c|) * pred_logits)`, `c` is a learned scalar and
        `y` is `true_labels`. Reads `self.seed4`, which is set by `fit`.

        Args:
            pred_logits: Classifier logits.
            true_labels: Ground-truth labels tensor.

        Returns:
            tuple: `(pred_protected_attribute_label,
            pred_protected_attribute_logit)` where the label is the sigmoid
            of the logit.
        """
        with tf.variable_scope("adversary_model"):
            c = tf.get_variable('c', initializer=tf.constant(1.0))
            s = tf.sigmoid((1 + tf.abs(c)) * pred_logits)

            W2 = tf.get_variable(
                'W2', [3, 1],
                initializer=tf.initializers.glorot_uniform(seed=self.seed4))
            b2 = tf.Variable(tf.zeros(shape=[1]), name='b2')

            adversary_inputs = tf.concat(
                [s, s * true_labels, s * (1.0 - true_labels)], axis=1)
            pred_protected_attribute_logit = tf.matmul(adversary_inputs, W2) + b2
            pred_protected_attribute_label = tf.sigmoid(pred_protected_attribute_logit)

        return pred_protected_attribute_label, pred_protected_attribute_logit

    def fit(self, dataset):
        """Compute the model parameters of the fair classifier using gradient
        descent.

        Builds the classifier (and, when `debias` is set, the adversary)
        under `self.scope_name`, initializes the graph's global and local
        variables, and trains for `num_epochs` epochs with dropout keep
        probability 0.8. Progress is printed every 200 iterations.

        Args:
            dataset (BinaryLabelDataset): Dataset containing true labels.

        Returns:
            AdversarialDebiasing: Returns self.

        Raises:
            RuntimeError: If TensorFlow is executing eagerly.
        """
        if tf.executing_eagerly():
            raise RuntimeError("AdversarialDebiasing does not work in eager "
                    "execution mode. To fix, add `tf.disable_eager_execution()`"
                    " to the top of the calling script.")

        if self.seed is not None:
            np.random.seed(self.seed)
        # Derive one seed per randomized op (three initializers + dropout).
        ii32 = np.iinfo(np.int32)
        self.seed1, self.seed2, self.seed3, self.seed4 = np.random.randint(
            ii32.min, ii32.max, size=4)

        # Map the dataset labels to 0 and 1.
        temp_labels = dataset.labels.copy()
        temp_labels[(dataset.labels == dataset.favorable_label).ravel(), 0] = 1.0
        temp_labels[(dataset.labels == dataset.unfavorable_label).ravel(), 0] = 0.0

        with tf.variable_scope(self.scope_name):
            num_train_samples, self.features_dim = np.shape(dataset.features)

            # Setup placeholders
            self.features_ph = tf.placeholder(tf.float32, shape=[None, self.features_dim])
            self.protected_attributes_ph = tf.placeholder(tf.float32, shape=[None, 1])
            self.true_labels_ph = tf.placeholder(tf.float32, shape=[None, 1])
            self.keep_prob = tf.placeholder(tf.float32)

            # Obtain classifier predictions and classifier loss
            self.pred_labels, pred_logits = self._classifier_model(
                self.features_ph, self.features_dim, self.keep_prob)
            pred_labels_loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=self.true_labels_ph, logits=pred_logits))

            if self.debias:
                # Obtain adversary predictions and adversary loss
                pred_protected_attributes_labels, pred_protected_attributes_logits = \
                    self._adversary_model(pred_logits, self.true_labels_ph)
                pred_protected_attributes_loss = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(
                        labels=self.protected_attributes_ph,
                        logits=pred_protected_attributes_logits))

            # Setup optimizers with learning rates
            global_step = tf.Variable(0, trainable=False)
            starter_learning_rate = 0.001
            learning_rate = tf.train.exponential_decay(
                starter_learning_rate, global_step, 1000, 0.96, staircase=True)
            classifier_opt = tf.train.AdamOptimizer(learning_rate)
            if self.debias:
                adversary_opt = tf.train.AdamOptimizer(learning_rate)

            classifier_vars = [
                var for var in tf.trainable_variables(scope=self.scope_name)
                if 'classifier_model' in var.name]
            if self.debias:
                adversary_vars = [
                    var for var in tf.trainable_variables(scope=self.scope_name)
                    if 'adversary_model' in var.name]
                # Gradients of the adversary loss w.r.t. the classifier
                # variables, used to update the classifier parameters below.
                adversary_grads = {
                    var: grad for (grad, var) in adversary_opt.compute_gradients(
                        pred_protected_attributes_loss, var_list=classifier_vars)}

            def normalize(x):
                # The tiny constant guards against division by a zero norm.
                return x / (tf.norm(x) + np.finfo(np.float32).tiny)

            classifier_grads = []
            for (grad, var) in classifier_opt.compute_gradients(
                    pred_labels_loss, var_list=classifier_vars):
                if self.debias:
                    unit_adversary_grad = normalize(adversary_grads[var])
                    # Remove the component of the classifier gradient that
                    # lies along the adversary gradient ...
                    grad -= tf.reduce_sum(grad * unit_adversary_grad) * unit_adversary_grad
                    # ... and subtract the weighted adversary gradient.
                    grad -= self.adversary_loss_weight * adversary_grads[var]
                classifier_grads.append((grad, var))
            classifier_minimizer = classifier_opt.apply_gradients(
                classifier_grads, global_step=global_step)

            if self.debias:
                # Update adversary parameters (after the classifier step).
                with tf.control_dependencies([classifier_minimizer]):
                    adversary_minimizer = adversary_opt.minimize(
                        pred_protected_attributes_loss, var_list=adversary_vars)

            self.sess.run(tf.global_variables_initializer())
            self.sess.run(tf.local_variables_initializer())

            # Full batches per epoch; a trailing partial batch is dropped.
            # A non-empty dataset smaller than one batch would otherwise get
            # zero training steps, so it is trained as a single short batch.
            num_batches = num_train_samples // self.batch_size
            if num_batches == 0 and num_train_samples > 0:
                num_batches = 1

            # Column of the protected attribute; invariant across batches.
            protected_attribute_index = dataset.protected_attribute_names.index(
                self.protected_attribute_name)

            # Begin training
            for epoch in range(self.num_epochs):
                shuffled_ids = np.random.choice(
                    num_train_samples, num_train_samples, replace=False)
                for i in range(num_batches):
                    batch_ids = shuffled_ids[self.batch_size * i: self.batch_size * (i + 1)]
                    batch_features = dataset.features[batch_ids]
                    batch_labels = np.reshape(temp_labels[batch_ids], [-1, 1])
                    batch_protected_attributes = np.reshape(
                        dataset.protected_attributes[batch_ids][:, protected_attribute_index],
                        [-1, 1])

                    batch_feed_dict = {self.features_ph: batch_features,
                                       self.true_labels_ph: batch_labels,
                                       self.protected_attributes_ph: batch_protected_attributes,
                                       self.keep_prob: 0.8}
                    if self.debias:
                        _, _, pred_labels_loss_value, pred_protected_attributes_loss_value = \
                            self.sess.run([classifier_minimizer,
                                           adversary_minimizer,
                                           pred_labels_loss,
                                           pred_protected_attributes_loss],
                                          feed_dict=batch_feed_dict)
                        if i % 200 == 0:
                            print("epoch %d; iter: %d; batch classifier loss: %f; batch adversarial loss: %f" % (
                                epoch, i, pred_labels_loss_value,
                                pred_protected_attributes_loss_value))
                    else:
                        _, pred_labels_loss_value = self.sess.run(
                            [classifier_minimizer, pred_labels_loss],
                            feed_dict=batch_feed_dict)
                        if i % 200 == 0:
                            print("epoch %d; iter: %d; batch classifier loss: %f" % (
                                epoch, i, pred_labels_loss_value))
        return self

    def predict(self, dataset):
        """Obtain the predictions for the provided dataset using the fair
        classifier learned.

        Runs the classifier in batches of `batch_size` with dropout disabled
        (keep probability 1.0). Scores above 0.5 are mapped to the dataset's
        favorable label, all others to its unfavorable label.

        Args:
            dataset (BinaryLabelDataset): Dataset containing labels that needs
                to be transformed.

        Returns:
            dataset (BinaryLabelDataset): Transformed dataset. A deep copy of
            the input whose `scores` hold the classifier outputs and whose
            `labels` hold the thresholded predictions.

        Raises:
            RuntimeError: If called before `fit` has built the model.
        """
        if self.pred_labels is None:
            # Without this guard the placeholders are still None and the
            # session call fails with an obscure TensorFlow error.
            raise RuntimeError("AdversarialDebiasing.predict called before fit.")

        if self.seed is not None:
            np.random.seed(self.seed)

        num_test_samples, _ = np.shape(dataset.features)
        # Column of the protected attribute; invariant across batches.
        protected_attribute_index = dataset.protected_attribute_names.index(
            self.protected_attribute_name)

        pred_labels = []
        for start in range(0, num_test_samples, self.batch_size):
            end = min(start + self.batch_size, num_test_samples)
            batch_ids = np.arange(start, end)
            batch_features = dataset.features[batch_ids]
            batch_labels = np.reshape(dataset.labels[batch_ids], [-1, 1])
            batch_protected_attributes = np.reshape(
                dataset.protected_attributes[batch_ids][:, protected_attribute_index],
                [-1, 1])

            batch_feed_dict = {self.features_ph: batch_features,
                               self.true_labels_ph: batch_labels,
                               self.protected_attributes_ph: batch_protected_attributes,
                               self.keep_prob: 1.0}

            pred_labels += self.sess.run(
                self.pred_labels, feed_dict=batch_feed_dict)[:, 0].tolist()

        # Mutated, fairer dataset with new labels
        dataset_new = dataset.copy(deepcopy=True)
        scores = np.array(pred_labels, dtype=np.float64).reshape(-1, 1)
        dataset_new.scores = scores
        # Threshold at 0.5 and map straight to the dataset's original label
        # values.
        dataset_new.labels = np.where(
            scores > 0.5,
            dataset.favorable_label,
            dataset.unfavorable_label).astype(np.float64)

        return dataset_new