import numpy as np
import scipy.special
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import check_random_state
from sklearn.utils.validation import check_is_fitted

try:
    import tensorflow.compat.v1 as tf
except ImportError as error:
    from logging import warning
    warning("{}: AdversarialDebiasing will be unavailable. To install, run:\n"
            "pip install 'aif360[AdversarialDebiasing]'".format(error))

from aif360.sklearn.utils import check_inputs, check_groups


class AdversarialDebiasing(BaseEstimator, ClassifierMixin):
    """Debiasing with adversarial learning.

    Adversarial debiasing is an in-processing technique that learns a
    classifier to maximize prediction accuracy and simultaneously reduce an
    adversary's ability to determine the protected attribute from the
    predictions [#zhang18]_. This approach leads to a fair classifier as the
    predictions cannot carry any group discrimination information that the
    adversary can exploit.

    References:
        .. [#zhang18] `B. H. Zhang, B. Lemoine, and M. Mitchell, "Mitigating
           Unwanted Biases with Adversarial Learning," AAAI/ACM Conference on
           Artificial Intelligence, Ethics, and Society, 2018.
           <https://dl.acm.org/citation.cfm?id=3278779>`_

    Attributes:
        prot_attr_ (str or list(str)): Protected attribute(s) used for
            debiasing.
        groups_ (array, shape (n_groups,)): A list of group labels known to
            the classifier.
        classes_ (array, shape (n_classes,)): A list of class labels known to
            the classifier.
        sess_ (tensorflow.Session): The TensorFlow Session used for the
            computations. Note: this can be manually closed to free up
            resources with ``self.sess_.close()``.
        classifier_logits_ (tensorflow.Tensor): Tensor containing output
            logits from the classifier.
        adversary_logits_ (tensorflow.Tensor): Tensor containing output
            logits from the adversary.
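
    Example:
        A minimal usage sketch. It assumes ``X_train``/``X_test`` are
        :class:`pandas.DataFrame` objects whose index contains the protected
        attribute, per the ``aif360.sklearn`` convention; the attribute name
        ``'sex'`` and the train/test variables here are illustrative only::

            import tensorflow.compat.v1 as tf
            from aif360.sklearn.inprocessing import AdversarialDebiasing

            tf.disable_eager_execution()
            clf = AdversarialDebiasing(prot_attr='sex', random_state=1234)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)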
""" | |
def __init__(self, prot_attr=None, scope_name='classifier', | |
adversary_loss_weight=0.1, num_epochs=50, batch_size=128, | |
classifier_num_hidden_units=200, debias=True, verbose=False, | |
random_state=None): | |
r""" | |
Args: | |
prot_attr (single label or list-like, optional): Protected | |
attribute(s) to use in the debiasing process. If more than one | |
attribute, all combinations of values (intersections) are | |
considered. Default is ``None`` meaning all protected attributes | |
from the dataset are used. | |
scope_name (str, optional): TensorFlow "variable_scope" name for the | |
entire model (classifier and adversary). | |
adversary_loss_weight (float or ``None``, optional): If ``None``, | |
this will use the suggestion from the paper: | |
:math:`\alpha = \sqrt{global\_step}` with inverse time decay on | |
the learning rate. Otherwise, it uses the provided coefficient | |
with exponential learning rate decay. | |
num_epochs (int, optional): Number of epochs for which to train. | |
batch_size (int, optional): Size of mini-batch for training. | |
classifier_num_hidden_units (int, optional): Number of hidden units | |
in the classifier. | |
debias (bool, optional): If ``False``, learn a classifier without an | |
adversary. | |
verbose (bool, optional): If ``True``, print losses every 200 steps. | |
random_state (int or numpy.RandomState, optional): Seed of pseudo- | |
random number generator for shuffling data and seeding weights. | |
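
        Note:
            With ``debias=True``, the gradient step in :meth:`fit` updates
            each classifier weight :math:`W` along the direction

            .. math:: \nabla_W L_{clf} - \text{proj}_{\nabla_W L_{adv}} \nabla_W L_{clf} - \alpha \nabla_W L_{adv}

            where :math:`L_{clf}` and :math:`L_{adv}` denote the classifier
            and adversary losses and :math:`\alpha` is
            ``adversary_loss_weight`` (or :math:`\sqrt{global\_step}` when it
            is ``None``), as suggested by Zhang et al. (2018).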
""" | |
self.prot_attr = prot_attr | |
self.scope_name = scope_name | |
self.adversary_loss_weight = adversary_loss_weight | |
self.num_epochs = num_epochs | |
self.batch_size = batch_size | |
self.classifier_num_hidden_units = classifier_num_hidden_units | |
self.debias = debias | |
self.verbose = verbose | |
self.random_state = random_state | |
def fit(self, X, y): | |
"""Train the classifier and adversary (if ``debias == True``) with the | |
given training data. | |
Args: | |
X (pandas.DataFrame): Training samples. | |
y (array-like): Training labels. | |
Returns: | |
self | |
""" | |
if tf.executing_eagerly(): | |
raise RuntimeError("AdversarialDebiasing does not work in eager " | |
"execution mode. To fix, add `tf.disable_eager_execution()`" | |
" to the top of the calling script.") | |
X, y, _ = check_inputs(X, y) | |
rng = check_random_state(self.random_state) | |
ii32 = np.iinfo(np.int32) | |
s1, s2, s3, s4 = rng.randint(ii32.min, ii32.max, size=4) | |
tf.reset_default_graph() | |
self.sess_ = tf.Session() | |
groups, self.prot_attr_ = check_groups(X, self.prot_attr) | |
le = LabelEncoder() | |
y = le.fit_transform(y) | |
self.classes_ = le.classes_ | |
# BUG: LabelEncoder converts to ndarray which removes tuple formatting | |
groups = groups.map(str) | |
groups = le.fit_transform(groups) | |
self.groups_ = le.classes_ | |
n_classes = len(self.classes_) | |
n_groups = len(self.groups_) | |
# use sigmoid for binary case | |
if n_classes == 2: | |
n_classes = 1 | |
if n_groups == 2: | |
n_groups = 1 | |
n_samples, n_features = X.shape | |
with tf.variable_scope(self.scope_name): | |
# Setup placeholders | |
self.input_ph = tf.placeholder(tf.float32, shape=[None, n_features]) | |
self.prot_attr_ph = tf.placeholder(tf.float32, shape=[None, 1]) | |
self.true_labels_ph = tf.placeholder(tf.float32, shape=[None, 1]) | |
self.keep_prob = tf.placeholder(tf.float32) | |
# Create classifier | |
with tf.variable_scope('classifier_model'): | |
W1 = tf.get_variable( | |
'W1', [n_features, self.classifier_num_hidden_units], | |
initializer=tf.initializers.glorot_uniform(seed=s1)) | |
b1 = tf.Variable(tf.zeros( | |
shape=[self.classifier_num_hidden_units]), name='b1') | |
h1 = tf.nn.relu(tf.matmul(self.input_ph, W1) + b1) | |
h1 = tf.nn.dropout(h1, rate=1-self.keep_prob, seed=s2) | |
W2 = tf.get_variable( | |
'W2', [self.classifier_num_hidden_units, n_classes], | |
initializer=tf.initializers.glorot_uniform(seed=s3)) | |
b2 = tf.Variable(tf.zeros(shape=[n_classes]), name='b2') | |
self.classifier_logits_ = tf.matmul(h1, W2) + b2 | |
# Obtain classifier loss | |
if self.classifier_logits_.shape[1] == 1: | |
clf_loss = tf.reduce_mean( | |
tf.nn.sigmoid_cross_entropy_with_logits( | |
labels=self.true_labels_ph, | |
logits=self.classifier_logits_)) | |
else: | |
clf_loss = tf.reduce_mean( | |
tf.nn.sparse_softmax_cross_entropy_with_logits( | |
labels=tf.squeeze(tf.cast(self.true_labels_ph, | |
tf.int32)), | |
logits=self.classifier_logits_)) | |
if self.debias: | |
# Create adversary | |
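                # Following Zhang et al. (2018), the adversary predicts the
                # protected attribute from s = sigmoid((1 + |c|) * logits),
                # where c is a trainable scale, together with s * y and
                # s * (1 - y), so the true label is available to it (the
                # equalized-odds setup in the paper). The fixed input width of
                # 3 below assumes the binary-label case (n_classes was reduced
                # to 1 above).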
                with tf.variable_scope("adversary_model"):
                    c = tf.get_variable('c', initializer=tf.constant(1.0))
                    s = tf.sigmoid((1 + tf.abs(c)) * self.classifier_logits_)

                    W2 = tf.get_variable('W2', [3, n_groups],
                            initializer=tf.initializers.glorot_uniform(seed=s4))
                    b2 = tf.Variable(tf.zeros(shape=[n_groups]), name='b2')

                    self.adversary_logits_ = tf.matmul(
                            tf.concat([s, s * self.true_labels_ph,
                                       s * (1. - self.true_labels_ph)], axis=1),
                            W2) + b2

                # Obtain adversary loss
                if self.adversary_logits_.shape[1] == 1:
                    adv_loss = tf.reduce_mean(
                            tf.nn.sigmoid_cross_entropy_with_logits(
                                    labels=self.prot_attr_ph,
                                    logits=self.adversary_logits_))
                else:
                    adv_loss = tf.reduce_mean(
                            tf.nn.sparse_softmax_cross_entropy_with_logits(
                                    labels=tf.squeeze(tf.cast(self.prot_attr_ph,
                                                              tf.int32)),
                                    logits=self.adversary_logits_))

            global_step = tf.Variable(0., trainable=False)
            init_learning_rate = 0.001
            if self.adversary_loss_weight is not None:
                learning_rate = tf.train.exponential_decay(init_learning_rate,
                        global_step, 1000, 0.96, staircase=True)
            else:
                learning_rate = tf.train.inverse_time_decay(init_learning_rate,
                        global_step, 1000, 0.1, staircase=True)

            # Setup optimizers
            clf_opt = tf.train.AdamOptimizer(learning_rate)
            if self.debias:
                adv_opt = tf.train.AdamOptimizer(learning_rate)

            clf_vars = [var for var in tf.trainable_variables()
                        if 'classifier_model' in var.name]
            if self.debias:
                adv_vars = [var for var in tf.trainable_variables()
                            if 'adversary_model' in var.name]
                # Compute the adversary loss gradients w.r.t. the classifier
                # parameters
                adv_grads = {var: grad for (grad, var) in
                             adv_opt.compute_gradients(adv_loss,
                                                       var_list=clf_vars)}

            normalize = lambda x: x / (tf.norm(x) + np.finfo(np.float32).tiny)
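            # When debiasing, each classifier gradient g below is replaced by
            #     g - proj_{g_adv}(g) - alpha * g_adv
            # where g_adv is the adversary-loss gradient for the same variable
            # and alpha is adversary_loss_weight (or sqrt(global_step) when it
            # is None), as in Zhang et al. (2018). Subtracting the projection
            # keeps the classifier update from moving in a direction that also
            # helps the adversary.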
            clf_grads = []

            for (grad, var) in clf_opt.compute_gradients(clf_loss,
                                                         var_list=clf_vars):
                if self.debias:
                    unit_adv_grad = normalize(adv_grads[var])
                    # proj_{adv_grad} clf_grad:
                    grad -= tf.reduce_sum(grad * unit_adv_grad) * unit_adv_grad
                    if self.adversary_loss_weight is not None:
                        grad -= self.adversary_loss_weight * adv_grads[var]
                    else:
                        grad -= tf.sqrt(global_step) * adv_grads[var]
                clf_grads.append((grad, var))

            clf_min = clf_opt.apply_gradients(clf_grads,
                                              global_step=global_step)
            if self.debias:
                with tf.control_dependencies([clf_min]):
                    adv_min = adv_opt.minimize(adv_loss, var_list=adv_vars)

        self.sess_.run(tf.global_variables_initializer())

        # Begin training
        for epoch in range(self.num_epochs):
            shuffled_ids = rng.permutation(n_samples)
            for i in range(n_samples // self.batch_size):
                batch_ids = shuffled_ids[self.batch_size * i:
                                         self.batch_size * (i+1)]
                batch_features = X.iloc[batch_ids]
                batch_labels = y[batch_ids][:, np.newaxis]
                batch_prot_attr = groups[batch_ids][:, np.newaxis]

                batch_feed_dict = {self.input_ph: batch_features,
                                   self.true_labels_ph: batch_labels,
                                   self.prot_attr_ph: batch_prot_attr,
                                   self.keep_prob: 0.8}
                if self.debias:
                    _, _, clf_loss_val, adv_loss_val = self.sess_.run(
                            [clf_min, adv_min, clf_loss, adv_loss],
                            feed_dict=batch_feed_dict)

                    if i % 200 == 0 and self.verbose:
                        print("epoch {:>3d}; iter: {:>4d}; batch classifier"
                              " loss: {:.4f}; batch adversarial loss: "
                              "{:.4f}".format(epoch, i, clf_loss_val,
                                              adv_loss_val))
                else:
                    _, clf_loss_val = self.sess_.run([clf_min, clf_loss],
                                                     feed_dict=batch_feed_dict)

                    if i % 200 == 0 and self.verbose:
                        print("epoch {:>3d}; iter: {:>4d}; batch classifier"
                              " loss: {:.4f}".format(epoch, i,
                                                     clf_loss_val))

        return self

    def decision_function(self, X):
        """Soft prediction scores.

        Args:
            X (pandas.DataFrame): Test samples.

        Returns:
            numpy.ndarray: Confidence scores per (sample, class) combination.
            In the binary case, the confidence score for ``self.classes_[1]``,
            where a score greater than zero means this class would be
            predicted.
        """
        check_is_fitted(self, ['classes_', 'input_ph', 'keep_prob',
                               'classifier_logits_'])
        n_samples = X.shape[0]
        n_classes = len(self.classes_)
        if n_classes == 2:
            n_classes = 1  # binary case: a single score column

        samples_covered = 0
        scores = np.empty((n_samples, n_classes))
        while samples_covered < n_samples:
            start = samples_covered
            end = samples_covered + self.batch_size
            if end > n_samples:
                end = n_samples

            batch_ids = np.arange(start, end)
            batch_features = X.iloc[batch_ids]

            batch_feed_dict = {self.input_ph: batch_features,
                               self.keep_prob: 1.0}

            scores[batch_ids] = self.sess_.run(self.classifier_logits_,
                                               feed_dict=batch_feed_dict)
            samples_covered += len(batch_features)

        return scores.ravel() if scores.shape[1] == 1 else scores

    def predict_proba(self, X):
        """Probability estimates.

        The returned estimates for all classes are ordered by the label of
        classes.

        Args:
            X (pandas.DataFrame): Test samples.

        Returns:
            numpy.ndarray: Returns the probability of the sample for each
            class in the model, where classes are ordered as they are in
            ``self.classes_``.
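
        Note:
            In the binary case, :meth:`decision_function` returns a single
            score per sample; a column of zeros is prepended before applying
            the softmax, so the result is equivalent to the logistic sigmoid
            of that score.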
""" | |
decision = self.decision_function(X) | |
if decision.ndim == 1: | |
decision_2d = np.c_[np.zeros_like(decision), decision] | |
else: | |
decision_2d = decision | |
return scipy.special.softmax(decision_2d, axis=1) | |
def predict(self, X): | |
"""Predict class labels for the given samples. | |
Args: | |
X (pandas.DataFrame): Test samples. | |
Returns: | |
numpy.ndarray: Predicted class label per sample. | |
""" | |
scores = self.decision_function(X) | |
if scores.ndim == 1: | |
indices = (scores > 0).astype(int) | |
else: | |
indices = scores.argmax(axis=1) | |
return self.classes_[indices] | |