# FairUP/src/aif360/sklearn/inprocessing/grid_search_reduction.py
"""
The code for GridSearchReduction wraps the source class
fairlearn.reductions.GridSearch
available in the https://github.com/fairlearn/fairlearn library
licensed under the MIT License, Copyright Microsoft Corporation
"""
try:
import fairlearn.reductions as red
except ImportError as error:
from logging import warning
warning("{}: GridSearchReduction will be unavailable. To install, run:\n"
"pip install 'aif360[Reductions]'".format(error))
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.preprocessing import LabelEncoder
class GridSearchReduction(BaseEstimator, ClassifierMixin):
"""Grid search reduction for fair classification or regression.
Grid search is an in-processing technique that can be used for fair
classification or fair regression. For classification it reduces fair
classification to a sequence of cost-sensitive classification problems,
returning the deterministic classifier with the lowest empirical error
subject to fair classification constraints [#agarwal18]_ among the
candidates searched. For regression it uses the same principle to return a
deterministic regressor with the lowest empirical error subject to the
constraint of bounded group loss [#agarwal19]_.
References:
.. [#agarwal18] `A. Agarwal, A. Beygelzimer, M. Dudik, J. Langford, and
H. Wallach, "A Reductions Approach to Fair Classification,"
International Conference on Machine Learning, 2018.
<https://arxiv.org/abs/1803.02453>`_
.. [#agarwal19] `A. Agarwal, M. Dudik, and Z. Wu, "Fair Regression:
Quantitative Definitions and Reduction-based Algorithms,"
International Conference on Machine Learning, 2019.
<https://arxiv.org/abs/1905.12843>`_
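
    Example:
        A minimal usage sketch (the data splits, estimator choice, and the
        ``'sex'`` column name are illustrative assumptions, not part of
        this module)::

            from sklearn.linear_model import LogisticRegression

            # X_train/X_test: pandas DataFrames containing a 'sex' column
            gsr = GridSearchReduction(prot_attr='sex',
                                      estimator=LogisticRegression(),
                                      constraints='EqualizedOdds',
                                      grid_size=20)
            gsr.fit(X_train, y_train)
            y_pred = gsr.predict(X_test)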
"""
def __init__(self,
prot_attr,
estimator,
constraints,
constraint_weight=0.5,
grid_size=10,
grid_limit=2.0,
grid=None,
drop_prot_attr=True,
loss="ZeroOne",
min_val=None,
max_val=None
):
"""
Args:
prot_attr: String or array-like column indices or column names
of protected attributes.
estimator: An estimator implementing methods ``fit(X, y,
sample_weight)`` and ``predict(X)``, where ``X`` is the matrix
of features, ``y`` is the vector of labels, and
                ``sample_weight`` is a vector of weights. For classification,
                labels ``y`` and the predictions returned by ``predict(X)``
                are either 0 or 1 -- e.g. scikit-learn
                classifiers/regressors.
constraints (str or fairlearn.reductions.Moment): If string, keyword
denoting the :class:`fairlearn.reductions.Moment` object
defining the disparity constraints -- e.g., "DemographicParity"
or "EqualizedOdds". For a full list of possible options see
`self.model.moments`. Otherwise, provide the desired
:class:`~fairlearn.reductions.Moment` object defining the
disparity constraints.
constraint_weight: When the ``selection_rule`` is
"tradeoff_optimization" (default, no other option currently)
this float specifies the relative weight put on the constraint
violation when selecting the best model. The weight placed on
the error rate will be ``1-constraint_weight``.
grid_size (int): The number of Lagrange multipliers to generate in
the grid.
grid_limit (float): The largest Lagrange multiplier to generate. The
grid will contain values distributed between ``-grid_limit`` and
``grid_limit`` by default.
grid (pandas.DataFrame): Instead of supplying a size and limit for
the grid, users may specify the exact set of Lagrange
multipliers they desire using this argument in a DataFrame.
drop_prot_attr (bool): Flag indicating whether to drop protected
attributes from training data.
            loss (str): String identifying loss function for constraints.
                Options include "ZeroOne", "Square", and "Absolute".
            min_val: Loss function parameter for "Square" and "Absolute"
                losses, typically the minimum of the range of y values.
            max_val: Loss function parameter for "Square" and "Absolute"
                losses, typically the maximum of the range of y values.
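
        For fair regression, ``constraints`` may be a pre-built Moment
        instead of a keyword -- a sketch, assuming fairlearn's
        ``BoundedGroupLoss`` accepts an ``upper_bound`` keyword::

            constraints = red.BoundedGroupLoss(red.SquareLoss(0, 1),
                                               upper_bound=0.1)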
"""
self.prot_attr = prot_attr
self.estimator = estimator
self.constraints = constraints
self.constraint_weight = constraint_weight
self.grid_size = grid_size
self.grid_limit = grid_limit
self.grid = grid
self.drop_prot_attr = drop_prot_attr
self.loss = loss
self.min_val = min_val
self.max_val = max_val
def fit(self, X, y):
"""Train a less biased classifier or regressor with the given training
data.
Args:
X (pandas.DataFrame): Training samples.
y (array-like): Training output.
Returns:
self
"""
self.estimator_ = clone(self.estimator)
moments = {
"DemographicParity": red.DemographicParity,
"EqualizedOdds": red.EqualizedOdds,
"TruePositiveRateParity": red.TruePositiveRateParity,
"FalsePositiveRateParity": red.FalsePositiveRateParity,
"ErrorRateParity": red.ErrorRateParity,
"BoundedGroupLoss": red.BoundedGroupLoss,
}
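        # A string constraint is resolved to the corresponding fairlearn
        # Moment class above; a Moment instance is used as-is.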
if isinstance(self.constraints, str):
if self.constraints not in moments:
raise ValueError(f"Constraint not recognized: {self.constraints}")
if self.constraints == "BoundedGroupLoss":
losses = {
"ZeroOne": red.ZeroOneLoss,
"Square": red.SquareLoss,
"Absolute": red.AbsoluteLoss
}
                if self.loss not in losses:
                    raise ValueError(f"Loss not recognized: {self.loss}")
                if self.loss == "ZeroOne":
                    # ZeroOneLoss takes no range arguments
                    self.loss_ = losses[self.loss]()
                else:
                    self.loss_ = losses[self.loss](self.min_val, self.max_val)
self.moment_ = moments[self.constraints](loss=self.loss_)
else:
self.moment_ = moments[self.constraints]()
elif isinstance(self.constraints, red.Moment):
self.moment_ = self.constraints
else:
raise ValueError("constraints must be a string or Moment object.")
self.model_ = red.GridSearch(estimator=self.estimator_,
constraints=self.moment_,
constraint_weight=self.constraint_weight,
grid_size=self.grid_size, grid_limit=self.grid_limit,
grid=self.grid)
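        # The protected attribute column(s) are passed to GridSearch
        # separately as sensitive features and, optionally, dropped from the
        # training features.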
A = X[self.prot_attr]
if self.drop_prot_attr:
X = X.drop(self.prot_attr, axis=1)
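        # GridSearch expects numeric labels for classification moments, so
        # encode y and record the original class labels in self.classes_.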
if isinstance(self.model_.constraints, red.ClassificationMoment):
le = LabelEncoder()
y = le.fit_transform(y)
self.classes_ = le.classes_
self.model_.fit(X, y, sensitive_features=A)
return self
def predict(self, X):
"""Predict output for the given samples.
Args:
X (pandas.DataFrame): Test samples.
Returns:
numpy.ndarray: Predicted output per sample.
"""
if self.drop_prot_attr:
X = X.drop(self.prot_attr, axis=1)
return self.model_.predict(X)
def predict_proba(self, X):
"""Probability estimates.
        The returned estimates for all classes are ordered by the label of
        classes.
Args:
X (pandas.DataFrame): Test samples.
Returns:
numpy.ndarray: returns the probability of the sample for each class
in the model, where classes are ordered as they are in
``self.classes_``.
"""
if self.drop_prot_attr:
X = X.drop(self.prot_attr, axis=1)
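        # Probability estimates are only available for classification
        # moments; regression moments (e.g. BoundedGroupLoss) have none.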
if isinstance(self.model_.constraints, red.ClassificationMoment):
return self.model_.predict_proba(X)
raise NotImplementedError("Underlying model does not support "
"predict_proba")