"""
The code for GridSearchReduction wraps the source class
fairlearn.reductions.GridSearch
available in the https://github.com/fairlearn/fairlearn library
licensed under the MIT License, Copyright Microsoft Corporation
"""
try:
    import fairlearn.reductions as red
except ImportError as error:
    from logging import warning
    warning("{}: GridSearchReduction will be unavailable. To install, run:\n"
            "pip install 'aif360[Reductions]'".format(error))
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.preprocessing import LabelEncoder


class GridSearchReduction(BaseEstimator, ClassifierMixin):
"""Grid search reduction for fair classification or regression.
Grid search is an in-processing technique that can be used for fair
classification or fair regression. For classification it reduces fair
classification to a sequence of cost-sensitive classification problems,
returning the deterministic classifier with the lowest empirical error
subject to fair classification constraints [#agarwal18]_ among the
candidates searched. For regression it uses the same priniciple to return a
deterministic regressor with the lowest empirical error subject to the
constraint of bounded group loss [#agarwal19]_.
References:
.. [#agarwal18] `A. Agarwal, A. Beygelzimer, M. Dudik, J. Langford, and
H. Wallach, "A Reductions Approach to Fair Classification,"
International Conference on Machine Learning, 2018.
<https://arxiv.org/abs/1803.02453>`_
.. [#agarwal19] `A. Agarwal, M. Dudik, and Z. Wu, "Fair Regression:
Quantitative Definitions and Reduction-based Algorithms,"
International Conference on Machine Learning, 2019.
<https://arxiv.org/abs/1905.12843>`_
"""
    def __init__(self,
                 prot_attr,
                 estimator,
                 constraints,
                 constraint_weight=0.5,
                 grid_size=10,
                 grid_limit=2.0,
                 grid=None,
                 drop_prot_attr=True,
                 loss="ZeroOne",
                 min_val=None,
                 max_val=None):
"""
Args:
prot_attr: String or array-like column indices or column names
of protected attributes.
estimator: An estimator implementing methods ``fit(X, y,
sample_weight)`` and ``predict(X)``, where ``X`` is the matrix
of features, ``y`` is the vector of labels, and
``sample_weight`` is a vector of weights; labels ``y`` and
predictions returned by ``predict(X)`` are either 0 or 1 -- e.g.
scikit-learn classifiers/regressors.
constraints (str or fairlearn.reductions.Moment): If string, keyword
denoting the :class:`fairlearn.reductions.Moment` object
defining the disparity constraints -- e.g., "DemographicParity"
or "EqualizedOdds". For a full list of possible options see
`self.model.moments`. Otherwise, provide the desired
:class:`~fairlearn.reductions.Moment` object defining the
disparity constraints.
constraint_weight: When the ``selection_rule`` is
"tradeoff_optimization" (default, no other option currently)
this float specifies the relative weight put on the constraint
violation when selecting the best model. The weight placed on
the error rate will be ``1-constraint_weight``.
grid_size (int): The number of Lagrange multipliers to generate in
the grid.
grid_limit (float): The largest Lagrange multiplier to generate. The
grid will contain values distributed between ``-grid_limit`` and
``grid_limit`` by default.
grid (pandas.DataFrame): Instead of supplying a size and limit for
the grid, users may specify the exact set of Lagrange
multipliers they desire using this argument in a DataFrame.
drop_prot_attr (bool): Flag indicating whether to drop protected
attributes from training data.
loss (str): String identifying loss function for constraints.
Options include "ZeroOne", "Square", and "Absolute."
min_val: Loss function parameter for "Square" and "Absolute,"
typically the minimum of the range of y values.
max_val: Loss function parameter for "Square" and "Absolute,"
typically the maximum of the range of y values.
"""
        self.prot_attr = prot_attr
        self.estimator = estimator
        self.constraints = constraints
        self.constraint_weight = constraint_weight
        self.grid_size = grid_size
        self.grid_limit = grid_limit
        self.grid = grid
        self.drop_prot_attr = drop_prot_attr
        self.loss = loss
        self.min_val = min_val
        self.max_val = max_val

    def fit(self, X, y):
        """Train a less biased classifier or regressor with the given
        training data.

        Args:
            X (pandas.DataFrame): Training samples.
            y (array-like): Training output.

        Returns:
            self
        """
        self.estimator_ = clone(self.estimator)
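
        # Map constraint keywords to their fairlearn Moment classes;
        # "BoundedGroupLoss" is the regression constraint, the rest are
        # classification constraints.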
        moments = {
            "DemographicParity": red.DemographicParity,
            "EqualizedOdds": red.EqualizedOdds,
            "TruePositiveRateParity": red.TruePositiveRateParity,
            "FalsePositiveRateParity": red.FalsePositiveRateParity,
            "ErrorRateParity": red.ErrorRateParity,
            "BoundedGroupLoss": red.BoundedGroupLoss,
        }
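
        # A string selects a Moment from the table above; "BoundedGroupLoss"
        # additionally needs a loss object built from ``loss``/``min_val``/
        # ``max_val``. A ready-made Moment instance is used as-is.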
        if isinstance(self.constraints, str):
            if self.constraints not in moments:
                raise ValueError(
                        f"Constraint not recognized: {self.constraints}")
            if self.constraints == "BoundedGroupLoss":
                losses = {
                    "ZeroOne": red.ZeroOneLoss,
                    "Square": red.SquareLoss,
                    "Absolute": red.AbsoluteLoss,
                }
                if self.loss == "ZeroOne":
                    self.loss_ = losses[self.loss]()
                else:
                    self.loss_ = losses[self.loss](self.min_val,
                                                   self.max_val)
                self.moment_ = moments[self.constraints](loss=self.loss_)
            else:
                self.moment_ = moments[self.constraints]()
        elif isinstance(self.constraints, red.Moment):
            self.moment_ = self.constraints
        else:
            raise ValueError("constraints must be a string or Moment object.")

        self.model_ = red.GridSearch(estimator=self.estimator_,
                                     constraints=self.moment_,
                                     constraint_weight=self.constraint_weight,
                                     grid_size=self.grid_size,
                                     grid_limit=self.grid_limit,
                                     grid=self.grid)
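
        # Split off the protected attribute(s) to pass to fairlearn as
        # sensitive_features, optionally dropping them from the features
        # the underlying estimator sees.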
        A = X[self.prot_attr]

        if self.drop_prot_attr:
            X = X.drop(self.prot_attr, axis=1)
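
        # fairlearn's classification moments expect 0/1 labels, so encode
        # the target and remember the original classes for predict_proba.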
        if isinstance(self.model_.constraints, red.ClassificationMoment):
            le = LabelEncoder()
            y = le.fit_transform(y)
            self.classes_ = le.classes_

        self.model_.fit(X, y, sensitive_features=A)

        return self

    def predict(self, X):
        """Predict output for the given samples.

        Args:
            X (pandas.DataFrame): Test samples.

        Returns:
            numpy.ndarray: Predicted output per sample.
        """
        if self.drop_prot_attr:
            X = X.drop(self.prot_attr, axis=1)

        return self.model_.predict(X)

    def predict_proba(self, X):
        """Probability estimates.

        The returned estimates for all classes are ordered by the label of
        classes for classification.

        Args:
            X (pandas.DataFrame): Test samples.

        Returns:
            numpy.ndarray: Returns the probability of the sample for each
            class in the model, where classes are ordered as they are in
            ``self.classes_``.
        """
        if self.drop_prot_attr:
            X = X.drop(self.prot_attr, axis=1)
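
        # Probability estimates are only defined for classification
        # constraints; regression (BoundedGroupLoss) has no notion of
        # class probabilities.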
        if isinstance(self.model_.constraints, red.ClassificationMoment):
            return self.model_.predict_proba(X)
        raise NotImplementedError("Underlying model does not support "
                                  "predict_proba")