""" The code for GridSearchReduction wraps the source class fairlearn.reductions.GridSearch available in the https://github.com/fairlearn/fairlearn library licensed under the MIT Licencse, Copyright Microsoft Corporation """ try: import fairlearn.reductions as red except ImportError as error: from logging import warning warning("{}: GridSearchReduction will be unavailable. To install, run:\n" "pip install 'aif360[Reductions]'".format(error)) from sklearn.base import BaseEstimator, ClassifierMixin, clone from sklearn.preprocessing import LabelEncoder class GridSearchReduction(BaseEstimator, ClassifierMixin): """Grid search reduction for fair classification or regression. Grid search is an in-processing technique that can be used for fair classification or fair regression. For classification it reduces fair classification to a sequence of cost-sensitive classification problems, returning the deterministic classifier with the lowest empirical error subject to fair classification constraints [#agarwal18]_ among the candidates searched. For regression it uses the same priniciple to return a deterministic regressor with the lowest empirical error subject to the constraint of bounded group loss [#agarwal19]_. References: .. [#agarwal18] `A. Agarwal, A. Beygelzimer, M. Dudik, J. Langford, and H. Wallach, "A Reductions Approach to Fair Classification," International Conference on Machine Learning, 2018. `_ .. [#agarwal19] `A. Agarwal, M. Dudik, and Z. Wu, "Fair Regression: Quantitative Definitions and Reduction-based Algorithms," International Conference on Machine Learning, 2019. `_ """ def __init__(self, prot_attr, estimator, constraints, constraint_weight=0.5, grid_size=10, grid_limit=2.0, grid=None, drop_prot_attr=True, loss="ZeroOne", min_val=None, max_val=None ): """ Args: prot_attr: String or array-like column indices or column names of protected attributes. estimator: An estimator implementing methods ``fit(X, y, sample_weight)`` and ``predict(X)``, where ``X`` is the matrix of features, ``y`` is the vector of labels, and ``sample_weight`` is a vector of weights; labels ``y`` and predictions returned by ``predict(X)`` are either 0 or 1 -- e.g. scikit-learn classifiers/regressors. constraints (str or fairlearn.reductions.Moment): If string, keyword denoting the :class:`fairlearn.reductions.Moment` object defining the disparity constraints -- e.g., "DemographicParity" or "EqualizedOdds". For a full list of possible options see `self.model.moments`. Otherwise, provide the desired :class:`~fairlearn.reductions.Moment` object defining the disparity constraints. constraint_weight: When the ``selection_rule`` is "tradeoff_optimization" (default, no other option currently) this float specifies the relative weight put on the constraint violation when selecting the best model. The weight placed on the error rate will be ``1-constraint_weight``. grid_size (int): The number of Lagrange multipliers to generate in the grid. grid_limit (float): The largest Lagrange multiplier to generate. The grid will contain values distributed between ``-grid_limit`` and ``grid_limit`` by default. grid (pandas.DataFrame): Instead of supplying a size and limit for the grid, users may specify the exact set of Lagrange multipliers they desire using this argument in a DataFrame. drop_prot_attr (bool): Flag indicating whether to drop protected attributes from training data. loss (str): String identifying loss function for constraints. Options include "ZeroOne", "Square", and "Absolute." 
            min_val: Loss function parameter for "Square" and "Absolute",
                typically the minimum of the range of y values.
            max_val: Loss function parameter for "Square" and "Absolute",
                typically the maximum of the range of y values.
        """
        self.prot_attr = prot_attr
        self.estimator = estimator
        self.constraints = constraints
        self.constraint_weight = constraint_weight
        self.grid_size = grid_size
        self.grid_limit = grid_limit
        self.grid = grid
        self.drop_prot_attr = drop_prot_attr
        self.loss = loss
        self.min_val = min_val
        self.max_val = max_val

    def fit(self, X, y):
        """Train a less biased classifier or regressor with the given
        training data.

        Args:
            X (pandas.DataFrame): Training samples.
            y (array-like): Training output.

        Returns:
            self
        """
        self.estimator_ = clone(self.estimator)

        moments = {
            "DemographicParity": red.DemographicParity,
            "EqualizedOdds": red.EqualizedOdds,
            "TruePositiveRateParity": red.TruePositiveRateParity,
            "FalsePositiveRateParity": red.FalsePositiveRateParity,
            "ErrorRateParity": red.ErrorRateParity,
            "BoundedGroupLoss": red.BoundedGroupLoss,
        }
        if isinstance(self.constraints, str):
            if self.constraints not in moments:
                raise ValueError(f"Constraint not recognized: "
                                 f"{self.constraints}")
            if self.constraints == "BoundedGroupLoss":
                losses = {
                    "ZeroOne": red.ZeroOneLoss,
                    "Square": red.SquareLoss,
                    "Absolute": red.AbsoluteLoss,
                }
                if self.loss not in losses:
                    raise ValueError(f"Loss not recognized: {self.loss}")
                if self.loss == "ZeroOne":
                    self.loss_ = losses[self.loss]()
                else:
                    # "Square" and "Absolute" losses require the range of y
                    self.loss_ = losses[self.loss](self.min_val, self.max_val)
                self.moment_ = moments[self.constraints](loss=self.loss_)
            else:
                self.moment_ = moments[self.constraints]()
        elif isinstance(self.constraints, red.Moment):
            self.moment_ = self.constraints
        else:
            raise ValueError("constraints must be a string or Moment object.")

        self.model_ = red.GridSearch(estimator=self.estimator_,
                                     constraints=self.moment_,
                                     constraint_weight=self.constraint_weight,
                                     grid_size=self.grid_size,
                                     grid_limit=self.grid_limit,
                                     grid=self.grid)

        A = X[self.prot_attr]

        if self.drop_prot_attr:
            X = X.drop(self.prot_attr, axis=1)

        # For classification moments, encode labels as integers and record
        # the original classes for predict_proba
        if isinstance(self.model_.constraints, red.ClassificationMoment):
            le = LabelEncoder()
            y = le.fit_transform(y)
            self.classes_ = le.classes_

        self.model_.fit(X, y, sensitive_features=A)

        return self

    def predict(self, X):
        """Predict output for the given samples.

        Args:
            X (pandas.DataFrame): Test samples.

        Returns:
            numpy.ndarray: Predicted output per sample.
        """
        if self.drop_prot_attr:
            X = X.drop(self.prot_attr, axis=1)

        return self.model_.predict(X)

    def predict_proba(self, X):
        """Probability estimates.

        The returned estimates for all classes are ordered by the label of
        classes for classification.

        Args:
            X (pandas.DataFrame): Test samples.

        Returns:
            numpy.ndarray: Returns the probability of the sample for each
            class in the model, where classes are ordered as they are in
            ``self.classes_``.
        """
        if self.drop_prot_attr:
            X = X.drop(self.prot_attr, axis=1)

        if isinstance(self.model_.constraints, red.ClassificationMoment):
            return self.model_.predict_proba(X)
        raise NotImplementedError("Underlying model does not support "
                                  "predict_proba")
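

# A minimal usage sketch (not part of the original module): it shows the two
# code paths above -- a string constraint for fair classification and
# "BoundedGroupLoss" for fair regression. The toy data, the "sex" column
# name, and the scikit-learn base estimators are illustrative assumptions
# only, not requirements of the class.
if __name__ == "__main__":
    import pandas as pd
    from sklearn.linear_model import LinearRegression, LogisticRegression

    # Toy data: two features plus a binary protected attribute "sex"
    X = pd.DataFrame({
        "feat1": [0.2, 0.4, 0.6, 0.8, 0.1, 0.9, 0.3, 0.7],
        "feat2": [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0],
        "sex":   [0, 0, 0, 0, 1, 1, 1, 1],
    })

    # Fair classification: an EqualizedOdds constraint selects, over the
    # grid of Lagrange multipliers, the classifier trading off error rate
    # against constraint violation per constraint_weight
    y_clf = pd.Series([0, 0, 1, 1, 0, 1, 0, 1])
    clf = GridSearchReduction(prot_attr="sex",
                              estimator=LogisticRegression(),
                              constraints="EqualizedOdds",
                              grid_size=20,
                              drop_prot_attr=True)
    clf.fit(X, y_clf)
    print(clf.predict(X))

    # Fair regression: "BoundedGroupLoss" with an "Absolute" loss, which
    # needs the output range via min_val/max_val ([0, 1] here is an
    # assumption about this toy target)
    y_reg = pd.Series([0.1, 0.2, 0.6, 0.8, 0.2, 0.9, 0.3, 0.7])
    reg = GridSearchReduction(prot_attr="sex",
                              estimator=LinearRegression(),
                              constraints="BoundedGroupLoss",
                              loss="Absolute",
                              min_val=0.0,
                              max_val=1.0)
    reg.fit(X, y_reg)
    print(reg.predict(X))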