""" Post-processing algorithms modify predictions to be more fair (predictions in, predictions out). """ from logging import warning import pandas as pd from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone from sklearn.model_selection import train_test_split from sklearn.utils.metaestimators import if_delegate_has_method from aif360.sklearn.postprocessing.calibrated_equalized_odds import CalibratedEqualizedOdds from aif360.sklearn.postprocessing.reject_option_classification import RejectOptionClassifier, RejectOptionClassifierCV class PostProcessingMeta(BaseEstimator, MetaEstimatorMixin): """A meta-estimator which wraps a given estimator with a post-processing step. The post-processor trains on a separate training set from the estimator to prevent leakage. Note: Because of the dataset splitting, if a Pipeline is necessary it should be used as the input to this meta-estimator not the other way around. Attributes: estimator_: Fitted estimator. postprocessor_: Fitted postprocessor. classes_ (array, shape (n_classes,)): Class labels from `estimator_`. """ def __init__(self, estimator, postprocessor, *, prefit=False, val_size=0.25, **options): """ Args: estimator (sklearn.BaseEstimator): Original estimator. postprocessor: Post-processing algorithm. prefit (bool): If ``True``, it is assumed that estimator has been fitted already and all data is used to train postprocessor. val_size (int or float): Size of validation set used to fit the postprocessor. The estimator fits on the remainder of the training set. See :func:`~sklearn.model_selection.train_test_split` for details. **options: Keyword options passed through to :func:`~sklearn.model_selection.train_test_split`. Note: 'train_size' and 'test_size' will be ignored in favor of 'val_size'. """ self.estimator = estimator self.postprocessor = postprocessor self.prefit = prefit self.val_size = val_size self.options = options @property def _estimator_type(self): return self.postprocessor._estimator_type @property def classes_(self): """Class labels from the base estimator.""" return self.estimator_.classes_ def fit(self, X, y, sample_weight=None, **fit_params): """Splits the training samples with :func:`~sklearn.model_selection.train_test_split` and uses the resultant 'train' portion to train the estimator. Then the estimator predicts on the 'test' portion of the split data and the post-processor is trained with those prediction-ground-truth target pairs. Args: X (array-like): Training samples. y (pandas.Series): Training labels. sample_weight (array-like, optional): Sample weights. **fit_params: Parameters passed to the post-processor ``fit()`` method. Note: these do not need to be prefixed with ``__`` notation. Returns: self """ self.postprocessor_ = clone(self.postprocessor) self.estimator_ = self.estimator if self.prefit else clone(self.estimator) try: use_proba = self.postprocessor._get_tags()['requires_proba'] except KeyError: raise TypeError("`postprocessor` (type: {}) does not have a " "'requires_proba' tag.".format(type(self.estimator))) if use_proba and not hasattr(self.estimator, 'predict_proba'): raise TypeError("`estimator` (type: {}) does not implement method " "`predict_proba()`.".format(type(self.estimator))) if self.prefit: if len(self.options): warning("Splitting options were passed but prefit is True so " "these are ignored.") y_score = (self.estimator_.predict_proba(X) if use_proba else self.estimator_.predict(X)) y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns') fit_params = fit_params.copy() fit_params.update(labels=self.estimator_.classes_) self.postprocessor_.fit(y_score, y, sample_weight=sample_weight, **fit_params) return self if 'train_size' in self.options or 'test_size' in self.options: warning("'train_size' and 'test_size' are ignored in favor of " "'val_size'") options_ = self.options.copy() options_['test_size'] = self.val_size if 'train_size' in options_: del options_['train_size'] if sample_weight is not None: X_est, X_post, y_est, y_post, sw_est, sw_post = train_test_split( X, y, sample_weight, **options_) self.estimator_.fit(X_est, y_est, sample_weight=sw_est) else: X_est, X_post, y_est, y_post = train_test_split(X, y, **options_) self.estimator_.fit(X_est, y_est) y_score = (self.estimator_.predict_proba(X_post) if use_proba else self.estimator_.predict(X_post)) y_score = pd.DataFrame(y_score, index=X_post.index).squeeze('columns') fit_params = fit_params.copy() fit_params.update(labels=self.estimator_.classes_) self.postprocessor_.fit(y_score, y_post, sample_weight=sw_post if sample_weight is not None else None, **fit_params) return self @if_delegate_has_method('postprocessor_') def predict(self, X): """Predict class labels for the given samples. First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if required) then returns the post-processed output from those predictions. Args: X (pandas.DataFrame): Test samples. Returns: numpy.ndarray: Predicted class label per sample. """ use_proba = self.postprocessor_._get_tags()['requires_proba'] y_score = (self.estimator_.predict_proba(X) if use_proba else self.estimator_.predict(X)) y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns') return self.postprocessor_.predict(y_score) @if_delegate_has_method('postprocessor_') def predict_proba(self, X): """Probability estimates. First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if required) then returns the post-processed output from those predictions. The returned estimates for all classes are ordered by the label of classes. Args: X (pandas.DataFrame): Test samples. Returns: numpy.ndarray: Returns the probability of the sample for each class in the model, where classes are ordered as they are in ``self.classes_``. """ use_proba = self.postprocessor_._get_tags()['requires_proba'] y_score = (self.estimator_.predict_proba(X) if use_proba else self.estimator_.predict(X)) y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns') return self.postprocessor_.predict_proba(y_score) @if_delegate_has_method('postprocessor_') def predict_log_proba(self, X): """Log of probability estimates. First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if required) then returns the post-processed output from those predictions. The returned estimates for all classes are ordered by the label of classes. Args: X (pandas.DataFrame): Test samples. Returns: array: Returns the log-probability of the sample for each class in the model, where classes are ordered as they are in ``self.classes_``. """ use_proba = self.postprocessor_._get_tags()['requires_proba'] y_score = (self.estimator_.predict_proba(X) if use_proba else self.estimator_.predict(X)) y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns') return self.postprocessor_.predict_log_proba(y_score) @if_delegate_has_method('postprocessor_') def score(self, X, y, sample_weight=None): """Returns the output of the post-processor's score function on the given test data and labels. First, runs ``self.estimator_.predict()`` (or ``predict_proba()`` if required) then gets the post-processed output from those predictions and scores it. Args: X (pandas.DataFrame): Test samples. y (array-like): True labels for X. sample_weight (array-like, optional): Sample weights. Returns: float: Score value. """ use_proba = self.postprocessor_._get_tags()['requires_proba'] y_score = (self.estimator_.predict_proba(X) if use_proba else self.estimator_.predict(X)) y_score = pd.DataFrame(y_score, index=X.index).squeeze('columns') if sample_weight is None: return self.postprocessor_.score(y_score, y) return self.postprocessor_.score(y_score, y, sample_weight=sample_weight) __all__ = [ 'CalibratedEqualizedOdds', 'PostProcessingMeta', 'RejectOptionClassifier', 'RejectOptionClassifierCV' ]