"""Sample reweighing preprocessor and a meta-estimator that applies it."""
import numpy as np
from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.utils.validation import has_fit_parameter

from aif360.sklearn.utils import check_inputs, check_groups


class Reweighing(BaseEstimator):
    """Sample reweighing.

    Reweighing is a preprocessing technique that weights the examples in each
    (group, label) combination differently to ensure fairness before
    classification [#kamiran12]_.

    Note:
        This breaks the scikit-learn API by returning new sample weights from
        ``fit_transform()``. See :class:`ReweighingMeta` for a workaround.

    See also:
        :class:`ReweighingMeta`

    References:
        .. [#kamiran12] F. Kamiran and T. Calders, "Data Preprocessing
           Techniques for Classification without Discrimination," Knowledge
           and Information Systems, 2012.

    Attributes:
        prot_attr_ (str or list(str)): Protected attribute(s) used for
            reweighing.
        groups_ (array, shape (n_groups,)): A list of group labels known to the
            transformer.
        classes_ (array, shape (n_classes,)): A list of class labels known to
            the transformer.
        reweigh_factors_ (array, shape (n_groups, n_classes)): Reweighing
            factors for each combination of group and class labels used to
            debias samples. Existing sample weights are multiplied by the
            corresponding factor for that sample's group and class. Entries
            for (group, class) combinations with no samples are left as NaN.
    """

    def __init__(self, prot_attr=None):
        """
        Args:
            prot_attr (single label or list-like, optional): Protected
                attribute(s) to use in the reweighing process. If more than
                one attribute, all combinations of values (intersections) are
                considered. Default is ``None`` meaning all protected
                attributes from the dataset are used.
        """
        self.prot_attr = prot_attr

    def fit(self, X, y, sample_weight=None):
        """Fit the reweighing factors by delegating to :meth:`fit_transform`.

        The transformed weights computed by :meth:`fit_transform` are
        discarded here; call :meth:`fit_transform` directly to obtain them.

        Args:
            X (pandas.DataFrame): Training samples.
            y (array-like): Training labels.
            sample_weight (array-like, optional): Sample weights.

        Returns:
            self
        """
        self.fit_transform(X, y, sample_weight=sample_weight)
        return self

    def fit_transform(self, X, y, sample_weight=None):
        """Compute the factors for reweighing the dataset and transform the
        sample weights.

        For every (group, class) combination present in the data, the factor
        is ``N_(group) * N_(class) / (N * N_(group and class))`` where ``N_``
        is the sum of the incoming sample weights over the selected samples
        and ``N`` is the sum over all samples.

        Args:
            X (pandas.DataFrame): Training samples.
            y (array-like): Training labels.
            sample_weight (array-like, optional): Sample weights.

        Returns:
            tuple:
                Samples and their weights.

                * **X** -- Unchanged samples.
                * **sample_weight** -- Transformed sample weights. Boolean or
                  integer input weights yield floating-point output so the
                  fractional factors are not truncated.
        """
        X, y, sample_weight = check_inputs(X, y, sample_weight)

        # The reweighing factors are fractional. Allocating the output with
        # the caller's bool/integer dtype would silently truncate the new
        # weights on assignment, so promote those dtypes to float64. Other
        # dtypes (e.g. float32/float64) are kept unchanged.
        out_dtype = np.asarray(sample_weight).dtype
        if out_dtype.kind in 'biu':
            out_dtype = np.float64
        sample_weight_t = np.empty_like(sample_weight, dtype=out_dtype)

        groups, self.prot_attr_ = check_groups(X, self.prot_attr)
        # TODO: maintain categorical ordering
        self.groups_ = np.unique(groups)
        self.classes_ = np.unique(y)
        n_groups = len(self.groups_)
        n_classes = len(self.classes_)
        # NaN marks (group, class) combinations that never occur in the data.
        self.reweigh_factors_ = np.full((n_groups, n_classes), np.nan)

        def N_(i):
            """Total incoming weight of the samples selected by mask ``i``."""
            return sample_weight[i].sum()

        N = sample_weight.sum()
        for i, g in enumerate(self.groups_):
            # The group mask and its total do not depend on the class, so
            # compute them once per group rather than once per pair.
            in_group = groups == g
            N_g = N_(in_group)
            for j, c in enumerate(self.classes_):
                in_class = y == c
                g_and_c = in_group & in_class
                if np.any(g_and_c):
                    W_gc = N_g * N_(in_class) / (N * N_(g_and_c))
                    sample_weight_t[g_and_c] = W_gc * sample_weight[g_and_c]
                    self.reweigh_factors_[i, j] = W_gc
        return X, sample_weight_t


class ReweighingMeta(BaseEstimator, MetaEstimatorMixin):
    """A meta-estimator which wraps a given estimator with a reweighing
    preprocessing step.

    This is necessary for use in a Pipeline, etc.

    Attributes:
        estimator_ (sklearn.BaseEstimator): The fitted underlying estimator.
        reweigher_: The fitted underlying reweigher.
        classes_ (array, shape (n_classes,)): Class labels from `estimator_`.
    """

    def __init__(self, estimator, reweigher=None):
        """
        Args:
            estimator (sklearn.BaseEstimator): Estimator to be wrapped.
            reweigher (optional): Preprocessor which returns the samples and
                new sample weights from ``fit_transform()``. If ``None``,
                defaults to :class:`~aif360.sklearn.preprocessing.Reweighing`.
        """
        self.reweigher = reweigher
        self.estimator = estimator

    @property
    def _estimator_type(self):
        # Report the wrapped (unfitted) estimator's type as this object's own.
        return self.estimator._estimator_type

    @property
    def classes_(self):
        """Class labels from the base estimator."""
        return self.estimator_.classes_

    def fit(self, X, y, sample_weight=None):
        """Performs ``self.reweigher_.fit_transform(X, y, sample_weight)`` and
        then ``self.estimator_.fit(X, y, sample_weight)`` using the reweighed
        samples.

        Args:
            X (pandas.DataFrame): Training samples.
            y (array-like): Training labels.
            sample_weight (array-like, optional): Sample weights.

        Returns:
            self

        Raises:
            TypeError: If ``estimator`` does not accept a ``sample_weight``
                fit parameter.
        """
        if not has_fit_parameter(self.estimator, 'sample_weight'):
            raise TypeError("`estimator` (type: {}) does not have fit parameter"
                            " `sample_weight`.".format(type(self.estimator)))

        # Work on clones so the objects passed to the constructor stay
        # unfitted.
        if self.reweigher is None:
            self.reweigher_ = Reweighing()
        else:
            self.reweigher_ = clone(self.reweigher)
        self.estimator_ = clone(self.estimator)

        X, sample_weight = self.reweigher_.fit_transform(
            X, y, sample_weight=sample_weight)
        self.estimator_.fit(X, y, sample_weight=sample_weight)
        return self

    @if_delegate_has_method('estimator_')
    def predict(self, X):
        """Predict class labels for the given samples using
        ``self.estimator_``.

        Args:
            X (array-like): Test samples.

        Returns:
            array: Predicted class label per sample.
        """
        return self.estimator_.predict(X)

    @if_delegate_has_method('estimator_')
    def predict_proba(self, X):
        """Probability estimates from ``self.estimator_``.

        The returned estimates for all classes are ordered by the label of
        classes.

        Args:
            X (array-like): Test samples.

        Returns:
            array: Returns the probability of the sample for each class in the
            model, where classes are ordered as they are in
            ``self.classes_``.
        """
        return self.estimator_.predict_proba(X)

    @if_delegate_has_method('estimator_')
    def predict_log_proba(self, X):
        """Log of probability estimates from ``self.estimator_``.

        The returned estimates for all classes are ordered by the label of
        classes.

        Args:
            X (array-like): Test samples.

        Returns:
            array: Returns the log-probability of the sample for each class in
            the model, where classes are ordered as they are in
            ``self.classes_``.
        """
        return self.estimator_.predict_log_proba(X)

    @if_delegate_has_method('estimator_')
    def score(self, X, y, sample_weight=None):
        """Returns the output of the estimator's score function on the given
        test data and labels.

        Args:
            X (array-like): Test samples.
            y (array-like): True labels for X.
            sample_weight (array-like, optional): Sample weights.

        Returns:
            float: `self.estimator_.score(X, y, sample_weight)`
        """
        return self.estimator_.score(X, y, sample_weight=sample_weight)