# erasmopurif's picture
# First commit
import numpy as np
from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone
from sklearn.utils.metaestimators import if_delegate_has_method
from sklearn.utils.validation import has_fit_parameter
from aif360.sklearn.utils import check_inputs, check_groups
class Reweighing(BaseEstimator):
    """Sample reweighing.

    Reweighing is a preprocessing technique that weights the examples in each
    (group, label) combination differently to ensure fairness before
    classification [#kamiran12]_.

    Note:
        This breaks the scikit-learn API by returning new sample weights from
        ``fit_transform()``. See :class:`ReweighingMeta` for a workaround.

    See also:
        :class:`ReweighingMeta`

    References:
        .. [#kamiran12] F. Kamiran and T. Calders, "Data Preprocessing
           Techniques for Classification without Discrimination," Knowledge
           and Information Systems, 2012.

    Attributes:
        prot_attr_ (str or list(str)): Protected attribute(s) used for
            reweighing, as resolved by ``check_groups``.
        groups_ (array, shape (n_groups,)): A list of group labels known to the
            transformer.
        classes_ (array, shape (n_classes,)): A list of class labels known to
            the transformer.
        reweigh_factors_ (array, shape (n_groups, n_labels)): Reweighing factors
            for each combination of group and class labels used to debias
            samples. Existing sample weights are multiplied by the corresponding
            factor for that sample's group and class. Combinations that do not
            occur in the training data are left as NaN.
    """

    def __init__(self, prot_attr=None):
        """
        Args:
            prot_attr (single label or list-like, optional): Protected
                attribute(s) to use in the reweighing process. If more than one
                attribute, all combinations of values (intersections) are
                considered. Default is ``None`` meaning all protected attributes
                from the dataset are used.
        """
        self.prot_attr = prot_attr

    def fit(self, X, y, sample_weight=None):
        """Only :meth:`fit_transform` is allowed for this algorithm.

        Delegates to :meth:`fit_transform` and discards the transformed
        weights; only the fitted attributes are kept.

        Args:
            X (pandas.DataFrame): Training samples.
            y (array-like): Training labels.
            sample_weight (array-like, optional): Sample weights.

        Returns:
            self
        """
        self.fit_transform(X, y, sample_weight=sample_weight)
        return self

    def fit_transform(self, X, y, sample_weight=None):
        """Compute the factors for reweighing the dataset and transform the
        sample weights.

        For every (group ``g``, class ``c``) combination present in the data,
        the factor is ``W(g) * W(c) / (W * W(g, c))`` where ``W(.)`` is the sum
        of the existing sample weights over the matching samples and ``W`` is
        the total weight.

        Args:
            X (pandas.DataFrame): Training samples.
            y (array-like): Training labels.
            sample_weight (array-like, optional): Sample weights.

        Returns:
            tuple:
                Samples and their weights.

                * **X** -- Samples as returned by input validation.
                * **sample_weight** -- Transformed sample weights.
        """
        X, y, sample_weight = check_inputs(X, y, sample_weight)

        # The reweighing factors are fractional, so the output buffer must be
        # floating-point even when the caller supplied integer weights;
        # otherwise the assignment below would silently truncate them.
        # NOTE(review): assumes check_inputs returns sample_weight as an array
        # exposing ``dtype`` -- confirm against aif360.sklearn.utils.
        out_dtype = np.result_type(sample_weight.dtype, np.float32)
        sample_weight_t = np.empty_like(sample_weight, dtype=out_dtype)

        groups, self.prot_attr_ = check_groups(X, self.prot_attr)
        # TODO: maintain categorical ordering
        self.groups_ = np.unique(groups)
        self.classes_ = np.unique(y)
        n_groups = len(self.groups_)
        n_classes = len(self.classes_)
        # NaN marks (group, class) combinations never seen during fitting.
        self.reweigh_factors_ = np.full((n_groups, n_classes), np.nan)

        def N_(mask):
            """Total existing weight of the samples selected by ``mask``."""
            return sample_weight[mask].sum()

        N = sample_weight.sum()
        for i, g in enumerate(self.groups_):
            # Group membership does not depend on the class, so compute the
            # mask and its weight once per group.
            in_group = groups == g
            N_g = N_(in_group)
            for j, c in enumerate(self.classes_):
                in_class = y == c
                g_and_c = in_group & in_class
                if np.any(g_and_c):
                    W_gc = N_g * N_(in_class) / (N * N_(g_and_c))
                    sample_weight_t[g_and_c] = W_gc * sample_weight[g_and_c]
                    self.reweigh_factors_[i, j] = W_gc
        return X, sample_weight_t
class ReweighingMeta(BaseEstimator, MetaEstimatorMixin):
    """A meta-estimator which wraps a given estimator with a reweighing
    preprocessing step.

    This is necessary for use in a Pipeline, etc.

    Attributes:
        estimator_ (sklearn.BaseEstimator): The fitted underlying estimator.
        reweigher_: The fitted underlying reweigher.
        classes_ (array, shape (n_classes,)): Class labels from `estimator_`.
    """
    # NOTE(review): ``if_delegate_has_method`` is imported at file level but no
    # decorator usage is visible here; presumably the delegating methods below
    # were decorated with it originally -- confirm before relying on
    # ``hasattr`` checks against this wrapper.

    def __init__(self, estimator, reweigher=None):
        """
        Args:
            estimator (sklearn.BaseEstimator): Estimator to be wrapped.
            reweigher (optional): Preprocessor which returns new sample weights
                from ``fit_transform()``. If ``None``, defaults to
                :class:`Reweighing`.
        """
        self.reweigher = reweigher
        self.estimator = estimator

    @property
    def _estimator_type(self):
        """Estimator type tag forwarded from the wrapped estimator."""
        return self.estimator._estimator_type

    @property
    def classes_(self):
        """Class labels from the base estimator."""
        return self.estimator_.classes_

    def fit(self, X, y, sample_weight=None):
        """Performs ``self.reweigher_.fit_transform(X, y, sample_weight)`` and
        then ``self.estimator_.fit(X, y, sample_weight)`` using the reweighed
        samples.

        Args:
            X (pandas.DataFrame): Training samples.
            y (array-like): Training labels.
            sample_weight (array-like, optional): Sample weights.

        Returns:
            self

        Raises:
            TypeError: If ``estimator`` does not accept ``sample_weight`` in
                its ``fit`` method.
        """
        if not has_fit_parameter(self.estimator, 'sample_weight'):
            raise TypeError("`estimator` (type: {}) does not have fit parameter"
                            " `sample_weight`.".format(type(self.estimator)))

        # Fit on fresh copies so the user-supplied objects stay unfitted.
        if self.reweigher is None:
            self.reweigher_ = Reweighing()
        else:
            self.reweigher_ = clone(self.reweigher)
        self.estimator_ = clone(self.estimator)

        X, sample_weight = self.reweigher_.fit_transform(
            X, y, sample_weight=sample_weight)
        self.estimator_.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        """Predict class labels for the given samples using ``self.estimator_``.

        Args:
            X (array-like): Test samples.

        Returns:
            array: Predicted class label per sample.
        """
        return self.estimator_.predict(X)

    def predict_proba(self, X):
        """Probability estimates from ``self.estimator_``.

        The returned estimates for all classes are ordered by the label of
        classes.

        Args:
            X (array-like): Test samples.

        Returns:
            array: Returns the probability of the sample for each class in the
            model, where classes are ordered as they are in ``self.classes_``.
        """
        return self.estimator_.predict_proba(X)

    def predict_log_proba(self, X):
        """Log of probability estimates from ``self.estimator_``.

        The returned estimates for all classes are ordered by the label of
        classes.

        Args:
            X (array-like): Test samples.

        Returns:
            array: Returns the log-probability of the sample for each class in
            the model, where classes are ordered as they are in
            ``self.classes_``.
        """
        return self.estimator_.predict_log_proba(X)

    def score(self, X, y, sample_weight=None):
        """Returns the output of the estimator's score function on the given
        test data and labels.

        Args:
            X (array-like): Test samples.
            y (array-like): True labels for X.
            sample_weight (array-like, optional): Sample weights.

        Returns:
            float: `self.estimator_.score(X, y, sample_weight)`
        """
        return self.estimator_.score(X, y, sample_weight=sample_weight)