File size: 4,623 Bytes
d2a8669
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os.path as osp

import pandas as pd
import numpy as np
try:
    from rpy2 import robjects
    from rpy2.robjects.vectors import StrVector
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.conversion import localconverter
except ImportError as error:
    from logging import warning
    warning("{}: FairAdapt will be unavailable. To install, run:\n"
            "pip install 'aif360[FairAdapt]'".format(error))
from sklearn.base import BaseEstimator

from aif360.sklearn.utils import check_inputs, check_groups


class FairAdapt(BaseEstimator):
    """Fair Data Adaptation.

    Fairadapt is a pre-processing technique that can be used for both fair
    classification and fair regression [#plecko20]_. The method is a causal
    inference approach to bias removal and it relies on the causal graph for
    the dataset. The original implementation is in R [#plecko21]_.

    This class is a thin Python wrapper: data is converted to R objects with
    rpy2, the adaptation itself is run by the ``wrapper`` function defined in
    the ``fairadapt.R`` script located next to this module, and the results are
    converted back to pandas objects.

    References:
        .. [#plecko20] `D. Plečko and N. Meinshausen,
           "Fair Data Adaptation with Quantile Preservation,"
           Journal of Machine Learning Research, 2020.
           <https://www.jmlr.org/papers/volume21/19-966/19-966.pdf>`_
        .. [#plecko21] `D. Plečko and N. Bennett and N. Meinshausen,
           "FairAdapt: Causal Reasoning for Fair Data Pre-processing,"
           arXiv, 2021. <https://arxiv.org/abs/2110.10200>`_

    Attributes:
        prot_attr_ (str or list(str)): Protected attribute(s) used for fair data
            adaptation. Set by :meth:`fit_transform`.
        groups_ (array, shape (n_groups,)): A list of group labels known to the
            transformer. Set by :meth:`fit_transform`.
    """

    def __init__(self, prot_attr, adj_mat):
        """
        Note:
            Constructing an instance checks whether the R packages ``ranger``
            and ``fairadapt`` are installed and installs any missing ones from
            CRAN.

        Args:
            prot_attr (single label): Name of the protected attribute. Must be
                binary.
            adj_mat (array-like): A 2-dimensional array representing the
                adjacency matrix of the causal diagram of the data generating
                process. Row/column order must match `X_train`.

        Raises:
            ImportError: If rpy2 could not be imported when this module was
                loaded.
        """
        self.prot_attr = prot_attr
        self.adj_mat = adj_mat

        # The module-level rpy2 import is allowed to fail with only a warning,
        # in which case `robjects` is never bound. Surface that here as an
        # actionable ImportError instead of an opaque NameError.
        try:
            r_packages = robjects.packages
        except NameError as error:
            raise ImportError(
                "FairAdapt requires rpy2, which could not be imported. "
                "To install, run:\npip install 'aif360[FairAdapt]'"
            ) from error

        # R packages needed to run FairAdapt; install only the missing ones.
        missing = [pkg for pkg in ('ranger', 'fairadapt')
                   if not r_packages.isinstalled(pkg)]
        if missing:
            utils = r_packages.importr('utils')
            utils.chooseCRANmirror(ind=1)
            utils.install_packages(StrVector(missing))

    def fit_transform(self, X_train, y_train, X_test):
        """Remove bias from the given dataset by fair adaptation.

        Besides returning the adapted data, this sets the ``prot_attr_`` and
        ``groups_`` attributes.

        Args:
            X_train (pandas.DataFrame): Training data frame (including the
                protected attribute).
            y_train (pandas.Series): Training labels. Its ``name`` is passed to
                R as the outcome column name and is used to split the labels
                back out of the adapted training data.
            X_test (pandas.DataFrame): Test data frame (including the protected
                attribute).

        Returns:
            tuple:
                Transformed inputs.

                * **X_fair_train** (pandas.DataFrame) -- Transformed training
                  data.
                * **y_fair_train** (pandas.Series) -- Transformed training
                  labels.
                * **X_fair_test** (pandas.DataFrame) -- Transformed test data.

        """
        # The R wrapper takes features and outcome in a single training frame.
        df_train = pd.concat([X_train, y_train], axis=1)
        groups, self.prot_attr_ = check_groups(
            X_train, self.prot_attr, ensure_binary=True)
        self.groups_ = np.unique(groups)

        # Load the R-side entry point from the script shipped with this module.
        wrapper = osp.join(osp.dirname(osp.abspath(__file__)), 'fairadapt.R')
        robjects.r.source(wrapper)
        FairAdapt_R = robjects.r['wrapper']

        # Convert the pandas inputs to R objects with a local converter.
        with localconverter(robjects.default_converter + pandas2ri.converter):
            train_data = robjects.conversion.py2rpy(df_train)
            test_data = robjects.conversion.py2rpy(X_test)
            adj_mat = robjects.conversion.py2rpy(self.adj_mat)

        # run FairAdapt in R
        res = FairAdapt_R(
            train_data=train_data,
            test_data=test_data,
            adj_mat=adj_mat,
            prot_attr=self.prot_attr_,
            outcome=y_train.name
        )

        # Convert the adapted R data frames back to pandas.
        with localconverter(robjects.default_converter + pandas2ri.converter):
            X_fair_train = robjects.conversion.rpy2py(res.rx2('train'))
            X_fair_test = robjects.conversion.rpy2py(res.rx2('test'))

        # Relabel the adapted training frame with the outcome name first,
        # followed by the original feature names, then split the outcome off.
        X_fair_train.columns = [y_train.name] + X_train.columns.tolist()
        y_fair_train = X_fair_train.pop(y_train.name)
        X_fair_test.columns = X_test.columns

        return X_fair_train, y_fair_train, X_fair_test