File size: 8,113 Bytes
d2a8669
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
"""This is the helper script for implementing metrics."""
import numpy as np


def compute_boolean_conditioning_vector(X, feature_names, condition=None):
    """Build a row mask selecting the instances that satisfy `condition`.

    Args:
        X (numpy.ndarray): Dataset features, indexed as `X[:, column]`.
        feature_names (list): Names of the features; the position of a name
            in this list is used as the column index into `X`.
        condition (list(dict)): Specifies the subset of instances to select.
            Each `dict` maps feature names to the value the corresponding
            column must equal. Key-value pairs inside one dict are combined
            with AND; the dicts in the list are combined with OR. If `None`,
            every row of `X` is selected.

    Returns:
        numpy.ndarray(bool): Mask of shape `[n]`, where `n` is `X.shape[0]`.
        An entry is `True` when the corresponding row satisfies `condition`
        and `False` otherwise.

    Raises:
        ValueError: If a key in `condition` is not present in `feature_names`
            (raised by `list.index`).

    Examples:
        >>> condition = [{'sex': 1, 'age': 1}, {'sex': 0}]

        This corresponds to `(sex == 1 AND age == 1) OR (sex == 0)`.
    """
    n_rows = X.shape[0]

    # No condition: every row qualifies.
    if condition is None:
        return np.ones(n_rows, dtype=bool)

    # Start from "nothing selected" and OR in each clause.
    selected = np.zeros(n_rows, dtype=bool)
    for clause in condition:
        # Start from "everything matches" and AND in each requirement.
        clause_mask = np.ones(n_rows, dtype=bool)
        for feature, target in clause.items():
            column = feature_names.index(feature)
            clause_mask = np.logical_and(clause_mask, X[:, column] == target)
        selected = np.logical_or(selected, clause_mask)

    return selected

def compute_num_instances(X, w, feature_names, condition=None):
    """Compute the (weighted) number of instances, :math:`n`, optionally
    conditioned on the protected attribute(s).

    Args:
        X (numpy.ndarray): Dataset features.
        w (numpy.ndarray): Instance weight vector.
        feature_names (list): Names of the features.
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        numpy.float64: Sum of the weights of the instances that satisfy
        `condition` (all instances when `condition` is `None`).
    """
    # Mask of the rows selected by the condition (all rows if it is None).
    selected = compute_boolean_conditioning_vector(X, feature_names, condition)

    selected_weights = w[selected]
    return np.sum(selected_weights, dtype=np.float64)

def compute_num_pos_neg(X, y, w, feature_names, label, condition=None):
    """Compute the (weighted) number of positives, :math:`P`, or negatives,
    :math:`N`, optionally conditioned on protected attributes.

    Args:
        X (numpy.ndarray): Dataset features.
        y (numpy.ndarray): Label vector.
        w (numpy.ndarray): Instance weight vector.
        feature_names (list): Names of the features.
        label (float): Value of the label to count (favorable/positive or
            unfavorable/negative).
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        numpy.float64: Sum of the weights of the instances whose label equals
        `label` and that satisfy `condition`.
    """
    # Flatten the labels so the comparison lines up with the 1-D row mask.
    flat_labels = y.ravel()
    selected = compute_boolean_conditioning_vector(X, feature_names,
                                                   condition=condition)

    has_label = flat_labels == label
    counted = np.logical_and(has_label, selected)
    return np.sum(w[counted], dtype=np.float64)

def compute_num_TF_PN(X, y_true, y_pred, w, feature_names, favorable_label,
                      unfavorable_label, condition=None):
    """Compute the (weighted) number of true/false positives/negatives,
    optionally conditioned on protected attributes.

    Args:
        X (numpy.ndarray): Dataset features.
        y_true (numpy.ndarray): True label vector.
        y_pred (numpy.ndarray): Predicted label vector.
        w (numpy.ndarray): Instance weight vector - the true and predicted
            datasets are supposed to have same instance level weights.
        feature_names (list): Names of the features.
        favorable_label (float): Value of favorable/positive label.
        unfavorable_label (float): Value of unfavorable/negative label.
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        dict: Weight sums (numpy.float64) under the keys `'TP'`, `'FP'`,
        `'TN'` and `'FN'`, restricted to the instances that satisfy
        `condition`.
    """
    selected = compute_boolean_conditioning_vector(X, feature_names,
                                                   condition=condition)

    # Flatten the label vectors to prevent broadcasts in the comparisons.
    truth = y_true.ravel()
    predicted = y_pred.ravel()

    truly_pos = truth == favorable_label
    truly_neg = truth == unfavorable_label
    # The condition is folded into the prediction masks only; since every
    # count ANDs a truth mask with a prediction mask, it applies to all four.
    predicted_pos = np.logical_and(predicted == favorable_label, selected)
    predicted_neg = np.logical_and(predicted == unfavorable_label, selected)

    def weight_total(truth_mask, prediction_mask):
        # Sum of the weights of the rows present in both masks.
        both = np.logical_and(truth_mask, prediction_mask)
        return np.sum(w[both], dtype=np.float64)

    return {
        'TP': weight_total(truly_pos, predicted_pos),
        'FP': weight_total(truly_neg, predicted_pos),
        'TN': weight_total(truly_neg, predicted_neg),
        'FN': weight_total(truly_pos, predicted_neg),
    }

def compute_num_gen_TF_PN(X, y_true, y_score, w, feature_names, favorable_label,
                    unfavorable_label, condition=None):
    """Compute the number of generalized true/false positives/negatives,
    optionally conditioned on protected attributes. Generalized counts are
    based on scores and not on the hard predictions.

    Args:
        X (numpy.ndarray): Dataset features.
        y_true (numpy.ndarray): True label vector.
        y_score (numpy.ndarray): Predicted score vector. Values range from 0 to
            1. 0 implies prediction for unfavorable label and 1 implies
            prediction for favorable label.
        w (numpy.ndarray): Instance weight vector - the true and predicted
            datasets are supposed to have same instance level weights.
        feature_names (list): Names of the features.
        favorable_label (float): Value of favorable/positive label.
        unfavorable_label (float): Value of unfavorable/negative label.
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        dict: Weighted score sums (numpy.float64) under the keys `'GTP'`,
        `'GFP'`, `'GTN'` and `'GFN'`, restricted to the instances that satisfy
        `condition`.
    """
    selected = compute_boolean_conditioning_vector(X, feature_names,
                                                   condition=condition)

    # Flatten everything to 1-D to prevent broadcasts in the products below.
    truth = y_true.ravel()
    scores = y_score.ravel()
    weights = w.ravel()

    truly_pos = np.logical_and(truth == favorable_label, selected)
    truly_neg = np.logical_and(truth == unfavorable_label, selected)

    # Per-instance weighted score toward the favorable label, and its
    # complement toward the unfavorable label.
    favorable_mass = weights * scores
    unfavorable_mass = weights * (1.0 - scores)

    return {
        'GTP': np.sum(favorable_mass[truly_pos], dtype=np.float64),
        'GFP': np.sum(favorable_mass[truly_neg], dtype=np.float64),
        'GTN': np.sum(unfavorable_mass[truly_neg], dtype=np.float64),
        'GFN': np.sum(unfavorable_mass[truly_pos], dtype=np.float64),
    }

def compute_distance(X_orig, X_distort, X_prot, feature_names, dist_fun,
                     condition=None):
    """Compute the distance element-wise for two sets of vectors.

    Args:
        X_orig (numpy.ndarray): Original features.
        X_distort (numpy.ndarray): Distorted features. Shape must match
            `X_orig`.
        X_prot (numpy.ndarray): Protected attributes (used to compute
            condition). Should be same for both original and distorted.
        feature_names (list): Names of the protected features.
        dist_fun (function): Function which returns the distance (float) between
            two 1-D arrays (e.g. :func:`scipy.spatial.distance.euclidean`).
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        (numpy.ndarray(numpy.float64), numpy.ndarray(bool)):

            * Element-wise distances (1-D), one entry per row selected by
              `condition`, in row order.
            * Condition vector (1-D) over all rows of `X_prot`.
    """
    cond_vec = compute_boolean_conditioning_vector(X_prot, feature_names,
        condition=condition)

    # Apply the boolean mask once. Boolean indexing copies the selected rows,
    # so masking inside the loop would re-copy both arrays on every iteration.
    orig_selected = X_orig[cond_vec]
    distort_selected = X_distort[cond_vec]

    num_instances = orig_selected.shape[0]
    distance = np.zeros(num_instances, dtype=np.float64)
    for i in range(num_instances):
        distance[i] = dist_fun(orig_selected[i], distort_selected[i])

    return distance, cond_vec