Spaces:
Runtime error
Runtime error
File size: 8,113 Bytes
d2a8669 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 |
"""This is the helper script for implementing metrics."""
import numpy as np
def compute_boolean_conditioning_vector(X, feature_names, condition=None):
    """Compute the boolean conditioning vector.

    Args:
        X (numpy.ndarray): Dataset features. Columns are selected with
            `X[:, index]`, so `X` must be 2-D with one column per entry of
            `feature_names`.
        feature_names (sequence): Names of the features, in column order. Any
            iterable of names is accepted (list, tuple, numpy array, ...).
        condition (list(dict)): Specifies the subset of instances we want to
            use. Format is a list of `dicts` where the keys are `feature_names`
            and the values are values in `X`. Elements in the list are clauses
            joined with OR operators while key-value pairs in each dict are
            joined with AND operators. See examples for more details. If `None`,
            the condition specifies the entire set of instances, `X`.

    Returns:
        numpy.ndarray(bool): Boolean conditioning vector. Shape is `[n]` where
        `n` is `X.shape[0]`. Values are `True` if the corresponding row
        satisfies the `condition` and `False` otherwise. An empty list selects
        no rows; an empty dict inside the list selects every row.

    Raises:
        ValueError: If a key used in `condition` is not in `feature_names`.

    Examples:
        >>> condition = [{'sex': 1, 'age': 1}, {'sex': 0}]

        This corresponds to `(sex == 1 AND age == 1) OR (sex == 0)`.
    """
    num_rows = X.shape[0]
    if condition is None:
        return np.ones(num_rows, dtype=bool)

    # Materialize once: numpy arrays and similar containers have no `.index`
    # method, and the conversion does not need to be repeated per lookup.
    names = list(feature_names)

    overall_cond = np.zeros(num_rows, dtype=bool)
    for group in condition:
        # Start from "all rows" and narrow down with every AND-ed pair.
        group_cond = np.ones(num_rows, dtype=bool)
        for name, val in group.items():
            try:
                index = names.index(name)
            except ValueError:
                raise ValueError(
                    "Feature {!r} used in `condition` is not in "
                    "`feature_names`.".format(name))
            group_cond = np.logical_and(group_cond, X[:, index] == val)
        # Clauses (dicts) are OR-ed together.
        overall_cond = np.logical_or(overall_cond, group_cond)
    return overall_cond
def compute_num_instances(X, w, feature_names, condition=None):
    """Compute the weighted number of instances, :math:`n`, optionally
    restricted to the rows selected by `condition`.

    Args:
        X (numpy.ndarray): Dataset features.
        w (numpy.ndarray): Instance weight vector.
        feature_names (list): Names of the features.
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        numpy.float64: Sum of the weights of the selected instances. This is
        a weighted count, so it is not necessarily an integer.
    """
    # Boolean mask of the rows that satisfy the condition (all rows if None).
    selected = compute_boolean_conditioning_vector(
        X, feature_names, condition)
    selected_weights = w[selected]
    return np.sum(selected_weights, dtype=np.float64)
def compute_num_pos_neg(X, y, w, feature_names, label, condition=None):
    """Compute the weighted number of positives, :math:`P`, or negatives,
    :math:`N`, optionally conditioned on protected attributes.

    Args:
        X (numpy.ndarray): Dataset features.
        y (numpy.ndarray): Label vector.
        w (numpy.ndarray): Instance weight vector.
        feature_names (list): Names of the features.
        label (float): Value of the label to count (favorable/positive or
            unfavorable/negative).
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        numpy.float64: Sum of the weights of the instances whose label equals
        `label` and which satisfy `condition`.
    """
    # Flatten so a column-shaped label array compares element-wise with the
    # 1-D condition mask instead of broadcasting.
    flat_labels = y.ravel()
    in_group = compute_boolean_conditioning_vector(
        X, feature_names, condition=condition)
    has_label = flat_labels == label
    counted = np.logical_and(has_label, in_group)
    return np.sum(w[counted], dtype=np.float64)
def compute_num_TF_PN(X, y_true, y_pred, w, feature_names, favorable_label,
                      unfavorable_label, condition=None):
    """Compute the weighted number of true/false positives/negatives,
    optionally conditioned on protected attributes.

    Args:
        X (numpy.ndarray): Dataset features.
        y_true (numpy.ndarray): True label vector.
        y_pred (numpy.ndarray): Predicted label vector.
        w (numpy.ndarray): Instance weight vector - the true and predicted
            datasets are supposed to have same instance level weights.
        feature_names (list): Names of the features.
        favorable_label (float): Value of favorable/positive label.
        unfavorable_label (float): Value of unfavorable/negative label.
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        dict: Keys `'TP'`, `'FP'`, `'TN'` and `'FN'`, each mapped to the sum
        (as `numpy.float64`) of the weights of the matching instances that
        satisfy `condition`.
    """
    in_group = compute_boolean_conditioning_vector(
        X, feature_names, condition=condition)

    # Flatten the label arrays to prevent broadcasting against the 1-D mask.
    actual = y_true.ravel()
    predicted = y_pred.ravel()

    actual_fav = actual == favorable_label
    actual_unfav = actual == unfavorable_label
    # The condition is folded into the prediction masks only; every cell
    # below combines one of them, so all four counts are conditioned.
    predicted_fav = np.logical_and(predicted == favorable_label, in_group)
    predicted_unfav = np.logical_and(predicted == unfavorable_label, in_group)

    def weighted_count(actual_mask, predicted_mask):
        # Total weight of the rows where both masks hold.
        both = np.logical_and(actual_mask, predicted_mask)
        return np.sum(w[both], dtype=np.float64)

    return {
        'TP': weighted_count(actual_fav, predicted_fav),
        'FP': weighted_count(actual_unfav, predicted_fav),
        'TN': weighted_count(actual_unfav, predicted_unfav),
        'FN': weighted_count(actual_fav, predicted_unfav),
    }
def compute_num_gen_TF_PN(X, y_true, y_score, w, feature_names, favorable_label,
                          unfavorable_label, condition=None):
    """Compute the number of generalized true/false positives/negatives,
    optionally conditioned on protected attributes. Generalized counts are
    based on scores rather than on hard predictions.

    Args:
        X (numpy.ndarray): Dataset features.
        y_true (numpy.ndarray): True label vector.
        y_score (numpy.ndarray): Predicted score vector. Values range from 0 to
            1. 0 implies prediction for unfavorable label and 1 implies
            prediction for favorable label.
        w (numpy.ndarray): Instance weight vector - the true and predicted
            datasets are supposed to have same instance level weights.
        feature_names (list): Names of the features.
        favorable_label (float): Value of favorable/positive label.
        unfavorable_label (float): Value of unfavorable/negative label.
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        dict: Keys `'GTP'`, `'GFP'`, `'GTN'` and `'GFN'`, each mapped to a
        `numpy.float64` sum of weighted scores over the matching instances
        that satisfy `condition`.
    """
    in_group = compute_boolean_conditioning_vector(
        X, feature_names, condition=condition)

    # Flatten everything to 1-D to prevent broadcasting.
    actual = y_true.ravel()
    scores = y_score.ravel()
    weights = w.ravel()

    actual_fav = np.logical_and(actual == favorable_label, in_group)
    actual_unfav = np.logical_and(actual == unfavorable_label, in_group)

    # Per-instance weighted score toward the favorable label and toward the
    # unfavorable label, respectively.
    toward_fav = weights * scores
    toward_unfav = weights * (1.0 - scores)

    return {
        'GTP': np.sum(toward_fav[actual_fav], dtype=np.float64),
        'GFP': np.sum(toward_fav[actual_unfav], dtype=np.float64),
        'GTN': np.sum(toward_unfav[actual_unfav], dtype=np.float64),
        'GFN': np.sum(toward_unfav[actual_fav], dtype=np.float64),
    }
def compute_distance(X_orig, X_distort, X_prot, feature_names, dist_fun,
                     condition=None):
    """Compute the distance element-wise for two sets of vectors.

    Args:
        X_orig (numpy.ndarray): Original features.
        X_distort (numpy.ndarray): Distorted features. Shape must match
            `X_orig`.
        X_prot (numpy.ndarray): Protected attributes (used to compute
            condition). Should be same for both original and distorted.
        feature_names (list): Names of the protected features.
        dist_fun (function): Function which returns the distance (float) between
            two 1-D arrays (e.g. :func:`scipy.spatial.distance.euclidean`).
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        (numpy.ndarray(numpy.float64), numpy.ndarray(bool)):

            * Element-wise distances (1-D), one entry per row selected by the
              condition, in row order.
            * Condition vector (1-D), one entry per row of `X_prot`.
    """
    cond_vec = compute_boolean_conditioning_vector(X_prot, feature_names,
                                                   condition=condition)

    # Boolean-mask indexing builds a new array every time it is evaluated, so
    # select the rows once here instead of once per loop iteration.
    orig_selected = X_orig[cond_vec]
    distort_selected = X_distort[cond_vec]

    num_instances = orig_selected.shape[0]
    distance = np.zeros(num_instances, dtype=np.float64)
    for i in range(num_instances):
        distance[i] = dist_fun(orig_selected[i], distort_selected[i])
    return distance, cond_vec
|