Spaces:
Runtime error
Runtime error
"""This is the helper script for implementing metrics.""" | |
import numpy as np | |
def compute_boolean_conditioning_vector(X, feature_names, condition=None):
    """Build a boolean row mask marking the rows of `X` that meet `condition`.

    Args:
        X (numpy.ndarray): Dataset features, indexed as `X[:, column]`.
        feature_names (list): Names of the features; the position of a name
            in this list is used as the column index into `X`.
        condition (list(dict)): Subset of instances to select. Each dict is
            one clause mapping a feature name to the value that column must
            equal; all pairs inside a dict are combined with AND, and the
            clauses in the list are combined with OR. If `None`, every row
            is selected.

    Returns:
        numpy.ndarray(bool): Mask of shape `[n]` with `n = X.shape[0]`.
        An entry is `True` when the matching row satisfies `condition` and
        `False` otherwise.

    Raises:
        ValueError: Raised by `feature_names.index` when a clause names a
            feature that is not present (assuming `feature_names` is a list).

    Examples:
        >>> condition = [{'sex': 1, 'age': 1}, {'sex': 0}]

        This corresponds to `(sex == 1 AND age == 1) OR (sex == 0)`.
    """
    n_rows = X.shape[0]

    # No condition at all means "use every instance".
    if condition is None:
        return np.ones(n_rows, dtype=bool)

    # Start from "no row selected" and OR each clause in.
    selected = np.zeros(n_rows, dtype=bool)
    for clause in condition:
        # Start from "every row matches" and AND each requirement in, so an
        # empty clause selects all rows.
        clause_mask = np.ones(n_rows, dtype=bool)
        for feature, value in clause.items():
            column = feature_names.index(feature)
            clause_mask = np.logical_and(clause_mask, X[:, column] == value)
        selected = np.logical_or(selected, clause_mask)
    return selected
def compute_num_instances(X, w, feature_names, condition=None):
    """Compute the (weighted) number of instances, :math:`n`, optionally
    restricted to the rows selected by `condition`.

    Args:
        X (numpy.ndarray): Dataset features.
        w (numpy.ndarray): Instance weight vector.
        feature_names (list): Names of the features.
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        numpy.float64: Sum of the weights of the selected instances. This is
        a float accumulation of `w`, so it equals a plain count only when
        all weights are 1.
    """
    # Rows that take part in the count (all rows when `condition` is None).
    selected = compute_boolean_conditioning_vector(X, feature_names, condition)
    selected_weights = w[selected]
    return np.sum(selected_weights, dtype=np.float64)
def compute_num_pos_neg(X, y, w, feature_names, label, condition=None):
    """Compute the (weighted) number of positives, :math:`P`, or negatives,
    :math:`N`, optionally conditioned on protected attributes.

    Args:
        X (numpy.ndarray): Dataset features.
        y (numpy.ndarray): Label vector.
        w (numpy.ndarray): Instance weight vector.
        feature_names (list): Names of the features.
        label (float): Value of the label to count (favorable/positive or
            unfavorable/negative).
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        numpy.float64: Sum of the weights of the instances whose label equals
        `label` and that satisfy `condition`.
    """
    # Flatten the labels so the comparison below lines up element-wise with
    # the 1-D condition mask.
    labels = y.ravel()
    in_group = compute_boolean_conditioning_vector(X, feature_names,
                                                   condition=condition)
    has_label = labels == label
    counted = np.logical_and(has_label, in_group)
    return np.sum(w[counted], dtype=np.float64)
def compute_num_TF_PN(X, y_true, y_pred, w, feature_names, favorable_label,
                      unfavorable_label, condition=None):
    """Compute the (weighted) numbers of true/false positives/negatives,
    optionally conditioned on protected attributes.

    Args:
        X (numpy.ndarray): Dataset features.
        y_true (numpy.ndarray): True label vector.
        y_pred (numpy.ndarray): Predicted label vector.
        w (numpy.ndarray): Instance weight vector - the true and predicted
            datasets are supposed to have same instance level weights.
        feature_names (list): Names of the features.
        favorable_label (float): Value of favorable/positive label.
        unfavorable_label (float): Value of unfavorable/negative label.
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        dict: Keys `'TP'`, `'FP'`, `'TN'` and `'FN'`, each mapped to the
        `numpy.float64` sum of the weights of the instances in that cell of
        the confusion matrix (restricted to rows satisfying `condition`).
    """
    # Rows that take part in the counts (all rows when `condition` is None).
    in_group = compute_boolean_conditioning_vector(X, feature_names,
                                                   condition=condition)

    # Flatten the label vectors to prevent broadcasts against the 1-D mask.
    actual = y_true.ravel()
    predicted = y_pred.ravel()

    actual_pos = actual == favorable_label
    actual_neg = actual == unfavorable_label
    # The condition is folded into the prediction masks only; every cell
    # below ANDs one actual mask with one prediction mask, so it still
    # applies to all four counts.
    predicted_pos = np.logical_and(predicted == favorable_label, in_group)
    predicted_neg = np.logical_and(predicted == unfavorable_label, in_group)

    def _cell_weight(actual_mask, predicted_mask):
        # Total weight of the rows lying in both masks.
        cell = np.logical_and(actual_mask, predicted_mask)
        return np.sum(w[cell], dtype=np.float64)

    return {
        'TP': _cell_weight(actual_pos, predicted_pos),
        'FP': _cell_weight(actual_neg, predicted_pos),
        'TN': _cell_weight(actual_neg, predicted_neg),
        'FN': _cell_weight(actual_pos, predicted_neg),
    }
def compute_num_gen_TF_PN(X, y_true, y_score, w, feature_names, favorable_label,
                          unfavorable_label, condition=None):
    """Compute the generalized true/false positives/negatives, optionally
    conditioned on protected attributes. Generalized counts are based on
    scores and not on the hard predictions.

    Args:
        X (numpy.ndarray): Dataset features.
        y_true (numpy.ndarray): True label vector.
        y_score (numpy.ndarray): Predicted score vector. Values range from 0
            to 1. 0 implies prediction for unfavorable label and 1 implies
            prediction for favorable label.
        w (numpy.ndarray): Instance weight vector - the true and predicted
            datasets are supposed to have same instance level weights.
        feature_names (list): Names of the features.
        favorable_label (float): Value of favorable/positive label.
        unfavorable_label (float): Value of unfavorable/negative label.
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        dict: Keys `'GTP'`, `'GFP'`, `'GTN'` and `'GFN'`, each mapped to a
        `numpy.float64`. `GTP`/`GFP` sum `w * y_score` over the truly
        favorable/unfavorable rows; `GFN`/`GTN` sum `w * (1 - y_score)` over
        the truly favorable/unfavorable rows. Only rows satisfying
        `condition` contribute.
    """
    # Rows that take part in the counts (all rows when `condition` is None).
    in_group = compute_boolean_conditioning_vector(X, feature_names,
                                                   condition=condition)

    # Flatten everything to 1-D to prevent broadcasts in the element-wise
    # products and comparisons below.
    actual = y_true.ravel()
    scores = y_score.ravel()
    weights = w.ravel()

    favorable_rows = np.logical_and(actual == favorable_label, in_group)
    unfavorable_rows = np.logical_and(actual == unfavorable_label, in_group)

    # Per-instance weight assigned to the favorable / unfavorable prediction.
    toward_favorable = weights * scores
    toward_unfavorable = weights * (1.0 - scores)

    return {
        'GTP': np.sum(toward_favorable[favorable_rows], dtype=np.float64),
        'GFP': np.sum(toward_favorable[unfavorable_rows], dtype=np.float64),
        'GTN': np.sum(toward_unfavorable[unfavorable_rows], dtype=np.float64),
        'GFN': np.sum(toward_unfavorable[favorable_rows], dtype=np.float64),
    }
def compute_distance(X_orig, X_distort, X_prot, feature_names, dist_fun,
                     condition=None):
    """Compute the distance element-wise for two sets of vectors.

    Args:
        X_orig (numpy.ndarray): Original features.
        X_distort (numpy.ndarray): Distorted features. Shape must match
            `X_orig`.
        X_prot (numpy.ndarray): Protected attributes (used to compute
            condition). Should be same for both original and distorted.
        feature_names (list): Names of the protected features.
        dist_fun (function): Function which returns the distance (float)
            between two 1-D arrays (e.g.
            :func:`scipy.spatial.distance.euclidean`).
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        (numpy.ndarray(numpy.float64), numpy.ndarray(bool)):

            * Element-wise distances (1-D), one entry per row selected by
              the condition, in row order.
            * Condition vector (1-D), one entry per row of `X_prot`.
    """
    cond_vec = compute_boolean_conditioning_vector(X_prot, feature_names,
                                                   condition=condition)

    # Apply the mask once, outside the loop. Boolean-mask indexing builds a
    # new array on every use, so re-evaluating `X[cond_vec]` per iteration
    # would make the loop quadratic in the number of rows.
    orig_selected = X_orig[cond_vec]
    distort_selected = X_distort[cond_vec]

    num_instances = orig_selected.shape[0]
    distance = np.zeros(num_instances, dtype=np.float64)
    for i in range(num_instances):
        distance[i] = dist_fun(orig_selected[i], distort_selected[i])
    return distance, cond_vec