"""This is the helper script for implementing metrics."""
import numpy as np


def compute_boolean_conditioning_vector(X, feature_names, condition=None):
    """Compute the boolean conditioning vector.

    Args:
        X (numpy.ndarray): Dataset features.
        feature_names (list): Names of the features. The position of a name
            in this list is used as the column index into `X`.
        condition (list(dict)): Specifies the subset of instances we want to
            use. Format is a list of `dicts` where the keys are
            `feature_names` and the values are values in `X`. Elements in the
            list are clauses joined with OR operators while key-value pairs
            in each dict are joined with AND operators. See examples for more
            details. If `None`, the condition specifies the entire set of
            instances, `X`.

    Returns:
        numpy.ndarray(bool): Boolean conditioning vector. Shape is `[n]` where
        `n` is `X.shape[0]`. Values are `True` if the corresponding row
        satisfies the `condition` and `False` otherwise.

    Raises:
        ValueError: If a key in `condition` is not in `feature_names`
            (raised by `list.index`).

    Examples:
        >>> condition = [{'sex': 1, 'age': 1}, {'sex': 0}]

        This corresponds to `(sex == 1 AND age == 1) OR (sex == 0)`.
    """
    if condition is None:
        return np.ones(X.shape[0], dtype=bool)

    # Start from all-False and OR in each clause.
    overall_cond = np.zeros(X.shape[0], dtype=bool)
    for group in condition:
        # Start from all-True and AND in each key-value constraint.
        group_cond = np.ones(X.shape[0], dtype=bool)
        for name, val in group.items():
            index = feature_names.index(name)
            group_cond = np.logical_and(group_cond, X[:, index] == val)
        overall_cond = np.logical_or(overall_cond, group_cond)

    return overall_cond


def compute_num_instances(X, w, feature_names, condition=None):
    """Compute the number of instances, :math:`n`, conditioned on the protected
    attribute(s).

    Args:
        X (numpy.ndarray): Dataset features.
        w (numpy.ndarray): Instance weight vector.
        feature_names (list): Names of the features.
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        numpy.float64: Weighted number of instances (optionally conditioned),
        i.e. the sum of the weights of the selected rows.
    """
    # condition if necessary
    cond_vec = compute_boolean_conditioning_vector(X, feature_names, condition)

    return np.sum(w[cond_vec], dtype=np.float64)


def compute_num_pos_neg(X, y, w, feature_names, label, condition=None):
    """Compute the number of positives, :math:`P`, or negatives, :math:`N`,
    optionally conditioned on protected attributes.

    Args:
        X (numpy.ndarray): Dataset features.
        y (numpy.ndarray): Label vector.
        w (numpy.ndarray): Instance weight vector.
        feature_names (list): Names of the features.
        label (float): Value of label (favorable/positive or
            unfavorable/negative).
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        numpy.float64: Weighted number of positives/negatives (optionally
        conditioned), i.e. the sum of the weights of the selected rows whose
        label equals `label`.
    """
    # flatten so that `y == label` lines up element-wise with `cond_vec`
    y = y.ravel()
    cond_vec = compute_boolean_conditioning_vector(X, feature_names,
                                                   condition=condition)

    return np.sum(w[np.logical_and(y == label, cond_vec)], dtype=np.float64)


def compute_num_TF_PN(X, y_true, y_pred, w, feature_names, favorable_label,
                      unfavorable_label, condition=None):
    """Compute the number of true/false positives/negatives optionally
    conditioned on protected attributes.

    Args:
        X (numpy.ndarray): Dataset features.
        y_true (numpy.ndarray): True label vector.
        y_pred (numpy.ndarray): Predicted label vector.
        w (numpy.ndarray): Instance weight vector - the true and predicted
            datasets are supposed to have same instance level weights.
        feature_names (list): names of the features.
        favorable_label (float): Value of favorable/positive label.
        unfavorable_label (float): Value of unfavorable/negative label.
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        dict: Weighted counts (optionally conditioned) under the keys `'TP'`,
        `'FP'`, `'TN'` and `'FN'`, each a `numpy.float64`.
    """
    # condition if necessary
    cond_vec = compute_boolean_conditioning_vector(X, feature_names,
                                                   condition=condition)

    # to prevent broadcasts
    y_true = y_true.ravel()
    y_pred = y_pred.ravel()

    y_true_pos = (y_true == favorable_label)
    y_true_neg = (y_true == unfavorable_label)
    # The condition is folded into the prediction masks only; every count
    # below ANDs one true mask with one prediction mask, so that suffices.
    y_pred_pos = np.logical_and(y_pred == favorable_label, cond_vec)
    y_pred_neg = np.logical_and(y_pred == unfavorable_label, cond_vec)

    # True/false positives/negatives
    return dict(
        TP=np.sum(w[np.logical_and(y_true_pos, y_pred_pos)], dtype=np.float64),
        FP=np.sum(w[np.logical_and(y_true_neg, y_pred_pos)], dtype=np.float64),
        TN=np.sum(w[np.logical_and(y_true_neg, y_pred_neg)], dtype=np.float64),
        FN=np.sum(w[np.logical_and(y_true_pos, y_pred_neg)], dtype=np.float64)
    )


def compute_num_gen_TF_PN(X, y_true, y_score, w, feature_names,
                          favorable_label, unfavorable_label, condition=None):
    """Compute the number of generalized true/false positives/negatives
    optionally conditioned on protected attributes. Generalized counts are
    based on scores and not on the hard predictions.

    Args:
        X (numpy.ndarray): Dataset features.
        y_true (numpy.ndarray): True label vector.
        y_score (numpy.ndarray): Predicted score vector. Values range from 0
            to 1. 0 implies prediction for unfavorable label and 1 implies
            prediction for favorable label.
        w (numpy.ndarray): Instance weight vector - the true and predicted
            datasets are supposed to have same instance level weights.
        feature_names (list): names of the features.
        favorable_label (float): Value of favorable/positive label.
        unfavorable_label (float): Value of unfavorable/negative label.
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        dict: Weighted generalized counts (optionally conditioned) under the
        keys `'GTP'`, `'GFP'`, `'GTN'` and `'GFN'`, each a `numpy.float64`.
    """
    # condition if necessary
    cond_vec = compute_boolean_conditioning_vector(X, feature_names,
                                                   condition=condition)

    # to prevent broadcasts
    y_true = y_true.ravel()
    y_score = y_score.ravel()
    w = w.ravel()

    y_true_pos = np.logical_and(y_true == favorable_label, cond_vec)
    y_true_neg = np.logical_and(y_true == unfavorable_label, cond_vec)

    # Generalized true/false positives/negatives
    return dict(
        GTP=np.sum((w*y_score)[y_true_pos], dtype=np.float64),
        GFP=np.sum((w*y_score)[y_true_neg], dtype=np.float64),
        GTN=np.sum((w*(1.0-y_score))[y_true_neg], dtype=np.float64),
        GFN=np.sum((w*(1.0-y_score))[y_true_pos], dtype=np.float64)
    )


def compute_distance(X_orig, X_distort, X_prot, feature_names, dist_fun,
                     condition=None):
    """Compute the distance element-wise for two sets of vectors.

    Args:
        X_orig (numpy.ndarray): Original features.
        X_distort (numpy.ndarray): Distorted features. Shape must match
            `X_orig`.
        X_prot (numpy.ndarray): Protected attributes (used to compute
            condition). Should be same for both original and distorted.
        feature_names (list): Names of the protected features.
        dist_fun (function): Function which returns the distance (float)
            between two 1-D arrays (e.g.
            :func:`scipy.spatial.distance.euclidean`).
        condition (list(dict)): Same format as
            :func:`compute_boolean_conditioning_vector`.

    Returns:
        (numpy.ndarray(numpy.float64), numpy.ndarray(bool)):

            * Element-wise distances (1-D), one per row selected by the
              condition.
            * Condition vector (1-D).
    """
    cond_vec = compute_boolean_conditioning_vector(X_prot, feature_names,
                                                   condition=condition)

    # Apply the boolean mask once. Masking inside the loop would copy the
    # selected rows on every iteration.
    X_orig_cond = X_orig[cond_vec]
    X_distort_cond = X_distort[cond_vec]

    num_instances = X_orig_cond.shape[0]
    distance = np.zeros(num_instances, dtype=np.float64)
    for i in range(num_instances):
        distance[i] = dist_fun(X_orig_cond[i], X_distort_cond[i])

    return distance, cond_vec