import numpy as np
import warnings


class SubmodularPick(object):
    """Class for submodular pick

    Saves a representative sample of explanation objects using SP-LIME,
    as well as saving all generated explanations.

    First, a collection of candidate explanations is generated
    (see explain_instance). From these candidates, num_exps_desired are
    chosen using submodular pick (see the marcotcr et al. paper)."""

    def __init__(self,
                 explainer,
                 data,
                 predict_fn,
                 method='sample',
                 sample_size=1000,
                 num_exps_desired=5,
                 num_features=10,
                 **kwargs):
        """
        Args:
            explainer: an explainer object that provides explain_instance
                (e.g. a LimeTabularExplainer)
            data: a numpy array where each row is a single input into predict_fn
            predict_fn: prediction function. For classifiers, this should be a
                function that takes a numpy array and outputs prediction
                probabilities. For regressors, this takes a numpy array and
                returns the predictions. For ScikitClassifiers, this is
                `classifier.predict_proba()`. For ScikitRegressors, this is
                `regressor.predict()`. The prediction function needs to work
                on multiple feature vectors (the vectors randomly perturbed
                from the data_row).
            method: the method used to generate candidate explanations.
                If method == 'sample', the data is sampled uniformly at
                random, with the sample size given by sample_size. If
                method == 'full', explanations are generated for the
                entire data.
            sample_size: the number of instances to explain if method == 'sample'
            num_exps_desired: the number of explanation objects returned
            num_features: maximum number of features present in an explanation

        Sets value:
            sp_explanations: a list of explanation objects that has a high coverage
            explanations: all the candidate explanations, saved for potential
                future use.
        """
        top_labels = kwargs.pop('top_labels', 1)

        # Parse args
        if method == 'sample':
            if sample_size > len(data):
                warnings.warn("""Requested sample size larger than
                              size of input data. Using all data""")
                sample_size = len(data)
            all_indices = np.arange(len(data))
            np.random.shuffle(all_indices)
            sample_indices = all_indices[:sample_size]
        elif method == 'full':
            sample_indices = np.arange(len(data))
        else:
            raise ValueError("Method must be 'sample' or 'full'")

        # Generate candidate explanations, one per selected instance
        self.explanations = []
        for i in sample_indices:
            self.explanations.append(
                explainer.explain_instance(
                    data[i], predict_fn, num_features=num_features,
                    top_labels=top_labels, **kwargs))

        # Error handling
        try:
            num_exps_desired = int(num_exps_desired)
        except (TypeError, ValueError):
            raise ValueError(
                "Requested number of explanations should be an integer")
        if num_exps_desired > len(self.explanations):
            warnings.warn("""Requested number of explanations larger than
                           total number of explanations, returning all
                           explanations instead.""")
        num_exps_desired = min(num_exps_desired, len(self.explanations))
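        # Worked micro-example of the quantities built below (illustrative
        # values only, not executed): with two explanations over three
        # features,
        #   W = [[0.5, 0.0, 0.2],
        #        [0.0, 0.3, 0.0]]
        # the importances are I = sqrt([0.5, 0.3, 0.2]) ~= [0.71, 0.55, 0.45],
        # and picking explanation 0 alone covers features 0 and 2 for a
        # coverage of 0.71 + 0.45 ~= 1.16.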
        # Find all the explanation model features used. Defines the dimension d'
        features_dict = {}
        feature_iter = 0
        for exp in self.explanations:
            labels = exp.available_labels() if exp.mode == 'classification' else [1]
            for label in labels:
                for feature, _ in exp.as_list(label=label):
                    if feature not in features_dict:
                        features_dict[feature] = feature_iter
                        feature_iter += 1
        d_prime = len(features_dict)

        # Create the n x d' dimensional 'explanation matrix', W
        W = np.zeros((len(self.explanations), d_prime))
        for i, exp in enumerate(self.explanations):
            labels = exp.available_labels() if exp.mode == 'classification' else [1]
            for label in labels:
                for feature, value in exp.as_list(label=label):
                    W[i, features_dict[feature]] += value

        # Create the global importance vector described in the paper,
        # I_j = sqrt(sum_i |W_ij|)
        importance = np.sum(np.abs(W), axis=0) ** .5

        # Now run the SP-LIME greedy algorithm: at each step, add the
        # explanation that most increases the total importance of the
        # features covered so far
        remaining_indices = set(range(len(self.explanations)))
        V = []
        for _ in range(num_exps_desired):
            best = 0
            best_ind = None
            for i in remaining_indices:
                # Coverage function: summed importance of every feature that
                # appears with nonzero weight in at least one of the
                # explanations indexed by V + [i]
                current = np.dot(
                    np.sum(np.abs(W[V + [i]]), axis=0) > 0, importance)
                if current >= best:
                    best = current
                    best_ind = i
            V.append(best_ind)
            remaining_indices -= {best_ind}

        self.sp_explanations = [self.explanations[i] for i in V]
        self.V = V
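

if __name__ == '__main__':
    # Minimal usage sketch, assuming scikit-learn and LIME's tabular
    # explainer are available. The iris dataset, the random forest model,
    # and the variable names below are illustrative choices, not part of
    # this module.
    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from lime.lime_tabular import LimeTabularExplainer

    iris = load_iris()
    model = RandomForestClassifier(random_state=0).fit(iris.data, iris.target)
    explainer = LimeTabularExplainer(
        iris.data,
        feature_names=iris.feature_names,
        class_names=iris.target_names,
        mode='classification')

    # Explain 20 random instances and keep the 3 with the best coverage
    sp = SubmodularPick(explainer, iris.data, model.predict_proba,
                        method='sample', sample_size=20,
                        num_exps_desired=3, num_features=4)
    for exp in sp.sp_explanations:
        print(exp.as_list(label=exp.available_labels()[0]))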