"""
Discretizers classes, to be used in lime_tabular
"""
import numpy as np
import sklearn
import sklearn.tree
import scipy
import scipy.stats
from sklearn.utils import check_random_state
from abc import ABCMeta, abstractmethod


class BaseDiscretizer():
    """
    Abstract class - Build a class that inherits from this class to implement
    a custom discretizer.
    Method bins() is to be redefined in the child class, as it is the actual
    custom part of the discretizer.
    """

    # Python 2 style metaclass declaration; Python 3 ignores this attribute,
    # so abstractness is effectively enforced by bins() raising
    # NotImplementedError.
    __metaclass__ = ABCMeta  # abstract class

    def __init__(self, data, categorical_features, feature_names,
                 labels=None, random_state=None, data_stats=None):
        """Initializer

        Builds, for every non-categorical column, the bin names, the
        discretization function and (unless data_stats is given) the
        per-bin statistics used later by undiscretize().

        Args:
            data: numpy 2d array
            categorical_features: list of indices (ints) corresponding to the
                categorical columns. These features will not be discretized.
                Everything else will be considered continuous, and will be
                discretized.
            feature_names: list of names (strings) corresponding to the
                columns in the training data.
            labels: forwarded unchanged to bins(); only some discretizers
                use it.
            random_state: passed through sklearn's check_random_state; the
                result is used when sampling in get_undiscretize_values().
            data_stats: must have 'means', 'stds', 'mins' and 'maxs', use this
                if you don't want these values to be computed from data
        """
        self.to_discretize = ([x for x in range(data.shape[1])
                               if x not in categorical_features])
        self.data_stats = data_stats
        self.names = {}
        self.lambdas = {}
        self.means = {}
        self.stds = {}
        self.mins = {}
        self.maxs = {}
        self.random_state = check_random_state(random_state)

        # To override when implementing a custom binning
        bins = self.bins(data, labels)
        # np.unique sorts the borders and drops duplicates
        bins = [np.unique(x) for x in bins]

        # Read the stats from data_stats if exists
        if data_stats:
            self.means = self.data_stats.get("means")
            self.stds = self.data_stats.get("stds")
            self.mins = self.data_stats.get("mins")
            self.maxs = self.data_stats.get("maxs")

        for feature, qts in zip(self.to_discretize, bins):
            n_bins = qts.shape[0]  # Actually number of borders (= #bins-1)
            boundaries = np.min(data[:, feature]), np.max(data[:, feature])
            name = feature_names[feature]

            self.names[feature] = ['%s <= %.2f' % (name, qts[0])]
            for i in range(n_bins - 1):
                self.names[feature].append('%.2f < %s <= %.2f' %
                                           (qts[i], name, qts[i + 1]))
            self.names[feature].append('%s > %.2f' % (name, qts[n_bins - 1]))

            # qts is bound as a default argument so every lambda keeps its
            # own borders instead of the last loop value.
            self.lambdas[feature] = lambda x, qts=qts: np.searchsorted(qts, x)
            discretized = self.lambdas[feature](data[:, feature])

            # If data stats are provided no need to compute the below set
            # of details
            if data_stats:
                continue

            self.means[feature] = []
            self.stds[feature] = []
            for x in range(n_bins + 1):
                selection = data[discretized == x, feature]
                mean = 0 if len(selection) == 0 else np.mean(selection)
                self.means[feature].append(mean)
                std = 0 if len(selection) == 0 else np.std(selection)
                # Small offset keeps the std strictly positive so it can be
                # used as a divisor / scale later on.
                std += 0.00000000001
                self.stds[feature].append(std)
            self.mins[feature] = [boundaries[0]] + qts.tolist()
            self.maxs[feature] = qts.tolist() + [boundaries[1]]

    @abstractmethod
    def bins(self, data, labels):
        """
        To be overridden
        Returns for each feature to discretize the boundaries
        that form each bin of the discretizer
        """
        raise NotImplementedError("Must override bins() method")

    def discretize(self, data):
        """Discretizes the data.

        Args:
            data: numpy 2d or 1d array

        Returns:
            numpy array of same dimension, discretized.
        """
        ret = data.copy()
        for feature in self.lambdas:
            if len(data.shape) == 1:
                ret[feature] = int(self.lambdas[feature](ret[feature]))
            else:
                ret[:, feature] = self.lambdas[feature](
                    ret[:, feature]).astype(int)
        return ret

    def get_undiscretize_values(self, feature, values):
        """Samples feature values for the given bin indices.

        For every bin index a value is drawn from a normal distribution with
        the bin's mean and std, truncated to the bin's [min, max] range.
        Bins whose lower and upper bounds coincide are not sampled; their
        single possible value (the bound itself) is returned.

        Args:
            feature: key of the feature in the means/stds/mins/maxs tables.
            values: integer numpy array of bin indices.

        Returns:
            float numpy array with the same shape as values.
        """
        mins = np.array(self.mins[feature])[values]
        maxs = np.array(self.maxs[feature])[values]
        means = np.array(self.means[feature])[values]
        stds = np.array(self.stds[feature])[values]

        # truncnorm takes its clipping points in standardized units
        minz = (mins - means) / stds
        maxz = (maxs - means) / stds
        min_max_unequal = (minz != maxz)

        # Start from the lower bounds (a fresh float copy) so degenerate bins
        # yield a value in feature space rather than a z-score.
        ret = np.array(mins, dtype=float)
        if np.any(min_max_unequal):
            ret[min_max_unequal] = scipy.stats.truncnorm.rvs(
                minz[min_max_unequal],
                maxz[min_max_unequal],
                loc=means[min_max_unequal],
                scale=stds[min_max_unequal],
                random_state=self.random_state
            )
        return ret

    def undiscretize(self, data):
        """Replaces bin indices by values sampled from the matching bins.

        Args:
            data: numpy 2d or 1d array holding bin indices in the
                discretized columns.

        Returns:
            copy of data where every column listed in self.means has been
            replaced by the output of get_undiscretize_values().
        """
        ret = data.copy()
        for feature in self.means:
            if len(data.shape) == 1:
                sampled = self.get_undiscretize_values(
                    feature, ret[feature].astype(int).reshape(-1, 1)
                )
                # Pull out the single element explicitly: assigning a
                # size-1 array with ndim > 0 to a scalar slot is deprecated
                # in NumPy.
                ret[feature] = np.ravel(sampled)[0]
            else:
                ret[:, feature] = self.get_undiscretize_values(
                    feature, ret[:, feature].astype(int)
                )
        return ret


class StatsDiscretizer(BaseDiscretizer):
    """
    Class to be used to supply the data stats info when
    discretize_continuous is true
    """

    def __init__(self, data, categorical_features, feature_names,
                 labels=None, random_state=None, data_stats=None):
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state,
                                 data_stats=data_stats)

    def bins(self, data, labels):
        """Returns the bin borders stored under data_stats["bins"].

        NOTE(review): a feature without an entry is skipped, so the returned
        list can be shorter than self.to_discretize; the base class pairs the
        two lists by position, so a gap in the middle would shift the
        following bins onto the wrong features - confirm callers always
        supply bins for every feature.
        """
        bins_from_stats = self.data_stats.get("bins")
        bins = []
        if bins_from_stats is not None:
            for feature in self.to_discretize:
                bins_from_stats_feature = bins_from_stats.get(feature)
                if bins_from_stats_feature is not None:
                    qts = np.array(bins_from_stats_feature)
                    bins.append(qts)
        return bins


class QuartileDiscretizer(BaseDiscretizer):
    """Discretizer whose borders are the 25th, 50th and 75th percentiles."""

    def __init__(self, data, categorical_features, feature_names,
                 labels=None, random_state=None):
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        """Returns the quartile borders of every feature to discretize."""
        bins = []
        for feature in self.to_discretize:
            qts = np.array(np.percentile(data[:, feature], [25, 50, 75]))
            bins.append(qts)
        return bins


class DecileDiscretizer(BaseDiscretizer):
    """Discretizer whose borders are the 10th, 20th, ..., 90th percentiles."""

    def __init__(self, data, categorical_features, feature_names,
                 labels=None, random_state=None):
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        """Returns the decile borders of every feature to discretize."""
        bins = []
        for feature in self.to_discretize:
            qts = np.array(np.percentile(data[:, feature],
                                         [10, 20, 30, 40, 50, 60, 70, 80, 90]))
            bins.append(qts)
        return bins


class EntropyDiscretizer(BaseDiscretizer):
    """Discretizer whose borders are the split thresholds of a shallow
    entropy-based decision tree fitted on each feature separately."""

    def __init__(self, data, categorical_features, feature_names,
                 labels=None, random_state=None):
        """Initializer

        Raises:
            ValueError: if labels is None.
        """
        if labels is None:
            raise ValueError(
                'Labels must be not None when using EntropyDiscretizer')
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        """Returns the sorted tree thresholds of every feature to discretize.

        Falls back to the feature's median when the fitted tree has no split.
        """
        bins = []
        for feature in self.to_discretize:
            # Entropy splitting / at most 8 bins so max_depth=3
            dt = sklearn.tree.DecisionTreeClassifier(
                criterion='entropy',
                max_depth=3,
                random_state=self.random_state)
            x = np.reshape(data[:, feature], (-1, 1))
            dt.fit(x, labels)
            # Only nodes that have a left child carry a split threshold
            qts = dt.tree_.threshold[np.where(dt.tree_.children_left > -1)]

            if qts.shape[0] == 0:
                qts = np.array([np.median(data[:, feature])])
            else:
                qts = np.sort(qts)

            bins.append(qts)

        return bins