strexp / lime /discretize.py
markytools's picture
added strexp
d61b9c7
"""
Discretizer classes, to be used in lime_tabular
"""
import numpy as np
import sklearn
import sklearn.tree
import scipy
from sklearn.utils import check_random_state
from abc import ABCMeta, abstractmethod
class BaseDiscretizer():
    """
    Abstract class - Build a class that inherits from this class to implement
    a custom discretizer.
    Method bins() is to be redefined in the child class, as it is the actual
    custom part of the discretizer.

    After construction the instance holds, keyed by feature index:
        names: human-readable label of every bin
        lambdas: callable mapping raw values to bin indices
        means, stds: per-bin mean / standard deviation
        mins, maxs: per-bin lower / upper bound
    """

    # Python 2 style metaclass declaration, kept unchanged so that
    # instantiation behaviour stays exactly as before.
    __metaclass__ = ABCMeta  # abstract class

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None,
                 data_stats=None):
        """Initializer
        Args:
            data: numpy 2d array
            categorical_features: list of indices (ints) corresponding to the
                categorical columns. These features will not be discretized.
                Everything else will be considered continuous, and will be
                discretized.
            feature_names: list of names (strings) corresponding to the columns
                in the training data.
            labels: forwarded unchanged to bins(); only needed by
                discretizers whose binning is supervised.
            random_state: passed through sklearn's check_random_state and
                used when sampling in undiscretize().
            data_stats: must have 'means', 'stds', 'mins' and 'maxs', use this
                if you don't want these values to be computed from data
        """
        self.to_discretize = ([x for x in range(data.shape[1])
                               if x not in categorical_features])
        self.data_stats = data_stats
        self.names = {}
        self.lambdas = {}
        self.means = {}
        self.stds = {}
        self.mins = {}
        self.maxs = {}
        self.random_state = check_random_state(random_state)

        # To override when implementing a custom binning
        bins = self.bins(data, labels)
        # np.unique both sorts the borders and drops duplicates
        bins = [np.unique(x) for x in bins]

        # Read the stats from data_stats if exists
        if data_stats:
            self.means = self.data_stats.get("means")
            self.stds = self.data_stats.get("stds")
            self.mins = self.data_stats.get("mins")
            self.maxs = self.data_stats.get("maxs")

        for feature, qts in zip(self.to_discretize, bins):
            n_bins = qts.shape[0]  # Actually number of borders (= #bins-1)
            name = feature_names[feature]

            self.names[feature] = ['%s <= %.2f' % (name, qts[0])]
            for i in range(n_bins - 1):
                self.names[feature].append('%.2f < %s <= %.2f' %
                                           (qts[i], name, qts[i + 1]))
            self.names[feature].append('%s > %.2f' % (name, qts[n_bins - 1]))

            # qts is bound as a default argument so every lambda keeps its own
            # borders instead of the loop's last value.
            self.lambdas[feature] = lambda x, qts=qts: np.searchsorted(qts, x)

            # If data stats are provided no need to compute the below set of details
            if data_stats:
                continue

            discretized = self.lambdas[feature](data[:, feature])
            boundaries = np.min(data[:, feature]), np.max(data[:, feature])

            self.means[feature] = []
            self.stds[feature] = []
            for x in range(n_bins + 1):
                selection = data[discretized == x, feature]
                mean = 0 if len(selection) == 0 else np.mean(selection)
                self.means[feature].append(mean)
                std = 0 if len(selection) == 0 else np.std(selection)
                # small offset keeps the later division by std finite
                std += 0.00000000001
                self.stds[feature].append(std)
            self.mins[feature] = [boundaries[0]] + qts.tolist()
            self.maxs[feature] = qts.tolist() + [boundaries[1]]

    @abstractmethod
    def bins(self, data, labels):
        """
        To be overridden
        Returns for each feature to discretize the boundaries
        that form each bin of the discretizer
        """
        raise NotImplementedError("Must override bins() method")

    def discretize(self, data):
        """Discretizes the data.
        Args:
            data: numpy 2d or 1d array
        Returns:
            numpy array of same dimension, discretized.
        """
        ret = data.copy()
        for feature in self.lambdas:
            if len(data.shape) == 1:
                ret[feature] = int(self.lambdas[feature](ret[feature]))
            else:
                ret[:, feature] = self.lambdas[feature](
                    ret[:, feature]).astype(int)
        return ret

    def get_undiscretize_values(self, feature, values):
        """Samples a continuous value for each given bin index of a feature.
        Args:
            feature: index of the feature the bins belong to
            values: integer numpy array of bin indices
        Returns:
            float numpy array shaped like values. Every entry is drawn from
            a normal distribution with the bin's mean and std, truncated to
            the bin's [min, max] range. For a bin whose min and max coincide
            the bound itself is returned.
        """
        mins = np.array(self.mins[feature])[values]
        maxs = np.array(self.maxs[feature])[values]
        means = np.array(self.means[feature])[values]
        stds = np.array(self.stds[feature])[values]

        # truncnorm takes its clipping bounds in standardized units
        minz = (mins - means) / stds
        maxz = (maxs - means) / stds
        min_max_unequal = (minz != maxz)

        # Degenerate bins (min == max) admit a single value: the bound itself,
        # expressed in the feature's own units rather than as a z-score.
        ret = mins.astype(float)
        # Skip the sampler entirely when there is nothing to draw.
        if min_max_unequal.any():
            ret[min_max_unequal] = scipy.stats.truncnorm.rvs(
                minz[min_max_unequal],
                maxz[min_max_unequal],
                loc=means[min_max_unequal],
                scale=stds[min_max_unequal],
                random_state=self.random_state
            )
        return ret

    def undiscretize(self, data):
        """Maps discretized data back to sampled continuous values.
        Args:
            data: numpy 2d or 1d array of bin indices
        Returns:
            numpy array of same dimension, where every feature that has bin
            statistics is replaced by values from get_undiscretize_values.
        """
        ret = data.copy()
        for feature in self.means:
            if len(data.shape) == 1:
                # Wrap the single bin index in a 1d array and unwrap the
                # result, so a plain scalar is stored in the row.
                bin_index = np.asarray(ret[feature]).astype(int).reshape(-1)
                ret[feature] = self.get_undiscretize_values(
                    feature, bin_index
                )[0]
            else:
                ret[:, feature] = self.get_undiscretize_values(
                    feature, ret[:, feature].astype(int)
                )
        return ret
class StatsDiscretizer(BaseDiscretizer):
    """
    Discretizer that takes its bin borders from the supplied data_stats
    instead of deriving them from the data. Used to pass in the data stats
    info when discretize_continuous is true.
    """

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None,
                 data_stats=None):
        BaseDiscretizer.__init__(
            self, data, categorical_features, feature_names,
            labels=labels, random_state=random_state, data_stats=data_stats)

    def bins(self, data, labels):
        """
        Returns the borders stored under data_stats["bins"], one numpy array
        per feature to discretize, in the order of self.to_discretize.
        Returns an empty list when data_stats has no "bins" entry.
        """
        stats_bins = self.data_stats.get("bins")
        if stats_bins is None:
            return []
        # NOTE(review): features without an entry are skipped, so the result
        # can be shorter than self.to_discretize - confirm callers always
        # supply borders for every feature to discretize.
        per_feature = (stats_bins.get(idx) for idx in self.to_discretize)
        return [np.array(borders) for borders in per_feature
                if borders is not None]
class QuartileDiscretizer(BaseDiscretizer):
    """
    Discretizer that splits every continuous feature at its quartiles.
    """

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
        BaseDiscretizer.__init__(
            self, data, categorical_features, feature_names,
            labels=labels, random_state=random_state)

    def bins(self, data, labels):
        """
        Returns, for each feature to discretize, the 25th, 50th and 75th
        percentile of its column in data. labels is not used.
        """
        quartiles = [25, 50, 75]
        return [np.array(np.percentile(data[:, column], quartiles))
                for column in self.to_discretize]
class DecileDiscretizer(BaseDiscretizer):
    """
    Discretizer that splits every continuous feature at its deciles.
    """

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
        BaseDiscretizer.__init__(
            self, data, categorical_features, feature_names,
            labels=labels, random_state=random_state)

    def bins(self, data, labels):
        """
        Returns, for each feature to discretize, the 10th through 90th
        percentile (in steps of 10) of its column in data. labels is not
        used.
        """
        deciles = list(range(10, 100, 10))
        return [np.array(np.percentile(data[:, column], deciles))
                for column in self.to_discretize]
class EntropyDiscretizer(BaseDiscretizer):
    """
    Discretizer that picks bin borders from the split thresholds of a
    shallow, entropy-based decision tree fitted on each feature separately.
    """

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
        """Initializer
        Args: same as BaseDiscretizer, except that labels is mandatory.
        Raises:
            ValueError: if labels is None.
        """
        if labels is None:
            # Adjacent literals instead of a backslash continuation inside
            # the string, which would pull the next line's indentation into
            # the message.
            raise ValueError('Labels must be not None when using '
                             'EntropyDiscretizer')
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        """
        Returns, for each feature to discretize, the sorted split thresholds
        of a decision tree fitted on that single feature against labels.
        Falls back to the feature's median when the tree has no split.
        """
        bins = []
        for feature in self.to_discretize:
            # Entropy splitting / at most 8 bins so max_depth=3
            dt = sklearn.tree.DecisionTreeClassifier(criterion='entropy',
                                                     max_depth=3,
                                                     random_state=self.random_state)
            x = np.reshape(data[:, feature], (-1, 1))
            dt.fit(x, labels)
            # Keep only nodes that have a left child, i.e. actual splits.
            qts = dt.tree_.threshold[np.where(dt.tree_.children_left > -1)]

            if qts.shape[0] == 0:
                qts = np.array([np.median(data[:, feature])])
            else:
                qts = np.sort(qts)

            bins.append(qts)

        return bins