""" | |
Discretizers classes, to be used in lime_tabular | |
""" | |
import numpy as np | |
import sklearn | |
import sklearn.tree | |
import scipy | |
from sklearn.utils import check_random_state | |
from abc import ABCMeta, abstractmethod | |


class BaseDiscretizer(metaclass=ABCMeta):
    """
    Abstract class - Build a class that inherits from this class to implement
    a custom discretizer.

    Method bins() is to be redefined in the child class, as it is the actual
    custom part of the discretizer.
    """
    def __init__(self, data, categorical_features, feature_names, labels=None,
                 random_state=None, data_stats=None):
        """Initializer

        Args:
            data: numpy 2d array
            categorical_features: list of indices (ints) corresponding to the
                categorical columns. These features will not be discretized.
                Everything else will be considered continuous, and will be
                discretized.
            feature_names: list of names (strings) corresponding to the
                columns in the training data.
            labels: numpy 1d array of labels, used by discretizers that take
                the target into account (e.g. EntropyDiscretizer).
            random_state: integer or numpy.RandomState used to generate
                random numbers (passed to sklearn's check_random_state).
            data_stats: dict that must have the keys 'means', 'stds', 'mins'
                and 'maxs'; use this if you don't want these values to be
                computed from data.
        """
        self.to_discretize = ([x for x in range(data.shape[1])
                               if x not in categorical_features])
        self.data_stats = data_stats
        self.names = {}
        self.lambdas = {}
        self.means = {}
        self.stds = {}
        self.mins = {}
        self.maxs = {}
        self.random_state = check_random_state(random_state)

        # To override when implementing a custom binning
        bins = self.bins(data, labels)
        bins = [np.unique(x) for x in bins]

        # Read the stats from data_stats if provided
        if data_stats:
            self.means = self.data_stats.get("means")
            self.stds = self.data_stats.get("stds")
            self.mins = self.data_stats.get("mins")
            self.maxs = self.data_stats.get("maxs")

        for feature, qts in zip(self.to_discretize, bins):
            n_bins = qts.shape[0]  # Actually number of borders (= #bins-1)
            boundaries = np.min(data[:, feature]), np.max(data[:, feature])
            name = feature_names[feature]

            self.names[feature] = ['%s <= %.2f' % (name, qts[0])]
            for i in range(n_bins - 1):
                self.names[feature].append('%.2f < %s <= %.2f' %
                                           (qts[i], name, qts[i + 1]))
            self.names[feature].append('%s > %.2f' % (name, qts[n_bins - 1]))

            # qts is bound as a default argument so that each lambda keeps
            # its own copy of the boundaries
            self.lambdas[feature] = lambda x, qts=qts: np.searchsorted(qts, x)
            discretized = self.lambdas[feature](data[:, feature])

            # If data stats are provided, the per-bin statistics below do not
            # need to be computed
            if data_stats:
                continue

            self.means[feature] = []
            self.stds[feature] = []
            for x in range(n_bins + 1):
                selection = data[discretized == x, feature]
                mean = 0 if len(selection) == 0 else np.mean(selection)
                self.means[feature].append(mean)
                std = 0 if len(selection) == 0 else np.std(selection)
                std += 1e-11  # avoid a zero std (division by zero downstream)
                self.stds[feature].append(std)
            self.mins[feature] = [boundaries[0]] + qts.tolist()
            self.maxs[feature] = qts.tolist() + [boundaries[1]]

    @abstractmethod
    def bins(self, data, labels):
        """
        To be overridden

        Returns for each feature to discretize the boundaries
        that form each bin of the discretizer
        """
        raise NotImplementedError("Must override bins() method")

    def discretize(self, data):
        """Discretizes the data.

        Args:
            data: numpy 2d or 1d array

        Returns:
            numpy array of same dimension, discretized.
        """
        ret = data.copy()
        for feature in self.lambdas:
            if len(data.shape) == 1:
                ret[feature] = int(self.lambdas[feature](ret[feature]))
            else:
                ret[:, feature] = self.lambdas[feature](
                    ret[:, feature]).astype(int)
        return ret

    def get_undiscretize_values(self, feature, values):
        # Map bin ids back to concrete values by sampling from a normal
        # distribution truncated to each bin's [min, max] interval
        mins = np.array(self.mins[feature])[values]
        maxs = np.array(self.maxs[feature])[values]

        means = np.array(self.means[feature])[values]
        stds = np.array(self.stds[feature])[values]
        minz = (mins - means) / stds
        maxz = (maxs - means) / stds
        min_max_unequal = (minz != maxz)

        # Entries with a non-degenerate interval are replaced by samples from
        # the truncated normal; the rest keep minz as a fallback
        ret = minz
        ret[np.where(min_max_unequal)] = scipy.stats.truncnorm.rvs(
            minz[min_max_unequal],
            maxz[min_max_unequal],
            loc=means[min_max_unequal],
            scale=stds[min_max_unequal],
            random_state=self.random_state
        )
        return ret

    def undiscretize(self, data):
        ret = data.copy()
        for feature in self.means:
            if len(data.shape) == 1:
                ret[feature] = self.get_undiscretize_values(
                    feature, ret[feature].astype(int).reshape(-1, 1)
                )
            else:
                ret[:, feature] = self.get_undiscretize_values(
                    feature, ret[:, feature].astype(int)
                )
        return ret
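

# A minimal sketch of how a custom discretizer can be built on top of
# BaseDiscretizer: only bins() has to be implemented. The MedianDiscretizer
# below is a hypothetical example (not part of the library) that splits every
# continuous feature at its median, yielding two bins per feature.
class MedianDiscretizer(BaseDiscretizer):
    def __init__(self, data, categorical_features, feature_names,
                 labels=None, random_state=None):
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        # One boundary (the median) per continuous feature
        return [np.array([np.median(data[:, feature])])
                for feature in self.to_discretize]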


class StatsDiscretizer(BaseDiscretizer):
    """
    Class to be used to supply the data stats info when discretize_continuous
    is true
    """

    def __init__(self, data, categorical_features, feature_names, labels=None,
                 random_state=None, data_stats=None):
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state,
                                 data_stats=data_stats)

    def bins(self, data, labels):
        bins_from_stats = self.data_stats.get("bins")
        bins = []
        if bins_from_stats is not None:
            for feature in self.to_discretize:
                bins_from_stats_feature = bins_from_stats.get(feature)
                if bins_from_stats_feature is not None:
                    qts = np.array(bins_from_stats_feature)
                    bins.append(qts)
        return bins
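
# For illustration, a data_stats dict consumed by StatsDiscretizer is assumed
# to look roughly like the following (all numbers hypothetical; the keys of
# the nested dicts are feature indices):
#
#   data_stats = {
#       "bins": {0: [2.0]},          # bin boundaries per feature
#       "means": {0: [1.2, 3.4]},    # per-bin means
#       "stds": {0: [0.5, 0.7]},     # per-bin standard deviations
#       "mins": {0: [0.0, 2.0]},     # per-bin lower bounds
#       "maxs": {0: [2.0, 5.0]},     # per-bin upper bounds
#   }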


class QuartileDiscretizer(BaseDiscretizer):
    """Discretizes continuous features into quartiles (4 bins)."""

    def __init__(self, data, categorical_features, feature_names, labels=None,
                 random_state=None):
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        bins = []
        for feature in self.to_discretize:
            qts = np.array(np.percentile(data[:, feature], [25, 50, 75]))
            bins.append(qts)
        return bins
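
# Illustrative example (hypothetical numbers): if a feature's 25th, 50th and
# 75th percentiles are 2.0, 5.0 and 8.0, bins() returns np.array([2.0, 5.0,
# 8.0]) for that feature, and discretize() then maps raw values to bin ids
# 0-3 via np.searchsorted.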


class DecileDiscretizer(BaseDiscretizer):
    """Discretizes continuous features into deciles (10 bins)."""

    def __init__(self, data, categorical_features, feature_names, labels=None,
                 random_state=None):
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        bins = []
        for feature in self.to_discretize:
            qts = np.array(np.percentile(data[:, feature],
                                         [10, 20, 30, 40, 50, 60, 70, 80, 90]))
            bins.append(qts)
        return bins


class EntropyDiscretizer(BaseDiscretizer):
    """Discretizes continuous features at the split points of a shallow
    decision tree fit against the labels."""

    def __init__(self, data, categorical_features, feature_names, labels=None,
                 random_state=None):
        if labels is None:
            raise ValueError('Labels must not be None when using '
                             'EntropyDiscretizer')
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        bins = []
        for feature in self.to_discretize:
            # Entropy splitting / at most 8 bins so max_depth=3
            dt = sklearn.tree.DecisionTreeClassifier(criterion='entropy',
                                                     max_depth=3,
                                                     random_state=self.random_state)
            x = np.reshape(data[:, feature], (-1, 1))
            dt.fit(x, labels)
            # Internal nodes have a left child; their thresholds are the
            # split points
            qts = dt.tree_.threshold[np.where(dt.tree_.children_left > -1)]
            if qts.shape[0] == 0:
                # The tree made no split: fall back to the median
                qts = np.array([np.median(data[:, feature])])
            else:
                qts = np.sort(qts)
            bins.append(qts)
        return bins
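

# A minimal, self-contained usage sketch (synthetic data, hypothetical
# feature names): discretize a random dataset into quartiles, then map the
# bin ids back to continuous values with undiscretize().
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    X = rng.normal(size=(100, 3))
    discretizer = QuartileDiscretizer(X, categorical_features=[],
                                      feature_names=['f0', 'f1', 'f2'],
                                      random_state=0)
    X_disc = discretizer.discretize(X)         # bin ids in {0, 1, 2, 3}
    X_cont = discretizer.undiscretize(X_disc)  # per-bin truncated-normal samples
    print(discretizer.names[0])                # human-readable bin labels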