"""
Discretizer classes, to be used in lime_tabular
"""
import numpy as np
import sklearn
import sklearn.tree
import scipy.stats
from sklearn.utils import check_random_state
from abc import ABCMeta, abstractmethod


class BaseDiscretizer(metaclass=ABCMeta):
"""
Abstract class - Build a class that inherits from this class to implement
a custom discretizer.
Method bins() is to be redefined in the child class, as it is the actual
custom part of the discretizer.
"""
def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None,
data_stats=None):
"""Initializer
Args:
data: numpy 2d array
categorical_features: list of indices (ints) corresponding to the
categorical columns. These features will not be discretized.
Everything else will be considered continuous, and will be
discretized.
categorical_names: map from int to list of names, where
categorical_names[x][y] represents the name of the yth value of
column x.
feature_names: list of names (strings) corresponding to the columns
in the training data.
data_stats: must have 'means', 'stds', 'mins' and 'maxs', use this
if you don't want these values to be computed from data
"""
self.to_discretize = ([x for x in range(data.shape[1])
if x not in categorical_features])
self.data_stats = data_stats
self.names = {}
self.lambdas = {}
self.means = {}
self.stds = {}
self.mins = {}
self.maxs = {}
self.random_state = check_random_state(random_state)
# To override when implementing a custom binning
bins = self.bins(data, labels)
bins = [np.unique(x) for x in bins]
        # Read the stats from data_stats if it was provided
if data_stats:
self.means = self.data_stats.get("means")
self.stds = self.data_stats.get("stds")
self.mins = self.data_stats.get("mins")
self.maxs = self.data_stats.get("maxs")
for feature, qts in zip(self.to_discretize, bins):
n_bins = qts.shape[0] # Actually number of borders (= #bins-1)
boundaries = np.min(data[:, feature]), np.max(data[:, feature])
name = feature_names[feature]
self.names[feature] = ['%s <= %.2f' % (name, qts[0])]
for i in range(n_bins - 1):
self.names[feature].append('%.2f < %s <= %.2f' %
(qts[i], name, qts[i + 1]))
self.names[feature].append('%s > %.2f' % (name, qts[n_bins - 1]))
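            # Bin id for a value x is the number of boundaries below it:
            # np.searchsorted(qts, x) is 0 for x <= qts[0] and n_bins for
            # x > qts[-1]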
self.lambdas[feature] = lambda x, qts=qts: np.searchsorted(qts, x)
discretized = self.lambdas[feature](data[:, feature])
            # If data stats were provided, the per-bin means/stds/mins/maxs
            # below don't need to be computed
if data_stats:
continue
self.means[feature] = []
self.stds[feature] = []
for x in range(n_bins + 1):
selection = data[discretized == x, feature]
mean = 0 if len(selection) == 0 else np.mean(selection)
self.means[feature].append(mean)
std = 0 if len(selection) == 0 else np.std(selection)
                std += 1e-11  # keep std strictly positive to avoid division by zero
self.stds[feature].append(std)
self.mins[feature] = [boundaries[0]] + qts.tolist()
self.maxs[feature] = qts.tolist() + [boundaries[1]]

@abstractmethod
def bins(self, data, labels):
"""
To be overridden
Returns for each feature to discretize the boundaries
that form each bin of the discretizer
"""
raise NotImplementedError("Must override bins() method")

def discretize(self, data):
"""Discretizes the data.
Args:
data: numpy 2d or 1d array
Returns:
numpy array of same dimension, discretized.
"""
ret = data.copy()
for feature in self.lambdas:
if len(data.shape) == 1:
ret[feature] = int(self.lambdas[feature](ret[feature]))
else:
ret[:, feature] = self.lambdas[feature](
ret[:, feature]).astype(int)
return ret

def get_undiscretize_values(self, feature, values):
mins = np.array(self.mins[feature])[values]
maxs = np.array(self.maxs[feature])[values]
means = np.array(self.means[feature])[values]
stds = np.array(self.stds[feature])[values]
minz = (mins - means) / stds
maxz = (maxs - means) / stds
min_max_unequal = (minz != maxz)
ret = minz
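        # Draw replacement values from a normal distribution centered on the
        # bin's mean and truncated to the bin's [min, max] range; truncnorm
        # takes its bounds in standard-deviation units, hence minz and maxz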
ret[np.where(min_max_unequal)] = scipy.stats.truncnorm.rvs(
minz[min_max_unequal],
maxz[min_max_unequal],
loc=means[min_max_unequal],
scale=stds[min_max_unequal],
random_state=self.random_state
)
return ret

def undiscretize(self, data):
ret = data.copy()
for feature in self.means:
if len(data.shape) == 1:
ret[feature] = self.get_undiscretize_values(
feature, ret[feature].astype(int).reshape(-1, 1)
)
else:
ret[:, feature] = self.get_undiscretize_values(
feature, ret[:, feature].astype(int)
)
return ret


class StatsDiscretizer(BaseDiscretizer):
    """
    Discretizer to be used when the data stats ('means', 'stds', 'mins',
    'maxs' and 'bins') are supplied through data_stats instead of being
    computed, e.g. when discretize_continuous is true
    """
def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None,
data_stats=None):
BaseDiscretizer.__init__(self, data, categorical_features,
feature_names, labels=labels,
random_state=random_state,
data_stats=data_stats)

def bins(self, data, labels):
bins_from_stats = self.data_stats.get("bins")
bins = []
if bins_from_stats is not None:
for feature in self.to_discretize:
bins_from_stats_feature = bins_from_stats.get(feature)
if bins_from_stats_feature is not None:
qts = np.array(bins_from_stats_feature)
bins.append(qts)
return bins


class QuartileDiscretizer(BaseDiscretizer):
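    """Discretizes continuous features at the quartile boundaries
    (25th, 50th and 75th percentiles) of each feature."""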
def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
BaseDiscretizer.__init__(self, data, categorical_features,
feature_names, labels=labels,
random_state=random_state)

def bins(self, data, labels):
bins = []
for feature in self.to_discretize:
qts = np.array(np.percentile(data[:, feature], [25, 50, 75]))
bins.append(qts)
return bins


class DecileDiscretizer(BaseDiscretizer):
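    """Discretizes continuous features at the decile boundaries
    (10th through 90th percentiles) of each feature."""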
def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
BaseDiscretizer.__init__(self, data, categorical_features,
feature_names, labels=labels,
random_state=random_state)

def bins(self, data, labels):
bins = []
for feature in self.to_discretize:
qts = np.array(np.percentile(data[:, feature],
[10, 20, 30, 40, 50, 60, 70, 80, 90]))
bins.append(qts)
return bins


class EntropyDiscretizer(BaseDiscretizer):
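    """Discretizes continuous features at the split thresholds of a depth-3
    decision tree fit against the labels, giving at most 8 bins per feature."""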
def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
        if labels is None:
            raise ValueError('Labels must not be None when using '
                             'EntropyDiscretizer')
BaseDiscretizer.__init__(self, data, categorical_features,
feature_names, labels=labels,
random_state=random_state)

def bins(self, data, labels):
bins = []
for feature in self.to_discretize:
            # Entropy-based splits: max_depth=3 gives at most 8 leaves,
            # i.e. at most 8 bins per feature
dt = sklearn.tree.DecisionTreeClassifier(criterion='entropy',
max_depth=3,
random_state=self.random_state)
x = np.reshape(data[:, feature], (-1, 1))
dt.fit(x, labels)
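            # Split thresholds live at the tree's internal nodes; leaves are
            # marked by children_left == -1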
qts = dt.tree_.threshold[np.where(dt.tree_.children_left > -1)]
if qts.shape[0] == 0:
qts = np.array([np.median(data[:, feature])])
else:
qts = np.sort(qts)
bins.append(qts)
return bins
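

if __name__ == '__main__':
    # Minimal usage sketch (illustration only, not part of the library):
    # bin a random dataset into quartiles, then sample bin ids back into
    # representative continuous values.
    rng = np.random.RandomState(0)
    train = rng.normal(size=(100, 3))
    discretizer = QuartileDiscretizer(train, categorical_features=[],
                                      feature_names=['f0', 'f1', 'f2'],
                                      random_state=0)
    binned = discretizer.discretize(train)       # bin ids 0..3 per feature
    restored = discretizer.undiscretize(binned)  # truncnorm sample per bin
    print(discretizer.names[0])                  # readable bin descriptions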