import warnings
from numbers import Integral

import numpy as np

from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import resample
from ..utils._param_validation import Interval, Options, StrOptions
from ..utils.deprecation import _deprecate_Xt_in_inverse_transform
from ..utils.stats import _weighted_percentile
from ..utils.validation import (
    _check_feature_names_in,
    _check_sample_weight,
    check_array,
    check_is_fitted,
    validate_data,
)
from ._encoders import OneHotEncoder


class KBinsDiscretizer(TransformerMixin, BaseEstimator):
    """
    Bin continuous data into intervals.

    Read more in the :ref:`User Guide <preprocessing_discretization>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    n_bins : int or array-like of shape (n_features,), default=5
        The number of bins to produce. Raises ValueError if ``n_bins < 2``.

    encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
        Method used to encode the transformed result.

        - 'onehot': Encode the transformed result with one-hot encoding
          and return a sparse matrix. Ignored features are always
          stacked to the right.
        - 'onehot-dense': Encode the transformed result with one-hot encoding
          and return a dense array. Ignored features are always
          stacked to the right.
        - 'ordinal': Return the bin identifier encoded as an integer value.

    strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins.

        - 'uniform': All bins in each feature have identical widths.
        - 'quantile': All bins in each feature have the same number of points.
        - 'kmeans': Values in each bin have the same nearest center of a 1D
          k-means cluster.

        For an example of the different strategies see:
        :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`.

    dtype : {np.float32, np.float64}, default=None
        The desired data-type for the output. If None, the output dtype is
        consistent with the input dtype. Only np.float32 and np.float64 are
        supported.

        .. versionadded:: 0.24

    subsample : int or None, default=200_000
        Maximum number of samples used to fit the model, for computational
        efficiency. `subsample=None` means that all the training samples are
        used when computing the quantiles that determine the binning
        thresholds. Since quantile computation relies on sorting each column
        of `X`, which has `n log(n)` time complexity, it is recommended to
        use subsampling on datasets with a very large number of samples.

        .. versionchanged:: 1.3
            The default value of `subsample` changed from `None` to `200_000`
            when `strategy="quantile"`.

        .. versionchanged:: 1.5
            The default value of `subsample` changed from `None` to `200_000`
            when `strategy="uniform"` or `strategy="kmeans"`.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for subsampling.
        Pass an int for reproducible results across multiple function calls.
        See the `subsample` parameter for more details.
        See :term:`Glossary <random_state>`.

        .. versionadded:: 1.1

    Attributes
    ----------
    bin_edges_ : ndarray of ndarray of shape (n_features,)
        The edges of each bin. Contains arrays of varying shapes
        ``(n_bins_[i] + 1,)``. Ignored features will have empty arrays.

    n_bins_ : ndarray of shape (n_features,), dtype=np.int64
        Number of bins per feature. Bins whose width is too small
        (i.e., <= 1e-8) are removed with a warning.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Binarizer : Class used to bin values as ``0`` or
        ``1`` based on a parameter ``threshold``.

    Notes
    -----
    For a visualization of discretization on different datasets refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`.
    On the effect of discretization on linear models see:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`.

    In bin edges for feature ``i``, the first and last values are used only
    for ``inverse_transform``. During transform, bin edges are extended to::

        np.concatenate([[-np.inf], bin_edges_[i][1:-1], [np.inf]])

    You can combine ``KBinsDiscretizer`` with
    :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
    part of the features.

    ``KBinsDiscretizer`` might produce constant features (e.g., when
    ``encode = 'onehot'`` and certain bins do not contain any data).
    These features can be removed with feature selection algorithms
    (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).

    Examples
    --------
    >>> from sklearn.preprocessing import KBinsDiscretizer
    >>> X = [[-2, 1, -4, -1],
    ...      [-1, 2, -3, -0.5],
    ...      [ 0, 3, -2, 0.5],
    ...      [ 1, 4, -1, 2]]
    >>> est = KBinsDiscretizer(
    ...     n_bins=3, encode='ordinal', strategy='uniform'
    ... )
    >>> est.fit(X)
    KBinsDiscretizer(...)
    >>> Xt = est.transform(X)
    >>> Xt  # doctest: +SKIP
    array([[ 0., 0., 0., 0.],
           [ 1., 1., 1., 0.],
           [ 2., 2., 2., 1.],
           [ 2., 2., 2., 2.]])
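
    With the same bins, ``encode='onehot-dense'`` instead returns a dense
    one-hot indicator array with one column per bin (a small illustration;
    output shown with ``# doctest: +SKIP``):

    >>> est_dense = KBinsDiscretizer(
    ...     n_bins=3, encode='onehot-dense', strategy='uniform'
    ... ).fit(X)
    >>> est_dense.transform(X).shape  # doctest: +SKIP
    (4, 12)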

    Sometimes it may be useful to convert the data back into the original
    feature space. The ``inverse_transform`` method maps each binned value to
    the center of its bin, i.e. the mean of the bin's two edges.

    >>> est.bin_edges_[0]
    array([-2., -1.,  0.,  1.])
    >>> est.inverse_transform(Xt)
    array([[-1.5,  1.5, -3.5, -0.5],
           [-0.5,  2.5, -2.5, -0.5],
           [ 0.5,  3.5, -1.5,  0.5],
           [ 0.5,  3.5, -1.5,  1.5]])
    """

    _parameter_constraints: dict = {
        "n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"],
        "encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})],
        "strategy": [StrOptions({"uniform", "quantile", "kmeans"})],
        "dtype": [Options(type, {np.float64, np.float32}), None],
        "subsample": [Interval(Integral, 1, None, closed="left"), None],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        n_bins=5,
        *,
        encode="onehot",
        strategy="quantile",
        dtype=None,
        subsample=200_000,
        random_state=None,
    ):
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.dtype = dtype
        self.subsample = subsample
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """
        Fit the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        sample_weight : ndarray of shape (n_samples,), default=None
            Contains weight values to be associated with each sample.
            Cannot be used when `strategy` is set to `"uniform"`.

            .. versionadded:: 1.3

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = validate_data(self, X, dtype="numeric")

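        # Resolve the dtype of the transformed output: an explicit
        # float32/float64 from the constructor wins; otherwise fall back to
        # the dtype of the validated input.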
        if self.dtype in (np.float64, np.float32):
            output_dtype = self.dtype
        else:
            output_dtype = X.dtype

        n_samples, n_features = X.shape

        if sample_weight is not None and self.strategy == "uniform":
            raise ValueError(
                "`sample_weight` was provided but it cannot be used with "
                f"strategy={self.strategy!r}."
            )

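        # On very large datasets, fit on a random subsample (drawn without
        # replacement) to bound the cost of computing the bin edges.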
        if self.subsample is not None and n_samples > self.subsample:
            X = resample(
                X,
                replace=False,
                n_samples=self.subsample,
                random_state=self.random_state,
            )

        n_features = X.shape[1]
        n_bins = self._validate_n_bins(n_features)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

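        # Compute the bin edges feature by feature: constant features get a
        # single degenerate bin; the rest follow the chosen strategy.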
        bin_edges = np.zeros(n_features, dtype=object)
        for jj in range(n_features):
            column = X[:, jj]
            col_min, col_max = column.min(), column.max()

            if col_min == col_max:
                warnings.warn(
                    "Feature %d is constant and will be replaced with 0." % jj
                )
                n_bins[jj] = 1
                bin_edges[jj] = np.array([-np.inf, np.inf])
                continue

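            # 'uniform': equal-width bins spanning [col_min, col_max].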
            if self.strategy == "uniform":
                bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)

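            # 'quantile': equal-frequency bins, with edges at evenly spaced
            # (and optionally sample-weighted) percentiles of the column.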
            elif self.strategy == "quantile":
                quantiles = np.linspace(0, 100, n_bins[jj] + 1)
                if sample_weight is None:
                    bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
                else:
                    bin_edges[jj] = np.asarray(
                        [
                            _weighted_percentile(column, sample_weight, q)
                            for q in quantiles
                        ],
                        dtype=np.float64,
                    )

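            # 'kmeans': edges at the midpoints between adjacent centers of a
            # 1D k-means fit on the column values.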
            elif self.strategy == "kmeans":
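                # Imported here rather than at module level to avoid an
                # import loop with sklearn.cluster.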
                from ..cluster import KMeans

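                # Deterministic initialization: start the centers at the
                # midpoints of equal-width bins.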
                uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
                init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5

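                # Run 1D k-means on the column; the converged centers may be
                # unsorted, so sort them before taking midpoints as bin edges.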
                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
                centers = km.fit(
                    column[:, None], sample_weight=sample_weight
                ).cluster_centers_[:, 0]
                centers.sort()
                bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
                bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]

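            # Drop bins whose width is too small (i.e., <= 1e-8) and warn.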
            if self.strategy in ("quantile", "kmeans"):
                mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
                bin_edges[jj] = bin_edges[jj][mask]
                if len(bin_edges[jj]) - 1 != n_bins[jj]:
                    warnings.warn(
                        "Bins whose width is too small (i.e., <= "
                        "1e-8) in feature %d are removed. Consider "
                        "decreasing the number of bins." % jj
                    )
                    n_bins[jj] = len(bin_edges[jj]) - 1

        self.bin_edges_ = bin_edges
        self.n_bins_ = n_bins

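        # For the one-hot encodings, build the encoder up front with one
        # category per bin so that `transform` can delegate to it.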
        if "onehot" in self.encode:
            self._encoder = OneHotEncoder(
                categories=[np.arange(i) for i in self.n_bins_],
                sparse_output=self.encode == "onehot",
                dtype=output_dtype,
            )
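            # Fit the encoder on a dummy row so that it is ready for use: the
            # categories are fully specified above, so the dummy values
            # themselves are irrelevant.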
            self._encoder.fit(np.zeros((1, len(self.n_bins_))))

        return self

    def _validate_n_bins(self, n_features):
        """Returns n_bins_, the number of bins per feature."""
        orig_bins = self.n_bins
        if isinstance(orig_bins, Integral):
            return np.full(n_features, orig_bins, dtype=int)

        n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)

        if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
            raise ValueError("n_bins must be a scalar or array of shape (n_features,).")

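        # `n_bins != orig_bins` catches entries that were not integral to
        # begin with: the int cast above truncates them, so they no longer
        # compare equal to the original values.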
        bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)

        violating_indices = np.where(bad_nbins_value)[0]
        if violating_indices.shape[0] > 0:
            indices = ", ".join(str(i) for i in violating_indices)
            raise ValueError(
                "{} received an invalid number "
                "of bins at indices {}. Number of bins "
                "must be at least 2, and must be an int.".format(
                    KBinsDiscretizer.__name__, indices
                )
            )
        return n_bins

    def transform(self, X):
        """
        Discretize the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        Returns
        -------
        Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
            Data in the binned space. Will be a sparse matrix if
            `self.encode='onehot'` and ndarray otherwise.
        """
        check_is_fitted(self)

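        # When `self.dtype` is None, accept both supported float dtypes so
        # the output keeps the input's float precision.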
        dtype = (np.float64, np.float32) if self.dtype is None else self.dtype
        Xt = validate_data(self, X, copy=True, dtype=dtype, reset=False)

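        # Binary-search each value against the interior edges only; the outer
        # edges act as -inf/+inf, so out-of-range values fall into the first
        # or last bin.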
        bin_edges = self.bin_edges_
        for jj in range(Xt.shape[1]):
            Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right")

        if self.encode == "ordinal":
            return Xt

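        # Temporarily give the fitted encoder Xt's dtype so the one-hot
        # output matches it, and restore the original dtype afterwards even
        # if `transform` raises.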
        dtype_init = None
        if "onehot" in self.encode:
            dtype_init = self._encoder.dtype
            self._encoder.dtype = Xt.dtype
        try:
            Xt_enc = self._encoder.transform(Xt)
        finally:
            self._encoder.dtype = dtype_init
        return Xt_enc

    def inverse_transform(self, X=None, *, Xt=None):
        """
        Transform discretized data back to original feature space.

        Note that this function does not regenerate the original data
        due to discretization rounding.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Transformed data in the binned space.

        Xt : array-like of shape (n_samples, n_features)
            Transformed data in the binned space.

            .. deprecated:: 1.5
                `Xt` was deprecated in 1.5 and will be removed in 1.7. Use
                `X` instead.

        Returns
        -------
        Xinv : ndarray, dtype={np.float32, np.float64}
            Data in the original feature space.
        """
        X = _deprecate_Xt_in_inverse_transform(X, Xt)

        check_is_fitted(self)

        if "onehot" in self.encode:
            X = self._encoder.inverse_transform(X)

        Xinv = check_array(X, copy=True, dtype=(np.float64, np.float32))
        n_features = self.n_bins_.shape[0]
        if Xinv.shape[1] != n_features:
            raise ValueError(
                "Incorrect number of features. Expecting {}, received {}.".format(
                    n_features, Xinv.shape[1]
                )
            )

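        # Map each bin index back to the center of its bin, i.e. the midpoint
        # of its two surrounding edges.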
        for jj in range(n_features):
            bin_edges = self.bin_edges_[jj]
            bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5
            Xinv[:, jj] = bin_centers[(Xinv[:, jj]).astype(np.int64)]

        return Xinv

    def get_feature_names_out(self, input_features=None):
        """Get output feature names.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        input_features = _check_feature_names_in(self, input_features)
        if hasattr(self, "_encoder"):
            return self._encoder.get_feature_names_out(input_features)

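        # With ordinal encoding there is one output column per input feature,
        # so the input names pass through unchanged.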
        return input_features