|
|
|
|
|
|
|
import numbers |
|
import warnings |
|
from collections import Counter |
|
from functools import partial |
|
from typing import Callable |
|
|
|
import numpy as np |
|
import numpy.ma as ma |
|
from scipy import sparse as sp |
|
|
|
from ..base import BaseEstimator, TransformerMixin, _fit_context |
|
from ..utils._mask import _get_mask |
|
from ..utils._missing import is_pandas_na, is_scalar_nan |
|
from ..utils._param_validation import MissingValues, StrOptions |
|
from ..utils.fixes import _mode |
|
from ..utils.sparsefuncs import _get_median |
|
from ..utils.validation import ( |
|
FLOAT_DTYPES, |
|
_check_feature_names_in, |
|
_check_n_features, |
|
check_is_fitted, |
|
validate_data, |
|
) |
|
|
|
|
|
def _check_inputs_dtype(X, missing_values):
    """Reject a non-numeric `missing_values` marker for numeric `X`.

    Parameters
    ----------
    X : object exposing a ``dtype`` attribute
        The data whose dtype kind is inspected.

    missing_values : object
        The placeholder used to mark missing entries.

    Raises
    ------
    ValueError
        If `X` has a float, signed or unsigned integer dtype while
        `missing_values` is not a real number (and is not accepted by
        `is_pandas_na`).
    """
    # Whatever `is_pandas_na` accepts is allowed for every dtype.
    if is_pandas_na(missing_values):
        return

    x_is_numeric = X.dtype.kind in ("f", "i", "u")
    marker_is_numeric = isinstance(missing_values, numbers.Real)
    if not x_is_numeric or marker_is_numeric:
        return

    raise ValueError(
        "'X' and 'missing_values' types are expected to be"
        " both numerical. Got X.dtype={} and "
        " type(missing_values)={}.".format(X.dtype, type(missing_values))
    )
|
|
|
|
|
def _most_frequent(array, extra_value, n_repeat): |
|
"""Compute the most frequent value in a 1d array extended with |
|
[extra_value] * n_repeat, where extra_value is assumed to be not part |
|
of the array.""" |
|
|
|
if array.size > 0: |
|
if array.dtype == object: |
|
|
|
|
|
counter = Counter(array) |
|
most_frequent_count = counter.most_common(1)[0][1] |
|
|
|
most_frequent_value = min( |
|
value |
|
for value, count in counter.items() |
|
if count == most_frequent_count |
|
) |
|
else: |
|
mode = _mode(array) |
|
most_frequent_value = mode[0][0] |
|
most_frequent_count = mode[1][0] |
|
else: |
|
most_frequent_value = 0 |
|
most_frequent_count = 0 |
|
|
|
|
|
if most_frequent_count == 0 and n_repeat == 0: |
|
return np.nan |
|
elif most_frequent_count < n_repeat: |
|
return extra_value |
|
elif most_frequent_count > n_repeat: |
|
return most_frequent_value |
|
elif most_frequent_count == n_repeat: |
|
|
|
return min(most_frequent_value, extra_value) |
|
|
|
|
|
class _BaseImputer(TransformerMixin, BaseEstimator):
    """Base class for all imputers.

    It provides the shared helpers that implement `add_indicator`: fitting a
    :class:`MissingIndicator`, computing its mask at transform time and
    stacking that mask (and its feature names) onto the imputed output.
    """

    _parameter_constraints: dict = {
        "missing_values": [MissingValues()],
        "add_indicator": ["boolean"],
        "keep_empty_features": ["boolean"],
    }

    def __init__(
        self, *, missing_values=np.nan, add_indicator=False, keep_empty_features=False
    ):
        self.missing_values = missing_values
        self.add_indicator = add_indicator
        self.keep_empty_features = keep_empty_features

    def _fit_indicator(self, X):
        """Fit a MissingIndicator on the precomputed mask `X`.

        Sets `indicator_` to the fitted indicator, or to `None` when
        `add_indicator` is disabled.
        """
        if not self.add_indicator:
            self.indicator_ = None
            return

        self.indicator_ = MissingIndicator(
            missing_values=self.missing_values, error_on_new=False
        )
        # `X` is handed over as a mask, hence `precomputed=True`.
        self.indicator_._fit(X, precomputed=True)

    def _transform_indicator(self, X):
        """Compute the indicator mask.

        Note that X must be the original data as passed to the imputer before
        any imputation, since imputation may be done inplace in some cases.

        Returns `None` when `add_indicator` is disabled.
        """
        if not self.add_indicator:
            return None

        if not hasattr(self, "indicator_"):
            raise ValueError(
                "Make sure to call _fit_indicator before _transform_indicator"
            )
        return self.indicator_.transform(X)

    def _concatenate_indicator(self, X_imputed, X_indicator):
        """Concatenate indicator mask with the imputed data."""
        if not self.add_indicator:
            return X_imputed

        # Sparse output keeps the sparse format of the imputed data.
        stack = (
            partial(sp.hstack, format=X_imputed.format)
            if sp.issparse(X_imputed)
            else np.hstack
        )

        if X_indicator is None:
            raise ValueError(
                "Data from the missing indicator are not provided. Call "
                "_fit_indicator and _transform_indicator in the imputer "
                "implementation."
            )

        return stack((X_imputed, X_indicator))

    def _concatenate_indicator_feature_names_out(self, names, input_features):
        """Append the indicator's output feature names to `names`."""
        if self.add_indicator:
            extra_names = self.indicator_.get_feature_names_out(input_features)
            return np.concatenate([names, extra_names])
        return names

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        # NaN input is accepted exactly when NaN is the missing-value marker.
        tags.input_tags.allow_nan = is_scalar_nan(self.missing_values)
        return tags
|
|
|
|
|
class SimpleImputer(_BaseImputer):
    """Univariate imputer for completing missing values with simple strategies.

    Replace missing values using a descriptive statistic (e.g. mean, median, or
    most frequent) along each column, or using a constant value.

    Read more in the :ref:`User Guide <impute>`.

    .. versionadded:: 0.20
       `SimpleImputer` replaces the previous `sklearn.preprocessing.Imputer`
       estimator which is now removed.

    Parameters
    ----------
    missing_values : int, float, str, np.nan, None or pandas.NA, default=np.nan
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. For pandas' dataframes with
        nullable integer dtypes with missing values, `missing_values`
        can be set to either `np.nan` or `pd.NA`.

    strategy : str or Callable, default='mean'
        The imputation strategy.

        - If "mean", then replace missing values using the mean along
          each column. Can only be used with numeric data.
        - If "median", then replace missing values using the median along
          each column. Can only be used with numeric data.
        - If "most_frequent", then replace missing using the most frequent
          value along each column. Can be used with strings or numeric data.
          If there is more than one such value, only the smallest is returned.
        - If "constant", then replace missing values with fill_value. Can be
          used with strings or numeric data.
        - If an instance of Callable, then replace missing values using the
          scalar statistic returned by running the callable over a dense 1d
          array containing non-missing values of each column. For sparse
          input, the zeros of the column are part of that array.

        .. versionadded:: 0.20
           strategy="constant" for fixed value imputation.

        .. versionadded:: 1.5
           strategy=callable for custom value imputation.

    fill_value : str or numerical value, default=None
        When strategy == "constant", `fill_value` is used to replace all
        occurrences of missing_values. For string or object data types,
        `fill_value` must be a string.
        If `None`, `fill_value` will be 0 when imputing numerical
        data and "missing_value" for strings or object data types.

    copy : bool, default=True
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible. Note that, in the following cases,
        a new copy will always be made, even if `copy=False`:

        - If `X` is not an array of floating values;
        - If `X` is encoded as a CSR matrix;
        - If `add_indicator=True`.

    add_indicator : bool, default=False
        If True, a :class:`MissingIndicator` transform will stack onto output
        of the imputer's transform. This allows a predictive estimator
        to account for missingness despite imputation. If a feature has no
        missing values at fit/train time, the feature won't appear on
        the missing indicator even if there are missing values at
        transform/test time.

    keep_empty_features : bool, default=False
        If True, features that consist exclusively of missing values when
        `fit` is called are returned in results when `transform` is called.
        The imputed value is always `0` except when `strategy="constant"`
        in which case `fill_value` will be used instead.

        .. versionadded:: 1.2

        .. versionchanged:: 1.6
            Currently, when `keep_empty_features=False` and
            `strategy="constant"`, empty features are not dropped. This
            behaviour will change in version 1.8. Set
            `keep_empty_features=True` to preserve this behaviour.

    Attributes
    ----------
    statistics_ : array of shape (n_features,)
        The imputation fill value for each feature.
        Computing statistics can result in `np.nan` values.
        During :meth:`transform`, features corresponding to `np.nan`
        statistics will be discarded.

    indicator_ : :class:`~sklearn.impute.MissingIndicator`
        Indicator used to add binary indicators for missing values.
        `None` if `add_indicator=False`.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    IterativeImputer : Multivariate imputer that estimates values to impute for
        each feature with missing values from all the others.
    KNNImputer : Multivariate imputer that estimates missing features using
        nearest samples.

    Notes
    -----
    Columns which only contained missing values at :meth:`fit` are discarded
    upon :meth:`transform` if strategy is not `"constant"`.

    In a prediction context, simple imputation usually performs poorly when
    associated with a weak learner. However, with a powerful learner, it can
    lead to as good or better performance than complex imputation such as
    :class:`~sklearn.impute.IterativeImputer` or :class:`~sklearn.impute.KNNImputer`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.impute import SimpleImputer
    >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
    SimpleImputer()
    >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
    >>> print(imp_mean.transform(X))
    [[ 7.   2.   3. ]
     [ 4.   3.5  6. ]
     [10.   3.5  9. ]]

    For a more detailed example see
    :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`.
    """

    _parameter_constraints: dict = {
        **_BaseImputer._parameter_constraints,
        "strategy": [
            StrOptions({"mean", "median", "most_frequent", "constant"}),
            callable,
        ],
        "fill_value": "no_validation",
        "copy": ["boolean"],
    }

    def __init__(
        self,
        *,
        missing_values=np.nan,
        strategy="mean",
        fill_value=None,
        copy=True,
        add_indicator=False,
        keep_empty_features=False,
    ):
        super().__init__(
            missing_values=missing_values,
            add_indicator=add_indicator,
            keep_empty_features=keep_empty_features,
        )
        self.strategy = strategy
        self.fill_value = fill_value
        self.copy = copy

    def _validate_input(self, X, in_fit):
        """Validate `X` and check it is compatible with the imputer settings.

        Parameters
        ----------
        X : {array-like, sparse matrix}
            Data handed to :meth:`fit` or :meth:`transform`.

        in_fit : bool
            Whether the call originates from :meth:`fit`. During fit the
            dtype of the validated data is recorded in `_fit_dtype`.

        Returns
        -------
        X : {ndarray, sparse matrix}
            The validated data.

        Raises
        ------
        ValueError
            If the data cannot be converted for the chosen strategy, has an
            unsupported dtype, is sparse while `missing_values == 0`, or if
            the constant fill value cannot be cast to the dtype of `X`.
        """
        if self.strategy in ("most_frequent", "constant"):
            # A plain list holding at least one str is validated with object
            # dtype; anything else keeps the dtype picked by validation.
            if isinstance(X, list) and any(
                isinstance(elem, str) for row in X for elem in row
            ):
                dtype = object
            else:
                dtype = None
        else:
            dtype = FLOAT_DTYPES

        if not in_fit and self._fit_dtype.kind == "O":
            # Stay on the object dtype recorded during fit.
            dtype = self._fit_dtype

        if is_pandas_na(self.missing_values) or is_scalar_nan(self.missing_values):
            ensure_all_finite = "allow-nan"
        else:
            ensure_all_finite = True

        try:
            X = validate_data(
                self,
                X,
                reset=in_fit,
                accept_sparse="csc",
                dtype=dtype,
                force_writeable=True if not in_fit else None,
                ensure_all_finite=ensure_all_finite,
                copy=self.copy,
            )
        except ValueError as ve:
            if "could not convert" in str(ve):
                # Replace the conversion error by one naming the strategy.
                new_ve = ValueError(
                    "Cannot use {} strategy with non-numeric data:\n{}".format(
                        self.strategy, ve
                    )
                )
                raise new_ve from None
            raise

        if in_fit:
            # Remembered so that transform can reuse an object dtype.
            self._fit_dtype = X.dtype

        _check_inputs_dtype(X, self.missing_values)
        if X.dtype.kind not in ("i", "u", "f", "O"):
            raise ValueError(
                "SimpleImputer does not support data with dtype "
                "{0}. Please provide either a numeric array (with"
                " a floating point or integer dtype) or "
                "categorical data represented either as an array "
                "with integer dtype or an array of string values "
                "with an object dtype.".format(X.dtype)
            )

        if sp.issparse(X) and self.missing_values == 0:
            # Zero cannot serve as the missing marker for sparse input.
            raise ValueError(
                "Imputation not possible when missing_values "
                "== 0 and input is sparse. Provide a dense "
                "array instead."
            )

        if self.strategy == "constant":
            if in_fit and self.fill_value is not None:
                fill_value_dtype = type(self.fill_value)
                err_msg = (
                    f"fill_value={self.fill_value!r} (of type {fill_value_dtype!r}) "
                    f"cannot be cast to the input data that is {X.dtype!r}. Make sure "
                    "that both dtypes are of the same kind."
                )
            elif not in_fit:
                fill_value_dtype = self.statistics_.dtype
                err_msg = (
                    f"The dtype of the filling value (i.e. {fill_value_dtype!r}) "
                    f"cannot be cast to the input data that is {X.dtype!r}. Make sure "
                    "that the dtypes of the input data is of the same kind between "
                    "fit and transform."
                )
            else:
                # fill_value is None during fit: the check below compares
                # X.dtype with itself, so no error message is needed.
                fill_value_dtype = X.dtype

            if not np.can_cast(fill_value_dtype, X.dtype, casting="same_kind"):
                raise ValueError(err_msg)

        return X

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the imputer on `X`.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        X = self._validate_input(X, in_fit=True)

        # Default fill value: 0 for numeric data, "missing_value" otherwise.
        if self.fill_value is None:
            if X.dtype.kind in ("i", "u", "f"):
                fill_value = 0
            else:
                fill_value = "missing_value"
        else:
            fill_value = self.fill_value

        if sp.issparse(X):
            self.statistics_ = self._sparse_fit(
                X, self.strategy, self.missing_values, fill_value
            )
        else:
            self.statistics_ = self._dense_fit(
                X, self.strategy, self.missing_values, fill_value
            )

        return self

    def _sparse_fit(self, X, strategy, missing_values, fill_value):
        """Fit the transformer on sparse data.

        Returns the per-column statistics and fits the missing indicator on
        the computed mask.
        """
        missing_mask = _get_mask(X, missing_values)
        mask_data = missing_mask.data
        # Entries of each column that are not stored at all.
        n_implicit_zeros = X.shape[0] - np.diff(X.indptr)

        statistics = np.empty(X.shape[1])

        if strategy == "constant":
            # NOTE(review): `all(...)` is also true for a column without any
            # stored entry, so the warning can fire for a column made of
            # implicit zeros only -- confirm this is intended.
            if not self.keep_empty_features and any(
                all(missing_mask[:, i].data) for i in range(missing_mask.shape[1])
            ):
                warnings.warn(
                    "Currently, when `keep_empty_feature=False` and "
                    '`strategy="constant"`, empty features are not dropped. '
                    "This behaviour will change in version 1.8. Set "
                    "`keep_empty_feature=True` to preserve this behaviour.",
                    FutureWarning,
                )

            # With the constant strategy every column stores `fill_value`.
            statistics.fill(fill_value)
        else:
            for i in range(X.shape[1]):
                start, stop = X.indptr[i], X.indptr[i + 1]
                stored = X.data[start:stop]
                mask_column = mask_data[start:stop]
                column = stored[~mask_column]

                # Explicitly stored zeros are counted with the implicit ones.
                mask_zeros = _get_mask(column, 0)
                column = column[~mask_zeros]
                n_explicit_zeros = mask_zeros.sum()
                n_zeros = n_implicit_zeros[i] + n_explicit_zeros

                if len(column) == 0 and self.keep_empty_features:
                    # No non-zero observed value and the column must be kept.
                    statistics[i] = 0
                else:
                    if strategy == "mean":
                        s = column.size + n_zeros
                        statistics[i] = np.nan if s == 0 else column.sum() / s

                    elif strategy == "median":
                        statistics[i] = _get_median(column, n_zeros)

                    elif strategy == "most_frequent":
                        statistics[i] = _most_frequent(column, 0, n_zeros)

                    elif isinstance(strategy, Callable):
                        # The callable must see every observed value of the
                        # column, zeros included, as it does for dense input.
                        # Rebuild the dense column in row order and drop the
                        # missing rows.
                        rows = X.indices[start:stop]
                        dense_column = np.zeros(X.shape[0], dtype=X.dtype)
                        dense_column[rows] = stored
                        observed = np.ones(X.shape[0], dtype=bool)
                        observed[rows[mask_column]] = False
                        statistics[i] = strategy(dense_column[observed])

        super()._fit_indicator(missing_mask)

        return statistics

    def _dense_fit(self, X, strategy, missing_values, fill_value):
        """Fit the transformer on dense data.

        Returns the per-column statistics and fits the missing indicator on
        the computed mask.
        """
        missing_mask = _get_mask(X, missing_values)
        masked_X = ma.masked_array(X, mask=missing_mask)

        super()._fit_indicator(missing_mask)

        # Mean
        if strategy == "mean":
            mean_masked = np.ma.mean(masked_X, axis=0)
            mean = np.ma.getdata(mean_masked)
            # Columns whose mean is masked get 0 when kept, NaN otherwise.
            mean[np.ma.getmaskarray(mean_masked)] = (
                0 if self.keep_empty_features else np.nan
            )

            return mean

        # Median
        elif strategy == "median":
            median_masked = np.ma.median(masked_X, axis=0)
            median = np.ma.getdata(median_masked)
            # Same convention as for the mean.
            median[np.ma.getmaskarray(median_masked)] = (
                0 if self.keep_empty_features else np.nan
            )

            return median

        # Most frequent
        elif strategy == "most_frequent":
            # Work feature by feature: after transposing, each row is one
            # feature of the input.
            X = X.transpose()
            mask = missing_mask.transpose()

            if X.dtype.kind == "O":
                most_frequent = np.empty(X.shape[0], dtype=object)
            else:
                most_frequent = np.empty(X.shape[0])

            for i, (row, row_mask) in enumerate(zip(X[:], mask[:])):
                row_mask = np.logical_not(row_mask).astype(bool)
                row = row[row_mask]
                if len(row) == 0 and self.keep_empty_features:
                    most_frequent[i] = 0
                else:
                    most_frequent[i] = _most_frequent(row, np.nan, 0)

            return most_frequent

        # Constant
        elif strategy == "constant":
            if not self.keep_empty_features and ma.getmask(masked_X).all(axis=0).any():
                warnings.warn(
                    "Currently, when `keep_empty_feature=False` and "
                    '`strategy="constant"`, empty features are not dropped. '
                    "This behaviour will change in version 1.8. Set "
                    "`keep_empty_feature=True` to preserve this behaviour.",
                    FutureWarning,
                )

            # With the constant strategy every column stores `fill_value`.
            return np.full(X.shape[1], fill_value, dtype=X.dtype)

        # Custom callable
        elif isinstance(strategy, Callable):
            statistics = np.empty(masked_X.shape[1])
            for i in range(masked_X.shape[1]):
                # `compressed()` yields the non-masked values of the column.
                statistics[i] = strategy(masked_X[:, i].compressed())
            return statistics

    def transform(self, X):
        """Impute all missing values in `X`.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        X_imputed : {ndarray, sparse matrix} of shape \
                (n_samples, n_features_out)
            `X` with imputed values.
        """
        check_is_fitted(self)

        X = self._validate_input(X, in_fit=False)
        statistics = self.statistics_

        if X.shape[1] != statistics.shape[0]:
            raise ValueError(
                "X has %d features per sample, expected %d"
                % (X.shape[1], self.statistics_.shape[0])
            )

        # Computed on the full input, before any column is dropped; also
        # used for the missing indicator below.
        missing_mask = _get_mask(X, self.missing_values)

        # Decide which features are kept.
        if self.strategy == "constant" or self.keep_empty_features:
            valid_statistics = statistics
            valid_statistics_indexes = None
        else:
            # Features whose statistic is NaN are dropped from the output.
            invalid_mask = _get_mask(statistics, np.nan)
            valid_mask = np.logical_not(invalid_mask)
            valid_statistics = statistics[valid_mask]
            valid_statistics_indexes = np.flatnonzero(valid_mask)

            if invalid_mask.any():
                invalid_features = np.arange(X.shape[1])[invalid_mask]
                # Report feature names rather than indices when available.
                if hasattr(self, "feature_names_in_"):
                    invalid_features = self.feature_names_in_[invalid_features]
                warnings.warn(
                    "Skipping features without any observed values:"
                    f" {invalid_features}. At least one non-missing value is needed"
                    f" for imputation with strategy='{self.strategy}'."
                )
                X = X[:, valid_statistics_indexes]

        # Actual imputation.
        if sp.issparse(X):
            if self.missing_values == 0:
                raise ValueError(
                    "Imputation not possible when missing_values "
                    "== 0 and input is sparse. Provide a dense "
                    "array instead."
                )
            else:
                # Reuse the mask computed above unless columns may have been
                # dropped, in which case it is recomputed on the current X.
                if valid_statistics_indexes is None:
                    mask = missing_mask.data
                else:
                    mask = _get_mask(X.data, self.missing_values)
                # Column index of every stored entry that is missing.
                indexes = np.repeat(
                    np.arange(len(X.indptr) - 1, dtype=int), np.diff(X.indptr)
                )[mask]

                X.data[mask] = valid_statistics[indexes].astype(X.dtype, copy=False)
        else:
            # Restrict the mask computed above to the kept features.
            if valid_statistics_indexes is None:
                mask_valid_features = missing_mask
            else:
                mask_valid_features = missing_mask[:, valid_statistics_indexes]
            n_missing = np.sum(mask_valid_features, axis=0)
            values = np.repeat(valid_statistics, n_missing)
            # Walking the transposed mask groups the coordinates by feature,
            # which matches the order of `values`.
            coordinates = np.where(mask_valid_features.transpose())[::-1]

            X[coordinates] = values

        X_indicator = super()._transform_indicator(missing_mask)

        return super()._concatenate_indicator(X, X_indicator)

    def inverse_transform(self, X):
        """Convert the data back to the original representation.

        Inverts the `transform` operation performed on an array.
        This operation can only be performed after :class:`SimpleImputer` is
        instantiated with `add_indicator=True`.

        Note that `inverse_transform` can only invert the transform in
        features that have binary indicators for missing values. If a feature
        has no missing values at `fit` time, the feature won't have a binary
        indicator, and the imputation done at `transform` time won't be
        inverted.

        .. versionadded:: 0.24

        Parameters
        ----------
        X : array-like of shape \
                (n_samples, n_features + n_features_missing_indicator)
            The imputed data to be reverted to original data. It has to be
            an augmented array of imputed data and the missing indicator mask.

        Returns
        -------
        X_original : ndarray of shape (n_samples, n_features)
            The original `X` with missing values as it was prior
            to imputation.
        """
        check_is_fitted(self)

        if not self.add_indicator:
            raise ValueError(
                "'inverse_transform' works only when "
                "'SimpleImputer' is instantiated with "
                "'add_indicator=True'. "
                f"Got 'add_indicator={self.add_indicator}' "
                "instead."
            )

        # The trailing columns of X are the indicator columns.
        n_features_missing = len(self.indicator_.features_)
        non_empty_feature_count = X.shape[1] - n_features_missing
        array_imputed = X[:, :non_empty_feature_count].copy()
        missing_mask = X[:, non_empty_feature_count:].astype(bool)

        n_features_original = len(self.statistics_)
        shape_original = (X.shape[0], n_features_original)
        X_original = np.zeros(shape_original)
        X_original[:, self.indicator_.features_] = missing_mask
        full_mask = X_original.astype(bool)

        # Copy the imputed columns back, skipping original columns that are
        # flagged as missing in every row.
        imputed_idx, original_idx = 0, 0
        while imputed_idx < len(array_imputed.T):
            if not np.all(X_original[:, original_idx]):
                X_original[:, original_idx] = array_imputed.T[imputed_idx]
                imputed_idx += 1
                original_idx += 1
            else:
                original_idx += 1

        X_original[full_mask] = self.missing_values
        return X_original

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.sparse = True
        tags.input_tags.allow_nan = is_pandas_na(self.missing_values) or is_scalar_nan(
            self.missing_values
        )
        return tags

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        input_features = _check_feature_names_in(self, input_features)
        # Features whose statistic is NaN do not appear in the output names.
        non_missing_mask = np.logical_not(_get_mask(self.statistics_, np.nan))
        names = input_features[non_missing_mask]
        return self._concatenate_indicator_feature_names_out(names, input_features)
|
|
|
|
|
class MissingIndicator(TransformerMixin, BaseEstimator):
    """Binary indicators for missing values.

    Note that this component typically should not be used in a vanilla
    :class:`~sklearn.pipeline.Pipeline` consisting of transformers and a
    classifier, but rather could be added using a
    :class:`~sklearn.pipeline.FeatureUnion` or
    :class:`~sklearn.compose.ColumnTransformer`.

    Read more in the :ref:`User Guide <impute>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    missing_values : int, float, str, np.nan or None, default=np.nan
        The placeholder for the missing values. All occurrences of
        `missing_values` will be flagged in the indicator mask. For pandas'
        dataframes with nullable integer dtypes with missing values,
        `missing_values` should be set to `np.nan`, since `pd.NA` will be
        converted to `np.nan`.

    features : {'missing-only', 'all'}, default='missing-only'
        Whether the imputer mask should represent all or a subset of
        features.

        - If `'missing-only'` (default), the imputer mask will only represent
          features containing missing values during fit time.
        - If `'all'`, the imputer mask will represent all features.

    sparse : bool or 'auto', default='auto'
        Whether the imputer mask format should be sparse or dense.

        - If `'auto'` (default), the imputer mask will be of same type as
          input.
        - If `True`, the imputer mask will be a sparse matrix.
        - If `False`, the imputer mask will be a numpy array.

    error_on_new : bool, default=True
        If `True`, :meth:`transform` will raise an error when there are
        features with missing values that have no missing values in
        :meth:`fit`. This is applicable only when `features='missing-only'`.

    Attributes
    ----------
    features_ : ndarray of shape (n_missing_features,) or (n_features,)
        The features indices which will be returned when calling
        :meth:`transform`. They are computed during :meth:`fit`. If
        `features='all'`, `features_` is equal to `range(n_features)`.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    SimpleImputer : Univariate imputation of missing values.
    IterativeImputer : Multivariate imputation of missing values.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.impute import MissingIndicator
    >>> X1 = np.array([[np.nan, 1, 3],
    ...                [4, 0, np.nan],
    ...                [8, 1, 0]])
    >>> X2 = np.array([[5, 1, np.nan],
    ...                [np.nan, 2, 3],
    ...                [2, 4, 0]])
    >>> indicator = MissingIndicator()
    >>> indicator.fit(X1)
    MissingIndicator()
    >>> X2_tr = indicator.transform(X2)
    >>> X2_tr
    array([[False,  True],
           [ True, False],
           [False, False]])
    """

    _parameter_constraints: dict = {
        "missing_values": [MissingValues()],
        "features": [StrOptions({"missing-only", "all"})],
        "sparse": ["boolean", StrOptions({"auto"})],
        "error_on_new": ["boolean"],
    }

    def __init__(
        self,
        *,
        missing_values=np.nan,
        features="missing-only",
        sparse="auto",
        error_on_new=True,
    ):
        self.missing_values = missing_values
        self.features = features
        self.sparse = sparse
        self.error_on_new = error_on_new

    def _get_missing_features_info(self, X):
        """Compute the imputer mask and the indices of the features
        containing missing values.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            The input data with missing values. Note that `X` has been
            checked in :meth:`fit` and :meth:`transform` before to call this
            function.

        Returns
        -------
        imputer_mask : {ndarray, sparse matrix} of shape \
        (n_samples, n_features)
            The imputer mask of the original data.

        features_with_missing : ndarray of shape (n_features_with_missing)
            The features containing missing values.
        """
        # The mask is computed once here and shared by the sparse and the
        # dense branch below; in precomputed mode `X` already is the mask.
        if not self._precomputed:
            imputer_mask = _get_mask(X, self.missing_values)
        else:
            imputer_mask = X

        if sp.issparse(X):
            # Drop stored False entries so that only missing positions
            # remain stored (this is in-place, also for a precomputed mask).
            imputer_mask.eliminate_zeros()

            if self.features == "missing-only":
                # Number of stored (i.e. missing) entries per column.
                n_missing = imputer_mask.getnnz(axis=0)

            if self.sparse is False:
                imputer_mask = imputer_mask.toarray()
            elif imputer_mask.format == "csr":
                imputer_mask = imputer_mask.tocsc()
        else:
            if self.features == "missing-only":
                n_missing = imputer_mask.sum(axis=0)

            if self.sparse is True:
                imputer_mask = sp.csc_matrix(imputer_mask)

        if self.features == "all":
            features_indices = np.arange(X.shape[1])
        else:
            features_indices = np.flatnonzero(n_missing)

        return imputer_mask, features_indices

    def _validate_input(self, X, in_fit):
        """Validate `X` and check it is compatible with `missing_values`.

        Parameters
        ----------
        X : {array-like, sparse matrix}
            Data handed to :meth:`fit` or :meth:`transform`.

        in_fit : bool
            Whether the call originates from fitting.

        Returns
        -------
        X : {ndarray, sparse matrix}
            The validated data.

        Raises
        ------
        ValueError
            If the dtype of `X` is unsupported or if `X` is sparse while
            `missing_values == 0`.
        """
        if not is_scalar_nan(self.missing_values):
            ensure_all_finite = True
        else:
            ensure_all_finite = "allow-nan"
        X = validate_data(
            self,
            X,
            reset=in_fit,
            accept_sparse=("csc", "csr"),
            dtype=None,
            ensure_all_finite=ensure_all_finite,
        )
        _check_inputs_dtype(X, self.missing_values)
        if X.dtype.kind not in ("i", "u", "f", "O"):
            raise ValueError(
                "MissingIndicator does not support data with "
                "dtype {0}. Please provide either a numeric array"
                " (with a floating point or integer dtype) or "
                "categorical data represented either as an array "
                "with integer dtype or an array of string values "
                "with an object dtype.".format(X.dtype)
            )

        if sp.issparse(X) and self.missing_values == 0:
            # Zero cannot serve as the missing marker for sparse input.
            raise ValueError(
                "Sparse input with missing_values=0 is "
                "not supported. Provide a dense "
                "array instead."
            )

        return X

    def _fit(self, X, y=None, precomputed=False):
        """Fit the transformer on `X`.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data, where `n_samples` is the number of samples and
            `n_features` is the number of features.
            If `precomputed=True`, then `X` is a mask of the input data.

        precomputed : bool
            Whether the input data is a mask.

        Returns
        -------
        imputer_mask : {ndarray, sparse matrix} of shape (n_samples, \
        n_features)
            The imputer mask of the original data.
        """
        if precomputed:
            if not (hasattr(X, "dtype") and X.dtype.kind == "b"):
                raise ValueError("precomputed is True but the input data is not a mask")
            self._precomputed = True
        else:
            self._precomputed = False

        # A precomputed mask skips input validation; only the number of
        # features is recorded in that case.
        if not self._precomputed:
            X = self._validate_input(X, in_fit=True)
        else:
            _check_n_features(self, X, reset=True)

        self._n_features = X.shape[1]

        missing_features_info = self._get_missing_features_info(X)
        self.features_ = missing_features_info[1]

        return missing_features_info[0]

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the transformer on `X`.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        self._fit(X, y)

        return self

    def transform(self, X):
        """Generate missing values indicator for `X`.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) \
        or (n_samples, n_features_with_missing)
            The missing indicator for input data. The data type of `Xt`
            will be boolean.
        """
        check_is_fitted(self)

        # A precomputed mask skips input validation but must be boolean.
        if not self._precomputed:
            X = self._validate_input(X, in_fit=False)
        else:
            if not (hasattr(X, "dtype") and X.dtype.kind == "b"):
                raise ValueError("precomputed is True but the input data is not a mask")

        imputer_mask, features = self._get_missing_features_info(X)

        if self.features == "missing-only":
            # Features with missing values now that had none during fit.
            features_diff_fit_trans = np.setdiff1d(features, self.features_)
            if self.error_on_new and features_diff_fit_trans.size > 0:
                raise ValueError(
                    "The features {} have missing values "
                    "in transform but have no missing values "
                    "in fit.".format(features_diff_fit_trans)
                )

            if self.features_.size < self._n_features:
                imputer_mask = imputer_mask[:, self.features_]

        return imputer_mask

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, X, y=None):
        """Generate missing values indicator for `X`.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input data to complete.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) \
        or (n_samples, n_features_with_missing)
            The missing indicator for input data. The data type of `Xt`
            will be boolean.
        """
        imputer_mask = self._fit(X, y)

        if self.features_.size < self._n_features:
            imputer_mask = imputer_mask[:, self.features_]

        return imputer_mask

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        input_features = _check_feature_names_in(self, input_features)
        # Output names are the selected input names prefixed with the
        # lower-cased class name.
        prefix = self.__class__.__name__.lower()
        return np.asarray(
            [
                f"{prefix}_{feature_name}"
                for feature_name in input_features[self.features_]
            ],
            dtype=object,
        )

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.allow_nan = True
        tags.input_tags.string = True
        tags.input_tags.sparse = True
        tags.transformer_tags.preserves_dtype = []
        return tags
|
|