|
"""Functions to validate input and parameters within scikit-learn estimators.""" |
|
|
|
|
|
|
|
|
|
import numbers |
|
import operator |
|
import sys |
|
import warnings |
|
from collections.abc import Sequence |
|
from contextlib import suppress |
|
from functools import reduce, wraps |
|
from inspect import Parameter, isclass, signature |
|
|
|
import joblib |
|
import numpy as np |
|
import scipy.sparse as sp |
|
|
|
from .. import get_config as _get_config |
|
from ..exceptions import DataConversionWarning, NotFittedError, PositiveSpectrumWarning |
|
from ..utils._array_api import _asarray_with_order, _is_numpy_namespace, get_namespace |
|
from ..utils.deprecation import _deprecate_force_all_finite |
|
from ..utils.fixes import ComplexWarning, _preserve_dia_indices_dtype |
|
from ._isfinite import FiniteStatus, cy_isfinite |
|
from ._tags import get_tags |
|
from .fixes import _object_dtype_isnan |
|
|
|
# Floating-point dtypes, listed from the widest (float64) to the narrowest
# (float16).
FLOAT_DTYPES = (np.float64, np.float32, np.float16)
|
|
|
|
|
|
|
|
|
|
|
def _deprecate_positional_args(func=None, *, version="1.3"):
    """Decorator that warns when keyword-only arguments are passed positionally.

    The decorated callable declares its keyword-only parameters with the
    PEP 3102 ``*`` syntax. When a call supplies more positional arguments than
    the callable has positional-or-keyword parameters, a ``FutureWarning`` is
    issued and the surplus values are bound, in declaration order, to the
    parameters of the signature before the callable is invoked with keywords
    only.

    Parameters
    ----------
    func : callable, default=None
        Function to check arguments on. When None, the decorator itself is
        returned so that it can be applied with a custom ``version``.

    version : str, default="1.3"
        The version when positional arguments will result in error. Only used
        to build the warning message.

    Returns
    -------
    callable
        The wrapped function if ``func`` is given, otherwise the decorator.
    """

    def _decorate(target):
        target_signature = signature(target)
        parameters = target_signature.parameters

        positional_names = [
            name
            for name, param in parameters.items()
            if param.kind == Parameter.POSITIONAL_OR_KEYWORD
        ]
        keyword_only_names = [
            name
            for name, param in parameters.items()
            if param.kind == Parameter.KEYWORD_ONLY
        ]
        n_positional = len(positional_names)

        @wraps(target)
        def wrapper(*args, **kwargs):
            n_extra = len(args) - n_positional
            if n_extra <= 0:
                # Nothing was passed beyond the regular positional parameters.
                return target(*args, **kwargs)

            # Pair each surplus positional value with the keyword-only
            # parameter it ends up bound to, for the warning message.
            surplus_pairs = zip(keyword_only_names[:n_extra], args[-n_extra:])
            described = ", ".join(
                "{}={}".format(name, value) for name, value in surplus_pairs
            )
            warnings.warn(
                (
                    f"Pass {described} as keyword args. From version "
                    f"{version} passing these as positional arguments "
                    "will result in an error"
                ),
                FutureWarning,
            )
            # Re-bind every positional value by name, following the order of
            # the parameters in the signature.
            kwargs.update(zip(parameters, args))
            return target(**kwargs)

        return wrapper

    if func is None:
        return _decorate
    return _decorate(func)
|
|
|
|
|
def _assert_all_finite(
    X, allow_nan=False, msg_dtype=None, estimator_name=None, input_name=""
):
    """Like assert_all_finite, but only for ndarray.

    The check is skipped entirely when the ``assume_finite`` configuration
    flag is set. Object arrays (outside of array API namespaces) are only
    inspected for NaN, and only when ``allow_nan`` is False. Arrays whose
    dtype is neither real nor complex floating are accepted as is. For
    floating-point data the sum of ``X`` is tested first; the element-wise
    check only runs when that sum is not finite.

    Parameters
    ----------
    X : array
        The data to check.

    allow_nan : bool, default=False
        If True, NaN values do not trigger an error.

    msg_dtype : dtype, default=None
        Forwarded to the element-wise check, where it is used in the error
        message.

    estimator_name : str, default=None
        Forwarded to the element-wise check to build the error message.

    input_name : str, default=""
        Forwarded to the element-wise check to build the error message.

    Raises
    ------
    ValueError
        If an object array contains NaN while ``allow_nan`` is False, or if
        the element-wise check rejects the data.
    """
    xp, is_array_api = get_namespace(X)

    if _get_config()["assume_finite"]:
        return

    X = xp.asarray(X)

    # Object dtype: only NaN can be looked for.
    nan_forbidden_in_objects = (
        not is_array_api and X.dtype == np.dtype("object") and not allow_nan
    )
    if nan_forbidden_in_objects and _object_dtype_isnan(X).any():
        raise ValueError("Input contains NaN")

    # Only floating-point data can hold NaN or infinity.
    if not xp.isdtype(X.dtype, ("real floating", "complex floating")):
        return

    # Cheap first pass: a finite sum means there is nothing to report. The
    # overflow warning is silenced because a non-finite sum is handled by the
    # element-wise pass below.
    with np.errstate(over="ignore"):
        sum_is_finite = xp.isfinite(xp.sum(X))

    if not sum_is_finite:
        _assert_all_finite_element_wise(
            X,
            xp=xp,
            allow_nan=allow_nan,
            msg_dtype=msg_dtype,
            estimator_name=estimator_name,
            input_name=input_name,
        )
|
|
|
|
|
def _assert_all_finite_element_wise(
    X, *, xp, allow_nan, msg_dtype=None, estimator_name=None, input_name=""
):
    """Raise a ValueError if ``X`` holds infinity or a disallowed NaN.

    The Cython routine ``cy_isfinite`` is used when ``xp`` is NumPy, the
    buffer of ``X`` is contiguous and its dtype is float32 or float64;
    otherwise ``xp.isinf`` / ``xp.isnan`` are used.

    Parameters
    ----------
    X : array
        The data to check.

    xp : module
        Array namespace used for the generic (non-Cython) checks.

    allow_nan : bool
        If True, NaN values never trigger an error.

    msg_dtype : dtype, default=None
        dtype shown in the "infinity" error message. Falls back to
        ``X.dtype`` when None.

    estimator_name : str, default=None
        When given together with ``input_name == "X"`` and a NaN error, a
        hint about handling missing values is appended to the message.

    input_name : str, default=""
        Name of the data, inserted into the error message.

    Raises
    ------
    ValueError
        If ``X`` contains infinity, or NaN while ``allow_nan`` is False.
    """
    cython_dtypes = {np.float32, np.float64}
    use_cython = xp is np and X.data.contiguous and X.dtype.type in cython_dtypes

    if use_cython:
        status = cy_isfinite(X.reshape(-1), allow_nan=allow_nan)
        has_inf = status == FiniteStatus.has_infinite
        has_nan_error = False if allow_nan else status == FiniteStatus.has_nan
    else:
        has_inf = xp.any(xp.isinf(X))
        has_nan_error = False if allow_nan else xp.any(xp.isnan(X))

    if not (has_inf or has_nan_error):
        return

    # A NaN error takes precedence over an infinity error in the message.
    if has_nan_error:
        type_err = "NaN"
    else:
        if msg_dtype is None:
            msg_dtype = X.dtype
        type_err = f"infinity or a value too large for {msg_dtype!r}"

    padded_input_name = input_name + " " if input_name else ""
    msg_err = f"Input {padded_input_name}contains {type_err}."

    if estimator_name and input_name == "X" and has_nan_error:
        # Point users towards estimators and tools that deal with missing
        # values.
        msg_err += (
            f"\n{estimator_name} does not accept missing values"
            " encoded as NaN natively. For supervised learning, you might want"
            " to consider sklearn.ensemble.HistGradientBoostingClassifier and"
            " Regressor which accept missing values encoded as NaNs natively."
            " Alternatively, it is possible to preprocess the data, for"
            " instance by using an imputer transformer in a pipeline or drop"
            " samples with missing values. See"
            " https://scikit-learn.org/stable/modules/impute.html"
            " You can find a list of all estimators that handle NaN values"
            " at the following page:"
            " https://scikit-learn.org/stable/modules/impute.html"
            "#estimators-that-handle-nan-values"
        )

    raise ValueError(msg_err)
|
|
|
|
|
def assert_all_finite(
    X,
    *,
    allow_nan=False,
    estimator_name=None,
    input_name="",
):
    """Throw a ValueError if X contains NaN or infinity.

    For sparse input only the explicitly stored values (``X.data``) are
    checked.

    Parameters
    ----------
    X : {ndarray, sparse matrix}
        The input data.

    allow_nan : bool, default=False
        If True, do not throw error when `X` contains NaN.

    estimator_name : str, default=None
        The estimator name, used to construct the error message.

    input_name : str, default=""
        The data name used to construct the error message. In particular
        if `input_name` is "X" and the data has NaN values and
        allow_nan is False, the error message will link to the imputer
        documentation.

    Examples
    --------
    >>> from sklearn.utils import assert_all_finite
    >>> import numpy as np
    >>> array = np.array([1, np.inf, np.nan, 4])
    >>> try:
    ...     assert_all_finite(array)
    ...     print("Test passed: Array contains only finite values.")
    ... except ValueError:
    ...     print("Test failed: Array contains non-finite values.")
    Test failed: Array contains non-finite values.
    """
    if sp.issparse(X):
        values = X.data
    else:
        values = X

    _assert_all_finite(
        values,
        allow_nan=allow_nan,
        estimator_name=estimator_name,
        input_name=input_name,
    )
|
|
|
|
|
def as_float_array(
    X, *, copy=True, force_all_finite="deprecated", ensure_all_finite=None
):
    """Convert an array-like to an array of floats.

    The new dtype will be np.float32 or np.float64, depending on the original
    type. The function can create a copy or modify the argument depending
    on the argument copy.

    Inputs that are neither NumPy arrays nor sparse containers (and
    ``np.matrix`` instances) are delegated to ``check_array`` with
    ``dtype=np.float64``. float32/float64 inputs are returned as is, or
    copied when ``copy`` is True. Any other dtype is cast: unsigned, signed
    and boolean dtypes of at most 4 bytes become float32, everything else
    float64.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        The input data.

    copy : bool, default=True
        If True, a copy of X will be created. If False, a copy may still be
        returned if X's dtype is not a floating point type.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
        possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

        .. deprecated:: 1.6
           `force_all_finite` was renamed to `ensure_all_finite` and will be removed
           in 1.8.

    ensure_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
        possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 1.6
           `force_all_finite` was renamed to `ensure_all_finite`.

    Returns
    -------
    XT : {ndarray, sparse matrix}
        An array of type float.

    Examples
    --------
    >>> from sklearn.utils import as_float_array
    >>> import numpy as np
    >>> array = np.array([0, 0, 1, 2, 2], dtype=np.int64)
    >>> as_float_array(array)
    array([0., 0., 1., 2., 2.])
    """
    ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite)

    input_is_sparse = sp.issparse(X)
    needs_full_validation = isinstance(X, np.matrix) or not (
        isinstance(X, np.ndarray) or input_is_sparse
    )
    if needs_full_validation:
        return check_array(
            X,
            accept_sparse=["csr", "csc", "coo"],
            dtype=np.float64,
            copy=copy,
            ensure_all_finite=ensure_all_finite,
            ensure_2d=False,
        )

    # From here on X is either an ndarray or a sparse container.
    if X.dtype in [np.float32, np.float64]:
        if not copy:
            return X
        if input_is_sparse:
            return X.copy()
        # Dense array: keep the memory layout of the input in the copy.
        layout = "F" if X.flags["F_CONTIGUOUS"] else "C"
        return X.copy(layout)

    # Small integer-like dtypes fit in float32; everything else needs float64.
    fits_in_float32 = X.dtype.kind in "uib" and X.dtype.itemsize <= 4
    return X.astype(np.float32 if fits_in_float32 else np.float64)
|
|
|
|
|
def _is_arraylike(x):
    """Returns whether the input is array-like.

    Sparse containers are never considered array-like. Any other object
    qualifies as soon as it exposes ``__len__``, ``shape`` or ``__array__``.
    """
    array_markers = ("__len__", "shape", "__array__")
    return not sp.issparse(x) and any(hasattr(x, marker) for marker in array_markers)
|
|
|
|
|
def _is_arraylike_not_scalar(array):
    """Return True if array is array-like and not a scalar."""
    if np.isscalar(array):
        return False
    return _is_arraylike(array)
|
|
|
|
|
def _use_interchange_protocol(X):
    """Use interchange protocol for non-pandas dataframes that follow the protocol.

    Note: at this point we chose not to use the interchange API on pandas dataframe
    to ensure strict behavioral backward compatibility with older versions of
    scikit-learn.
    """
    if _is_pandas_df(X):
        return False
    return hasattr(X, "__dataframe__")
|
|
|
|
|
def _num_features(X):
    """Return the number of features in an array-like X.

    This helper function tries hard to avoid to materialize an array version
    of X unless necessary. For instance, if X is a list of lists,
    this function will return the length of the first element, assuming
    that subsequent elements are all lists of the same length without
    checking.

    Parameters
    ----------
    X : array-like
        array-like to get the number of features.

    Returns
    -------
    features : int
        Number of features

    Raises
    ------
    TypeError
        If the number of features cannot be determined: X exposes none of
        ``__len__``, ``shape`` or ``__array__``; X has a shape with fewer
        than two dimensions; the first sample cannot be looked up (e.g. X is
        an empty sequence); the first sample is a ``str``, ``bytes`` or
        ``dict``; or ``len`` fails on the first sample.
    """
    type_ = type(X)
    if type_.__module__ == "builtins":
        type_name = type_.__qualname__
    else:
        type_name = f"{type_.__module__}.{type_.__qualname__}"
    message = f"Unable to find the number of features from X of type {type_name}"

    if not hasattr(X, "__len__") and not hasattr(X, "shape"):
        if not hasattr(X, "__array__"):
            raise TypeError(message)
        # Only convert X to a NumPy array when there is no cheaper way to
        # inspect it.
        X = np.asarray(X)

    if hasattr(X, "shape"):
        if not hasattr(X.shape, "__len__") or len(X.shape) <= 1:
            message += f" with shape {X.shape}"
            raise TypeError(message)
        return X.shape[1]

    try:
        first_sample = X[0]
    except LookupError as err:
        # An empty sequence (IndexError) or a mapping without key 0
        # (KeyError) has no first sample to inspect: report it with the same
        # TypeError as every other "cannot determine" case instead of leaking
        # an unrelated lookup error.
        raise TypeError(message) from err

    # Do not consider an array-like of strings or dicts to be a 2D array.
    if isinstance(first_sample, (str, bytes, dict)):
        message += f" where the samples are of type {type(first_sample).__qualname__}"
        raise TypeError(message)

    try:
        # The remaining samples are assumed to have the same length as the
        # first one; they are not inspected.
        return len(first_sample)
    except Exception as err:
        raise TypeError(message) from err
|
|
|
|
|
def _num_samples(x):
    """Return number of samples in array-like x.

    Objects exposing a callable ``fit`` attribute are rejected. Dataframes
    handled through the interchange protocol report their number of rows.
    Otherwise the first entry of ``x.shape`` is returned when it is an
    integer, and ``len(x)`` is used as a fallback.

    Raises
    ------
    TypeError
        If ``x`` has a callable ``fit``, exposes none of ``__len__``,
        ``shape`` or ``__array__``, has an empty shape (scalar), or does not
        support ``len``.
    """
    message = "Expected sequence or array-like, got %s" % type(x)

    if hasattr(x, "fit") and callable(x.fit):
        # Objects with a fit method are not data, even if they define a
        # length.
        raise TypeError(message)

    if _use_interchange_protocol(x):
        return x.__dataframe__().num_rows()

    if not (hasattr(x, "__len__") or hasattr(x, "shape")):
        if not hasattr(x, "__array__"):
            raise TypeError(message)
        x = np.asarray(x)

    shape = getattr(x, "shape", None)
    if shape is not None:
        if len(shape) == 0:
            raise TypeError(
                "Input should have at least 1 dimension i.e. satisfy "
                f"`len(x.shape) > 0`, got scalar `{x!r}` instead."
            )
        # Only trust shape[0] when it is an actual integer; otherwise fall
        # back to len(x) below.
        first_dim = shape[0]
        if isinstance(first_dim, numbers.Integral):
            return first_dim

    try:
        return len(x)
    except TypeError as type_error:
        raise TypeError(message) from type_error
|
|
|
|
|
def check_memory(memory):
    """Check that ``memory`` is joblib.Memory-like.

    joblib.Memory-like means that ``memory`` can be converted into a
    joblib.Memory instance (typically a str denoting the ``location``)
    or has the same interface (has a ``cache`` method).

    Parameters
    ----------
    memory : None, str or object with the joblib.Memory interface
        - If string, the location where to create the `joblib.Memory` interface.
        - If None, no caching is done and the Memory object is completely transparent.

    Returns
    -------
    memory : object with the joblib.Memory interface
        A correct joblib.Memory object.

    Raises
    ------
    ValueError
        If ``memory`` is not joblib.Memory-like.

    Examples
    --------
    >>> from sklearn.utils.validation import check_memory
    >>> check_memory("caching_dir")
    Memory(location=caching_dir/joblib)
    """
    # None or a location string: build the Memory object ourselves.
    if memory is None or isinstance(memory, str):
        return joblib.Memory(location=memory, verbose=0)

    # Anything exposing ``cache`` is accepted untouched.
    if hasattr(memory, "cache"):
        return memory

    raise ValueError(
        "'memory' should be None, a string or have the same"
        " interface as joblib.Memory."
        " Got memory='{}' instead.".format(memory)
    )
|
|
|
|
|
def check_consistent_length(*arrays):
    """Check that all arrays have consistent first dimensions.

    Checks whether all objects in arrays have the same shape or length.
    ``None`` entries are ignored.

    Parameters
    ----------
    *arrays : list or tuple of input objects.
        Objects that will be checked for consistent length.

    Raises
    ------
    ValueError
        If the objects do not all have the same number of samples.

    Examples
    --------
    >>> from sklearn.utils.validation import check_consistent_length
    >>> a = [1, 2, 3]
    >>> b = [2, 3, 4]
    >>> check_consistent_length(a, b)
    """
    lengths = [_num_samples(candidate) for candidate in arrays if candidate is not None]

    n_distinct = len(np.unique(lengths))
    if n_distinct <= 1:
        return

    raise ValueError(
        "Found input variables with inconsistent numbers of samples: %r"
        % [int(length) for length in lengths]
    )
|
|
|
|
|
def _make_indexable(iterable):
    """Ensure iterable supports indexing or convert to an indexable variant.

    Convert sparse matrices to csr and other non-indexable iterable to arrays.
    Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged.

    Parameters
    ----------
    iterable : {list, dataframe, ndarray, sparse matrix} or None
        Object to be converted to an indexable iterable.

    Returns
    -------
    object
        ``None`` if the input is ``None``, a CSR container for sparse input,
        the input itself when it exposes ``__getitem__`` or ``iloc``, and
        ``np.array(iterable)`` otherwise.
    """
    if iterable is None:
        return None

    if sp.issparse(iterable):
        return iterable.tocsr()

    supports_indexing = hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc")
    return iterable if supports_indexing else np.array(iterable)
|
|
|
|
|
def indexable(*iterables):
    """Make arrays indexable for cross-validation.

    Checks consistent length, passes through None, and ensures that everything
    can be indexed by converting sparse matrices to csr and converting
    non-iterable objects to arrays.

    Parameters
    ----------
    *iterables : {lists, dataframes, ndarrays, sparse matrices}
        List of objects to ensure sliceability.

    Returns
    -------
    result : list of {ndarray, sparse matrix, dataframe} or None
        Returns a list containing indexable arrays (i.e. NumPy array,
        sparse matrix, or dataframe) or `None`.

    Examples
    --------
    >>> from sklearn.utils import indexable
    >>> from scipy.sparse import csr_matrix
    >>> import numpy as np
    >>> iterables = [
    ...     [1, 2, 3], np.array([2, 3, 4]), None, csr_matrix([[5], [6], [7]])
    ... ]
    >>> indexable(*iterables)
    [[1, 2, 3], array([2, 3, 4]), None, <...Sparse...dtype 'int64'...shape (3, 1)>]
    """
    # Convert first, then validate the lengths of the converted objects.
    result = list(map(_make_indexable, iterables))
    check_consistent_length(*result)
    return result
|
|
|
|
|
def _ensure_sparse_format(
    sparse_container,
    accept_sparse,
    dtype,
    copy,
    ensure_all_finite,
    accept_large_sparse,
    estimator_name=None,
    input_name="",
):
    """Convert a sparse container to a given format.

    Checks the sparse format of `sparse_container` and converts if necessary.

    Parameters
    ----------
    sparse_container : sparse matrix or array
        Input to validate and convert.

    accept_sparse : str, bool or list/tuple of str
        String[s] representing allowed sparse matrix formats ('csc',
        'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but
        not in the allowed format, it will be converted to the first listed
        format. True allows the input to be any format. False means
        that a sparse matrix input will raise an error.

    dtype : str, type or None
        Data type of result. If None, the dtype of the input is preserved.

    copy : bool
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    ensure_all_finite : bool or 'allow-nan'
        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
        possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``ensure_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

    accept_large_sparse : bool
        Forwarded to ``_check_large_sparse`` together with the container.

    estimator_name : str, default=None
        The estimator name, used to construct the error message.

    input_name : str, default=""
        The data name used to construct the error message. In particular
        if `input_name` is "X" and the data has NaN values and
        allow_nan is False, the error message will link to the imputer
        documentation.

    Returns
    -------
    sparse_container_converted : sparse matrix or array
        Sparse container (matrix/array) that is ensured to have an allowed type.

    Raises
    ------
    TypeError
        If ``accept_sparse`` is False.
    ValueError
        If ``accept_sparse`` is an empty list/tuple or of an unsupported type.
    """
    # Resolve the target dtype before any conversion takes place.
    target_dtype = sparse_container.dtype if dtype is None else dtype

    # Remember the type name of the input; it is needed after a format change.
    original_type_name = type(sparse_container).__name__
    format_was_changed = False

    if isinstance(accept_sparse, str):
        accept_sparse = [accept_sparse]

    _check_large_sparse(sparse_container, accept_large_sparse)

    if accept_sparse is False:
        padded_input = " for " + input_name if input_name else ""
        raise TypeError(
            f"Sparse data was passed{padded_input}, but dense data is required. "
            "Use '.toarray()' to convert to a dense numpy array."
        )

    if isinstance(accept_sparse, (list, tuple)):
        if len(accept_sparse) == 0:
            raise ValueError(
                "When providing 'accept_sparse' as a tuple or list, it must contain at "
                "least one string value."
            )
        if sparse_container.format not in accept_sparse:
            # Not an accepted format: convert to the first accepted one.
            sparse_container = sparse_container.asformat(accept_sparse[0])
            format_was_changed = True
    elif accept_sparse is not True:
        raise ValueError(
            "Parameter 'accept_sparse' should be a string, boolean or list of strings."
            f" You provided 'accept_sparse={accept_sparse}'."
        )

    if target_dtype != sparse_container.dtype:
        sparse_container = sparse_container.astype(target_dtype)
    elif copy and not format_was_changed:
        # Neither the format nor the dtype changed: an explicit copy is needed
        # to honour ``copy=True``.
        sparse_container = sparse_container.copy()

    if ensure_all_finite:
        if hasattr(sparse_container, "data"):
            _assert_all_finite(
                sparse_container.data,
                allow_nan=ensure_all_finite == "allow-nan",
                estimator_name=estimator_name,
                input_name=input_name,
            )
        else:
            warnings.warn(
                f"Can't check {sparse_container.format} sparse matrix for nan or inf.",
                stacklevel=2,
            )

    if format_was_changed:
        # A conversion happened, so ``accept_sparse`` is a non-empty sequence
        # whose first entry is the format that was requested.
        _preserve_dia_indices_dtype(
            sparse_container, original_type_name, accept_sparse[0]
        )

    return sparse_container
|
|
|
|
|
def _ensure_no_complex_data(array):
    """Raise a ValueError if ``array`` has a complex dtype.

    Objects without a ``dtype``, with a ``dtype`` of None, or whose dtype has
    no ``kind`` attribute are accepted silently.
    """
    dtype = getattr(array, "dtype", None)
    if dtype is None:
        return

    # NumPy uses the kind code "c" for complex floating-point dtypes.
    if getattr(dtype, "kind", None) == "c":
        raise ValueError("Complex data not supported\n{}\n".format(array))
|
|
|
|
|
def _check_estimator_name(estimator):
    """Return a display name for ``estimator``.

    Strings are returned unchanged, ``None`` yields ``None`` and any other
    object is represented by the name of its class.
    """
    if estimator is None:
        return None
    if isinstance(estimator, str):
        return estimator
    return estimator.__class__.__name__
|
|
|
|
|
def _pandas_dtype_needs_early_conversion(pd_dtype):
    """Return True if pandas extension pd_dtype need to be converted early.

    Parameters
    ----------
    pd_dtype : dtype
        A NumPy or pandas dtype, as found on a pandas object.

    Returns
    -------
    bool
        True for boolean dtypes and for float or integer extension dtypes;
        False for sparse dtypes, for dtypes that are not pandas extension
        dtypes, and for any other extension dtype.
    """
    from pandas import SparseDtype
    from pandas.api.types import (
        is_bool_dtype,
        is_float_dtype,
        is_integer_dtype,
    )

    # The boolean test comes first, so it also applies to dtypes that the
    # checks below would reject.
    if is_bool_dtype(pd_dtype):
        return True

    if isinstance(pd_dtype, SparseDtype):
        return False

    try:
        from pandas.api.types import is_extension_array_dtype
    except ImportError:
        return False

    # Sparse dtypes were already ruled out above, so only the extension-dtype
    # test is needed here.
    if not is_extension_array_dtype(pd_dtype):
        return False

    # Among extension dtypes, only float and integer ones are converted early.
    return bool(is_float_dtype(pd_dtype) or is_integer_dtype(pd_dtype))
|
|
|
|
|
def _is_extension_array_dtype(array):
    """Return True if ``array`` has a ``dtype`` that exposes ``na_value``."""
    if not hasattr(array, "dtype"):
        return False
    return hasattr(array.dtype, "na_value")
|
|
|
|
|
def check_array( |
|
array, |
|
accept_sparse=False, |
|
*, |
|
accept_large_sparse=True, |
|
dtype="numeric", |
|
order=None, |
|
copy=False, |
|
force_writeable=False, |
|
force_all_finite="deprecated", |
|
ensure_all_finite=None, |
|
ensure_non_negative=False, |
|
ensure_2d=True, |
|
allow_nd=False, |
|
ensure_min_samples=1, |
|
ensure_min_features=1, |
|
estimator=None, |
|
input_name="", |
|
): |
|
"""Input validation on an array, list, sparse matrix or similar. |
|
|
|
By default, the input is checked to be a non-empty 2D array containing |
|
only finite values. If the dtype of the array is object, attempt |
|
converting to float, raising on failure. |
|
|
|
Parameters |
|
---------- |
|
array : object |
|
Input object to check / convert. |
|
|
|
accept_sparse : str, bool or list/tuple of str, default=False |
|
String[s] representing allowed sparse matrix formats, such as 'csc', |
|
'csr', etc. If the input is sparse but not in the allowed format, |
|
it will be converted to the first listed format. True allows the input |
|
to be any format. False means that a sparse matrix input will |
|
raise an error. |
|
|
|
accept_large_sparse : bool, default=True |
|
If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by |
|
accept_sparse, accept_large_sparse=False will cause it to be accepted |
|
only if its indices are stored with a 32-bit dtype. |
|
|
|
.. versionadded:: 0.20 |
|
|
|
dtype : 'numeric', type, list of type or None, default='numeric' |
|
Data type of result. If None, the dtype of the input is preserved. |
|
If "numeric", dtype is preserved unless array.dtype is object. |
|
If dtype is a list of types, conversion on the first type is only |
|
performed if the dtype of the input is not in the list. |
|
|
|
order : {'F', 'C'} or None, default=None |
|
Whether an array will be forced to be fortran or c-style. |
|
When order is None (default), then if copy=False, nothing is ensured |
|
about the memory layout of the output array; otherwise (copy=True) |
|
the memory layout of the returned array is kept as close as possible |
|
to the original array. |
|
|
|
copy : bool, default=False |
|
Whether a forced copy will be triggered. If copy=False, a copy might |
|
be triggered by a conversion. |
|
|
|
force_writeable : bool, default=False |
|
Whether to force the output array to be writeable. If True, the returned array |
|
is guaranteed to be writeable, which may require a copy. Otherwise the |
|
writeability of the input array is preserved. |
|
|
|
.. versionadded:: 1.6 |
|
|
|
force_all_finite : bool or 'allow-nan', default=True |
|
Whether to raise an error on np.inf, np.nan, pd.NA in array. The |
|
possibilities are: |
|
|
|
- True: Force all values of array to be finite. |
|
- False: accepts np.inf, np.nan, pd.NA in array. |
|
- 'allow-nan': accepts only np.nan and pd.NA values in array. Values |
|
cannot be infinite. |
|
|
|
.. versionadded:: 0.20 |
|
``force_all_finite`` accepts the string ``'allow-nan'``. |
|
|
|
.. versionchanged:: 0.23 |
|
Accepts `pd.NA` and converts it into `np.nan` |
|
|
|
.. deprecated:: 1.6 |
|
`force_all_finite` was renamed to `ensure_all_finite` and will be removed |
|
in 1.8. |
|
|
|
ensure_all_finite : bool or 'allow-nan', default=True |
|
Whether to raise an error on np.inf, np.nan, pd.NA in array. The |
|
possibilities are: |
|
|
|
- True: Force all values of array to be finite. |
|
- False: accepts np.inf, np.nan, pd.NA in array. |
|
- 'allow-nan': accepts only np.nan and pd.NA values in array. Values |
|
cannot be infinite. |
|
|
|
.. versionadded:: 1.6 |
|
`force_all_finite` was renamed to `ensure_all_finite`. |
|
|
|
ensure_non_negative : bool, default=False |
|
Make sure the array has only non-negative values. If True, an array that |
|
contains negative values will raise a ValueError. |
|
|
|
.. versionadded:: 1.6 |
|
|
|
ensure_2d : bool, default=True |
|
Whether to raise a value error if array is not 2D. |
|
|
|
allow_nd : bool, default=False |
|
Whether to allow array.ndim > 2. |
|
|
|
ensure_min_samples : int, default=1 |
|
Make sure that the array has a minimum number of samples in its first |
|
axis (rows for a 2D array). Setting to 0 disables this check. |
|
|
|
ensure_min_features : int, default=1 |
|
Make sure that the 2D array has some minimum number of features |
|
(columns). The default value of 1 rejects empty datasets. |
|
This check is only enforced when the input data has effectively 2 |
|
dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0 |
|
disables this check. |
|
|
|
estimator : str or estimator instance, default=None |
|
If passed, include the name of the estimator in warning messages. |
|
|
|
input_name : str, default="" |
|
The data name used to construct the error message. In particular |
|
if `input_name` is "X" and the data has NaN values and |
|
allow_nan is False, the error message will link to the imputer |
|
documentation. |
|
|
|
.. versionadded:: 1.1.0 |
|
|
|
Returns |
|
------- |
|
array_converted : object |
|
The converted and validated array. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.utils.validation import check_array |
|
>>> X = [[1, 2, 3], [4, 5, 6]] |
|
>>> X_checked = check_array(X) |
|
>>> X_checked |
|
array([[1, 2, 3], [4, 5, 6]]) |
|
""" |
|
ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite) |
|
|
|
if isinstance(array, np.matrix): |
|
raise TypeError( |
|
"np.matrix is not supported. Please convert to a numpy array with " |
|
"np.asarray. For more information see: " |
|
"https://numpy.org/doc/stable/reference/generated/numpy.matrix.html" |
|
) |
|
|
|
xp, is_array_api_compliant = get_namespace(array) |
|
|
|
|
|
|
|
array_orig = array |
|
|
|
|
|
dtype_numeric = isinstance(dtype, str) and dtype == "numeric" |
|
|
|
dtype_orig = getattr(array, "dtype", None) |
|
if not is_array_api_compliant and not hasattr(dtype_orig, "kind"): |
|
|
|
dtype_orig = None |
|
|
|
|
|
|
|
dtypes_orig = None |
|
pandas_requires_conversion = False |
|
|
|
type_if_series = None |
|
if hasattr(array, "dtypes") and hasattr(array.dtypes, "__array__"): |
|
|
|
|
|
with suppress(ImportError): |
|
from pandas import SparseDtype |
|
|
|
def is_sparse(dtype): |
|
return isinstance(dtype, SparseDtype) |
|
|
|
if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any(): |
|
warnings.warn( |
|
"pandas.DataFrame with sparse columns found." |
|
"It will be converted to a dense numpy array." |
|
) |
|
|
|
dtypes_orig = list(array.dtypes) |
|
pandas_requires_conversion = any( |
|
_pandas_dtype_needs_early_conversion(i) for i in dtypes_orig |
|
) |
|
if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig): |
|
dtype_orig = np.result_type(*dtypes_orig) |
|
elif pandas_requires_conversion and any(d == object for d in dtypes_orig): |
|
|
|
dtype_orig = object |
|
|
|
elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr( |
|
array, "dtype" |
|
): |
|
|
|
type_if_series = type(array) |
|
pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype) |
|
if isinstance(array.dtype, np.dtype): |
|
dtype_orig = array.dtype |
|
else: |
|
|
|
dtype_orig = None |
|
|
|
if dtype_numeric: |
|
if ( |
|
dtype_orig is not None |
|
and hasattr(dtype_orig, "kind") |
|
and dtype_orig.kind == "O" |
|
): |
|
|
|
dtype = xp.float64 |
|
else: |
|
dtype = None |
|
|
|
if isinstance(dtype, (list, tuple)): |
|
if dtype_orig is not None and dtype_orig in dtype: |
|
|
|
dtype = None |
|
else: |
|
|
|
|
|
dtype = dtype[0] |
|
|
|
if pandas_requires_conversion: |
|
|
|
|
|
|
|
new_dtype = dtype_orig if dtype is None else dtype |
|
array = array.astype(new_dtype) |
|
|
|
dtype = None |
|
|
|
if ensure_all_finite not in (True, False, "allow-nan"): |
|
raise ValueError( |
|
"ensure_all_finite should be a bool or 'allow-nan'. Got " |
|
f"{ensure_all_finite!r} instead." |
|
) |
|
|
|
if dtype is not None and _is_numpy_namespace(xp): |
|
|
|
dtype = np.dtype(dtype) |
|
|
|
estimator_name = _check_estimator_name(estimator) |
|
context = " by %s" % estimator_name if estimator is not None else "" |
|
|
|
|
|
if hasattr(array, "sparse") and array.ndim > 1: |
|
with suppress(ImportError): |
|
from pandas import SparseDtype |
|
|
|
def is_sparse(dtype): |
|
return isinstance(dtype, SparseDtype) |
|
|
|
if array.dtypes.apply(is_sparse).all(): |
|
|
|
array = array.sparse.to_coo() |
|
if array.dtype == np.dtype("object"): |
|
unique_dtypes = set([dt.subtype.name for dt in array_orig.dtypes]) |
|
if len(unique_dtypes) > 1: |
|
raise ValueError( |
|
"Pandas DataFrame with mixed sparse extension arrays " |
|
"generated a sparse matrix with object dtype which " |
|
"can not be converted to a scipy sparse matrix." |
|
"Sparse extension arrays should all have the same " |
|
"numeric type." |
|
) |
|
|
|
if sp.issparse(array): |
|
_ensure_no_complex_data(array) |
|
array = _ensure_sparse_format( |
|
array, |
|
accept_sparse=accept_sparse, |
|
dtype=dtype, |
|
copy=copy, |
|
ensure_all_finite=ensure_all_finite, |
|
accept_large_sparse=accept_large_sparse, |
|
estimator_name=estimator_name, |
|
input_name=input_name, |
|
) |
|
if ensure_2d and array.ndim < 2: |
|
raise ValueError( |
|
f"Expected 2D input, got input with shape {array.shape}.\n" |
|
"Reshape your data either using array.reshape(-1, 1) if " |
|
"your data has a single feature or array.reshape(1, -1) " |
|
"if it contains a single sample." |
|
) |
|
else: |
|
|
|
|
|
|
|
|
|
|
|
with warnings.catch_warnings(): |
|
try: |
|
warnings.simplefilter("error", ComplexWarning) |
|
if dtype is not None and xp.isdtype(dtype, "integral"): |
|
|
|
|
|
|
|
array = _asarray_with_order(array, order=order, xp=xp) |
|
if xp.isdtype(array.dtype, ("real floating", "complex floating")): |
|
_assert_all_finite( |
|
array, |
|
allow_nan=False, |
|
msg_dtype=dtype, |
|
estimator_name=estimator_name, |
|
input_name=input_name, |
|
) |
|
array = xp.astype(array, dtype, copy=False) |
|
else: |
|
array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp) |
|
except ComplexWarning as complex_warning: |
|
raise ValueError( |
|
"Complex data not supported\n{}\n".format(array) |
|
) from complex_warning |
|
|
|
|
|
|
|
|
|
|
|
_ensure_no_complex_data(array) |
|
|
|
if ensure_2d: |
|
|
|
if array.ndim == 0: |
|
raise ValueError( |
|
"Expected 2D array, got scalar array instead:\narray={}.\n" |
|
"Reshape your data either using array.reshape(-1, 1) if " |
|
"your data has a single feature or array.reshape(1, -1) " |
|
"if it contains a single sample.".format(array) |
|
) |
|
|
|
if array.ndim == 1: |
|
|
|
if type_if_series is not None: |
|
msg = ( |
|
f"Expected a 2-dimensional container but got {type_if_series} " |
|
"instead. Pass a DataFrame containing a single row (i.e. " |
|
"single sample) or a single column (i.e. single feature) " |
|
"instead." |
|
) |
|
else: |
|
msg = ( |
|
f"Expected 2D array, got 1D array instead:\narray={array}.\n" |
|
"Reshape your data either using array.reshape(-1, 1) if " |
|
"your data has a single feature or array.reshape(1, -1) " |
|
"if it contains a single sample." |
|
) |
|
raise ValueError(msg) |
|
|
|
if dtype_numeric and hasattr(array.dtype, "kind") and array.dtype.kind in "USV": |
|
raise ValueError( |
|
"dtype='numeric' is not compatible with arrays of bytes/strings." |
|
"Convert your data to numeric values explicitly instead." |
|
) |
|
if not allow_nd and array.ndim >= 3: |
|
raise ValueError( |
|
"Found array with dim %d. %s expected <= 2." |
|
% (array.ndim, estimator_name) |
|
) |
|
|
|
if ensure_all_finite: |
|
_assert_all_finite( |
|
array, |
|
input_name=input_name, |
|
estimator_name=estimator_name, |
|
allow_nan=ensure_all_finite == "allow-nan", |
|
) |
|
|
|
if copy: |
|
if _is_numpy_namespace(xp): |
|
|
|
if np.may_share_memory(array, array_orig): |
|
array = _asarray_with_order( |
|
array, dtype=dtype, order=order, copy=True, xp=xp |
|
) |
|
else: |
|
|
|
array = _asarray_with_order( |
|
array, dtype=dtype, order=order, copy=True, xp=xp |
|
) |
|
|
|
if ensure_min_samples > 0: |
|
n_samples = _num_samples(array) |
|
if n_samples < ensure_min_samples: |
|
raise ValueError( |
|
"Found array with %d sample(s) (shape=%s) while a" |
|
" minimum of %d is required%s." |
|
% (n_samples, array.shape, ensure_min_samples, context) |
|
) |
|
|
|
if ensure_min_features > 0 and array.ndim == 2: |
|
n_features = array.shape[1] |
|
if n_features < ensure_min_features: |
|
raise ValueError( |
|
"Found array with %d feature(s) (shape=%s) while" |
|
" a minimum of %d is required%s." |
|
% (n_features, array.shape, ensure_min_features, context) |
|
) |
|
|
|
if ensure_non_negative: |
|
whom = input_name |
|
if estimator_name: |
|
whom += f" in {estimator_name}" |
|
check_non_negative(array, whom) |
|
|
|
if force_writeable: |
|
|
|
|
|
copy_params = {"order": "K"} if not sp.issparse(array) else {} |
|
|
|
array_data = array.data if sp.issparse(array) else array |
|
flags = getattr(array_data, "flags", None) |
|
if not getattr(flags, "writeable", True): |
|
|
|
|
|
|
|
|
|
|
|
|
|
if _is_pandas_df_or_series(array_orig): |
|
try: |
|
|
|
|
|
|
|
|
|
|
|
array_data.flags.writeable = True |
|
except ValueError: |
|
array = array.copy(**copy_params) |
|
else: |
|
array = array.copy(**copy_params) |
|
|
|
return array |
|
|
|
|
|
def _check_large_sparse(X, accept_large_sparse=False): |
|
"""Raise a ValueError if X has 64bit indices and accept_large_sparse=False""" |
|
if not accept_large_sparse: |
|
supported_indices = ["int32"] |
|
if X.format == "coo": |
|
index_keys = ["col", "row"] |
|
elif X.format in ["csr", "csc", "bsr"]: |
|
index_keys = ["indices", "indptr"] |
|
else: |
|
return |
|
for key in index_keys: |
|
indices_datatype = getattr(X, key).dtype |
|
if indices_datatype not in supported_indices: |
|
raise ValueError( |
|
"Only sparse matrices with 32-bit integer indices are accepted." |
|
f" Got {indices_datatype} indices. Please do report a minimal" |
|
" reproducer on scikit-learn issue tracker so that support for" |
|
" your use-case can be studied by maintainers. See:" |
|
" https://scikit-learn.org/dev/developers/minimal_reproducer.html" |
|
) |
|
|
|
|
|
def check_X_y(
    X,
    y,
    accept_sparse=False,
    *,
    accept_large_sparse=True,
    dtype="numeric",
    order=None,
    copy=False,
    force_writeable=False,
    force_all_finite="deprecated",
    ensure_all_finite=None,
    ensure_2d=True,
    allow_nd=False,
    multi_output=False,
    ensure_min_samples=1,
    ensure_min_features=1,
    y_numeric=False,
    estimator=None,
):
    """Input validation for standard estimators.

    Validates `X` with :func:`check_array`, validates `y` with the dedicated
    target checks, and finally verifies that both have a consistent length.
    By default `X` must be 2D, non-empty and contain only finite values, while
    `y` must be 1D and finite. Pass ``multi_output=True`` to allow a 2D or
    sparse `y` (multi-label targets). If the dtype of `X` is object, a
    conversion to float is attempted and a failure raises an error.

    Parameters
    ----------
    X : {ndarray, list, sparse matrix}
        Input data.

    y : {ndarray, list, sparse matrix}
        Labels.

    accept_sparse : str, bool or list of str, default=False
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

    accept_large_sparse : bool, default=True
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.

        .. versionadded:: 0.20

    dtype : 'numeric', type, list of type or None, default='numeric'
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : {'F', 'C'}, default=None
        Whether an array will be forced to be fortran or c-style. If
        `None`, then the input data's order is preserved when possible.

    copy : bool, default=False
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_writeable : bool, default=False
        Whether to force the output array to be writeable. If True, the returned array
        is guaranteed to be writeable, which may require a copy. Otherwise the
        writeability of the input array is preserved.

        .. versionadded:: 1.6

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter
        does not influence whether y can have np.inf, np.nan, pd.NA values.
        The possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

        .. deprecated:: 1.6
           `force_all_finite` was renamed to `ensure_all_finite` and will be removed
           in 1.8.

    ensure_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter
        does not influence whether y can have np.inf, np.nan, pd.NA values.
        The possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 1.6
           `force_all_finite` was renamed to `ensure_all_finite`.

    ensure_2d : bool, default=True
        Whether to raise a value error if X is not 2D.

    allow_nd : bool, default=False
        Whether to allow X.ndim > 2.

    multi_output : bool, default=False
        Whether to allow 2D y (array or sparse matrix). If false, y will be
        validated as a vector. In both cases y cannot have np.nan or np.inf
        values.

    ensure_min_samples : int, default=1
        Make sure that X has a minimum number of samples in its first
        axis (rows for a 2D array).

    ensure_min_features : int, default=1
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when X has effectively 2 dimensions or
        is originally 1D and ``ensure_2d`` is True. Setting to 0 disables
        this check.

    y_numeric : bool, default=False
        Whether to ensure that y has a numeric type. If dtype of y is object,
        it is converted to float64. Should only be used for regression
        algorithms.

    estimator : str or estimator instance, default=None
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    X_converted : object
        The converted and validated X.

    y_converted : object
        The converted and validated y.

    Raises
    ------
    ValueError
        If `y` is None.

    Examples
    --------
    >>> from sklearn.utils.validation import check_X_y
    >>> X = [[1, 2], [3, 4], [5, 6]]
    >>> y = [1, 2, 3]
    >>> X, y = check_X_y(X, y)
    >>> X
    array([[1, 2],
           [3, 4],
           [5, 6]])
    >>> y
    array([1, 2, 3])
    """
    if y is None:
        # Fall back on a generic name when no estimator was supplied.
        estimator_name = (
            "estimator" if estimator is None else _check_estimator_name(estimator)
        )
        raise ValueError(
            f"{estimator_name} requires y to be passed, but the target y is None"
        )

    # Reconcile the deprecated `force_all_finite` with its replacement.
    ensure_all_finite = _deprecate_force_all_finite(force_all_finite, ensure_all_finite)

    x_checks = {
        "accept_sparse": accept_sparse,
        "accept_large_sparse": accept_large_sparse,
        "dtype": dtype,
        "order": order,
        "copy": copy,
        "force_writeable": force_writeable,
        "ensure_all_finite": ensure_all_finite,
        "ensure_2d": ensure_2d,
        "allow_nd": allow_nd,
        "ensure_min_samples": ensure_min_samples,
        "ensure_min_features": ensure_min_features,
        "estimator": estimator,
        "input_name": "X",
    }
    X = check_array(X, **x_checks)
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
    check_consistent_length(X, y)

    return X, y
|
|
|
|
|
def _check_y(y, multi_output=False, y_numeric=False, estimator=None):
    """Validate the target `y` on behalf of `check_X_y`.

    Parameters
    ----------
    y : array-like or sparse matrix
        Target to validate.

    multi_output : bool, default=False
        If True, `y` is validated with `check_array` (CSR sparse input
        accepted, any number of dimensions, finite values required).
        Otherwise `y` is converted with `column_or_1d` and then checked for
        non-finite and complex values.

    y_numeric : bool, default=False
        If True and the validated `y` has an object dtype, it is cast to
        float64.

    estimator : str or estimator instance, default=None
        Used to name the estimator in error messages.

    Returns
    -------
    y : array or sparse matrix
        The validated target.
    """
    if not multi_output:
        estimator_name = _check_estimator_name(estimator)
        y = column_or_1d(y, warn=True)
        _assert_all_finite(y, input_name="y", estimator_name=estimator_name)
        _ensure_no_complex_data(y)
    else:
        y = check_array(
            y,
            accept_sparse="csr",
            ensure_all_finite=True,
            ensure_2d=False,
            dtype=None,
            input_name="y",
            estimator=estimator,
        )

    # Only object-dtype targets are converted when a numeric y is requested.
    has_object_dtype = (
        y_numeric and hasattr(y.dtype, "kind") and y.dtype.kind == "O"
    )
    if has_object_dtype:
        y = y.astype(np.float64)

    return y
|
|
|
|
|
def column_or_1d(y, *, dtype=None, warn=False, device=None):
    """Flatten a 1d array or a single-column 2d array, else raise an error.

    Parameters
    ----------
    y : array-like
        Input data.

    dtype : data-type, default=None
        Data type for `y`.

        .. versionadded:: 1.2

    warn : bool, default=False
        Whether to emit a `DataConversionWarning` when a column vector of
        shape (n_samples, 1) is flattened.

    device : device, default=None
        `device` object.
        See the :ref:`Array API User Guide <array_api>` for more details.

        .. versionadded:: 1.6

    Returns
    -------
    y : ndarray
        Output data, reshaped to one dimension.

    Raises
    ------
    ValueError
        If `y` is neither a 1D array nor a 2D array with a single column.

    Examples
    --------
    >>> from sklearn.utils.validation import column_or_1d
    >>> column_or_1d([1, 1])
    array([1, 1])
    """
    xp, _ = get_namespace(y)
    y = check_array(
        y,
        ensure_2d=False,
        dtype=dtype,
        input_name="y",
        ensure_all_finite=False,
        ensure_min_samples=0,
    )

    shape = y.shape
    n_dims = len(shape)
    is_column_vector = n_dims == 2 and shape[1] == 1

    if n_dims != 1 and not is_column_vector:
        raise ValueError(
            "y should be a 1d array, got an array of shape {} instead.".format(shape)
        )

    if is_column_vector and warn:
        warnings.warn(
            (
                "A column-vector y was passed when a 1d array was"
                " expected. Please change the shape of y to "
                "(n_samples, ), for example using ravel()."
            ),
            DataConversionWarning,
            stacklevel=2,
        )

    flattened = xp.reshape(y, (-1,))
    return _asarray_with_order(flattened, order="C", xp=xp, device=device)
|
|
|
|
|
def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    Parameters
    ----------
    seed : None, int or instance of RandomState
        If seed is None, return the RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.

    Returns
    -------
    :class:`numpy:numpy.random.RandomState`
        The random state object based on `seed` parameter.

    Raises
    ------
    ValueError
        If `seed` is none of the supported kinds of value.

    Examples
    --------
    >>> from sklearn.utils.validation import check_random_state
    >>> check_random_state(42)
    RandomState(MT19937) at 0x...
    """
    if seed is None or seed is np.random:
        # The `np.random` module itself is accepted as an alias for the singleton.
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    # An f-string is used on purpose: `"%r" % seed` unpacks tuple seeds, which
    # would raise a TypeError (or print the wrong value) instead of reporting
    # the invalid seed with the documented ValueError.
    raise ValueError(
        f"{seed!r} cannot be used to seed a numpy.random.RandomState instance"
    )
|
|
|
|
|
def has_fit_parameter(estimator, parameter):
    """Check whether the estimator's fit method supports the given parameter.

    Parameters
    ----------
    estimator : object
        An estimator to inspect.

    parameter : str
        The searched parameter.

    Returns
    -------
    is_parameter : bool
        Whether the parameter was found to be a named parameter of the
        estimator's fit method. False when the estimator has no `fit`
        attribute at all.

    Examples
    --------
    >>> from sklearn.svm import SVC
    >>> from sklearn.utils.validation import has_fit_parameter
    >>> has_fit_parameter(SVC(), "sample_weight")
    True
    """
    # Objects without a `fit` attribute are reported as not supporting the
    # parameter rather than raising here.
    if not hasattr(estimator, "fit"):
        return False

    fit_parameters = signature(estimator.fit).parameters
    return parameter in fit_parameters
|
|
|
|
|
def check_symmetric(array, *, tol=1e-10, raise_warning=True, raise_exception=False):
    """Make sure that array is 2D, square and symmetric.

    A symmetric input is returned untouched. Otherwise a symmetrized version
    is returned, optionally after emitting a warning, or an exception is
    raised if requested.

    Parameters
    ----------
    array : {ndarray, sparse matrix}
        Input object to check / convert. Must be two-dimensional and square,
        otherwise a ValueError will be raised.

    tol : float, default=1e-10
        Absolute tolerance for equivalence of arrays. Default = 1E-10.

    raise_warning : bool, default=True
        If True then raise a warning if conversion is required.

    raise_exception : bool, default=False
        If True then raise an exception if array is not symmetric.

    Returns
    -------
    array_sym : {ndarray, sparse matrix}
        Symmetrized version of the input array, i.e. the average of array
        and array.transpose(). If sparse, then duplicate entries are first
        summed and zeros are eliminated.

    Raises
    ------
    ValueError
        If `array` is not 2-dimensional and square, or if it is not symmetric
        and `raise_exception` is True.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.validation import check_symmetric
    >>> symmetric_array = np.array([[0, 1, 2], [1, 0, 1], [2, 1, 0]])
    >>> check_symmetric(symmetric_array)
    array([[0, 1, 2],
           [1, 0, 1],
           [2, 1, 0]])
    >>> from scipy.sparse import csr_matrix
    >>> sparse_symmetric_array = csr_matrix(symmetric_array)
    >>> check_symmetric(sparse_symmetric_array)
    <Compressed Sparse Row sparse matrix of dtype 'int64'
        with 6 stored elements and shape (3, 3)>
    """
    if array.ndim != 2 or array.shape[0] != array.shape[1]:
        raise ValueError(
            "array must be 2-dimensional and square. shape = {0}".format(array.shape)
        )

    input_is_sparse = sp.issparse(array)

    if input_is_sparse:
        delta = array - array.T
        # Convert to a format whose stored values are reachable via `.data`.
        if delta.format not in ("csr", "csc", "coo"):
            delta = delta.tocsr()
        is_symmetric = np.all(abs(delta.data) < tol)
    else:
        is_symmetric = np.allclose(array, array.T, atol=tol)

    if is_symmetric:
        return array

    if raise_exception:
        raise ValueError("Array must be symmetric")
    if raise_warning:
        warnings.warn(
            (
                "Array is not symmetric, and will be converted "
                "to symmetric by average with its transpose."
            ),
            stacklevel=2,
        )

    averaged = 0.5 * (array + array.T)
    if input_is_sparse:
        # Hand back the same sparse format that was passed in.
        to_input_format = getattr(averaged, "to" + array.format)
        return to_input_format()
    return averaged
|
|
|
|
|
def _is_fitted(estimator, attributes=None, all_or_any=all): |
|
"""Determine if an estimator is fitted |
|
|
|
Parameters |
|
---------- |
|
estimator : estimator instance |
|
Estimator instance for which the check is performed. |
|
|
|
attributes : str, list or tuple of str, default=None |
|
Attribute name(s) given as string or a list/tuple of strings |
|
Eg.: ``["coef_", "estimator_", ...], "coef_"`` |
|
|
|
If `None`, `estimator` is considered fitted if there exist an |
|
attribute that ends with a underscore and does not start with double |
|
underscore. |
|
|
|
all_or_any : callable, {all, any}, default=all |
|
Specify whether all or any of the given attributes must exist. |
|
|
|
Returns |
|
------- |
|
fitted : bool |
|
Whether the estimator is fitted. |
|
""" |
|
if attributes is not None: |
|
if not isinstance(attributes, (list, tuple)): |
|
attributes = [attributes] |
|
return all_or_any([hasattr(estimator, attr) for attr in attributes]) |
|
|
|
if hasattr(estimator, "__sklearn_is_fitted__"): |
|
return estimator.__sklearn_is_fitted__() |
|
|
|
fitted_attrs = [ |
|
v for v in vars(estimator) if v.endswith("_") and not v.startswith("__") |
|
] |
|
return len(fitted_attrs) > 0 |
|
|
|
|
|
def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all):
    """Perform is_fitted validation for estimator.

    Checks if the estimator is fitted by verifying the presence of
    fitted attributes (ending with a trailing underscore) and otherwise
    raises a :class:`~sklearn.exceptions.NotFittedError` with the given message.

    If an estimator does not set any attributes with a trailing underscore, it
    can define a ``__sklearn_is_fitted__`` method returning a boolean to
    specify if the estimator is fitted or not. See
    :ref:`sphx_glr_auto_examples_developing_estimators_sklearn_is_fitted.py`
    for an example on how to use the API.

    If no `attributes` are passed, this function will pass if an estimator is
    stateless. An estimator can indicate it's stateless by setting the
    `requires_fit` tag. See :ref:`estimator_tags` for more information. Note
    that the `requires_fit` tag is ignored if `attributes` are passed.

    Parameters
    ----------
    estimator : estimator instance
        Estimator instance for which the check is performed.

    attributes : str, list or tuple of str, default=None
        Attribute name(s) given as string or a list/tuple of strings
        Eg.: ``["coef_", "estimator_", ...], "coef_"``

        If `None`, `estimator` is considered fitted if there exist an
        attribute that ends with a underscore and does not start with double
        underscore.

    msg : str, default=None
        The default error message is, "This %(name)s instance is not fitted
        yet. Call 'fit' with appropriate arguments before using this
        estimator."

        For custom messages if "%(name)s" is present in the message string,
        it is substituted for the estimator name.

        Eg. : "Estimator, %(name)s, must be fitted before sparsifying".

    all_or_any : callable, {all, any}, default=all
        Specify whether all or any of the given attributes must exist.

    Raises
    ------
    TypeError
        If the estimator is a class or not an estimator instance

    NotFittedError
        If the attributes are not found.

    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.utils.validation import check_is_fitted
    >>> from sklearn.exceptions import NotFittedError
    >>> lr = LogisticRegression()
    >>> try:
    ...     check_is_fitted(lr)
    ... except NotFittedError as exc:
    ...     print(f"Model is not fitted yet.")
    Model is not fitted yet.
    >>> lr.fit([[1, 2], [1, 3]], [1, 0])
    LogisticRegression()
    >>> check_is_fitted(lr)
    """
    if isclass(estimator):
        raise TypeError("{} is a class, not an instance.".format(estimator))
    if msg is None:
        msg = (
            "This %(name)s instance is not fitted yet. Call 'fit' with "
            "appropriate arguments before using this estimator."
        )

    if not hasattr(estimator, "fit"):
        # Wrap the operand in a 1-tuple: a bare tuple `estimator` would
        # otherwise be unpacked by `%` and corrupt or break the message.
        raise TypeError("%s is not an estimator instance." % (estimator,))

    tags = get_tags(estimator)

    # Stateless estimators pass, unless specific attributes were requested.
    if not tags.requires_fit and attributes is None:
        return

    if not _is_fitted(estimator, attributes, all_or_any):
        raise NotFittedError(msg % {"name": type(estimator).__name__})
|
|
|
|
|
def _estimator_has(attr, *, delegates=("estimator_", "estimator")): |
|
"""Check if we can delegate a method to the underlying estimator. |
|
|
|
We check the `delegates` in the order they are passed. By default, we first check |
|
the fitted estimator if available, otherwise we check the unfitted estimator. |
|
|
|
Parameters |
|
---------- |
|
attr : str |
|
Name of the attribute the delegate might or might not have. |
|
|
|
delegates: tuple of str, default=("estimator_", "estimator") |
|
A tuple of sub-estimator(s) to check if we can delegate the `attr` method. |
|
|
|
Returns |
|
------- |
|
check : function |
|
Function to check if the delegate has the attribute. |
|
|
|
Raises |
|
------ |
|
ValueError |
|
Raised when none of the delegates are present in the object. |
|
""" |
|
|
|
def check(self): |
|
for delegate in delegates: |
|
|
|
|
|
|
|
if hasattr(self, delegate): |
|
delegator = getattr(self, delegate) |
|
if isinstance(delegator, Sequence): |
|
return getattr(delegator[0], attr) |
|
else: |
|
return getattr(delegator, attr) |
|
|
|
raise ValueError(f"None of the delegates {delegates} are present in the class.") |
|
|
|
return check |
|
|
|
|
|
def check_non_negative(X, whom):
    """
    Check if there is any negative value in an array.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Input data.

    whom : str
        Who passed X to this function.

    Raises
    ------
    ValueError
        If the smallest inspected value of `X` is negative. For sparse input
        only the explicitly stored values are inspected.
    """
    xp, _ = get_namespace(X)

    if not sp.issparse(X):
        X_min = xp.min(X)
    else:
        # Work on the stored values only instead of calling `X.min()`.
        if X.format in ("lil", "dok"):
            X = X.tocsr()
        stored_values = X.data
        X_min = 0 if stored_values.size == 0 else stored_values.min()

    if X_min < 0:
        raise ValueError(f"Negative values in data passed to {whom}.")
|
|
|
|
|
def check_scalar(
    x,
    name,
    target_type,
    *,
    min_val=None,
    max_val=None,
    include_boundaries="both",
):
    """Validate scalar parameters type and value.

    Parameters
    ----------
    x : object
        The scalar parameter to validate.

    name : str
        The name of the parameter to be printed in error messages.

    target_type : type or tuple
        Acceptable data types for the parameter.

    min_val : float or int, default=None
        The minimum valid value the parameter can take. If None (default) it
        is implied that the parameter does not have a lower bound.

    max_val : float or int, default=None
        The maximum valid value the parameter can take. If None (default) it
        is implied that the parameter does not have an upper bound.

    include_boundaries : {"left", "right", "both", "neither"}, default="both"
        Whether the interval defined by `min_val` and `max_val` should include
        the boundaries. Possible choices are:

        - `"left"`: only `min_val` is included in the valid interval.
          It is equivalent to the interval `[ min_val, max_val )`.
        - `"right"`: only `max_val` is included in the valid interval.
          It is equivalent to the interval `( min_val, max_val ]`.
        - `"both"`: `min_val` and `max_val` are included in the valid interval.
          It is equivalent to the interval `[ min_val, max_val ]`.
        - `"neither"`: neither `min_val` nor `max_val` are included in the
          valid interval. It is equivalent to the interval `( min_val, max_val )`.

    Returns
    -------
    x : numbers.Number
        The validated number.

    Raises
    ------
    TypeError
        If the parameter's type does not match the desired type.

    ValueError
        If the parameter's value violates the given bounds.
        If `min_val`, `max_val` and `include_boundaries` are inconsistent.

    Examples
    --------
    >>> from sklearn.utils.validation import check_scalar
    >>> check_scalar(10, "x", int, min_val=1, max_val=20)
    10
    """

    def type_name(t):
        """Convert type into human readable string."""
        module, qualname = t.__module__, t.__qualname__
        if module == "builtins":
            return qualname
        # The abstract numeric types are shown under their everyday names.
        if t == numbers.Real:
            return "float"
        if t == numbers.Integral:
            return "int"
        return f"{module}.{qualname}"

    if not isinstance(x, target_type):
        if isinstance(target_type, tuple):
            joined_names = ", ".join(map(type_name, target_type))
            target_type_str = "{" + joined_names + "}"
        else:
            target_type_str = type_name(target_type)

        raise TypeError(
            f"{name} must be an instance of {target_type_str}, not"
            f" {type(x).__qualname__}."
        )

    valid_boundaries = ("left", "right", "both", "neither")
    if include_boundaries not in valid_boundaries:
        raise ValueError(
            f"Unknown value for `include_boundaries`: {include_boundaries!r}. "
            f"Possible values are: {valid_boundaries}."
        )

    # A one-sided inclusion only makes sense when that side has a bound.
    if include_boundaries == "right" and max_val is None:
        raise ValueError(
            "`include_boundaries`='right' without specifying explicitly `max_val` "
            "is inconsistent."
        )
    if include_boundaries == "left" and min_val is None:
        raise ValueError(
            "`include_boundaries`='left' without specifying explicitly `min_val` "
            "is inconsistent."
        )

    lower_is_closed = include_boundaries in ("left", "both")
    upper_is_closed = include_boundaries in ("right", "both")

    if min_val is not None:
        below_lower = operator.lt if lower_is_closed else operator.le
        if below_lower(x, min_val):
            lower_symbol = ">=" if lower_is_closed else ">"
            raise ValueError(f"{name} == {x}, must be {lower_symbol} {min_val}.")

    if max_val is not None:
        above_upper = operator.gt if upper_is_closed else operator.ge
        if above_upper(x, max_val):
            upper_symbol = "<=" if upper_is_closed else "<"
            raise ValueError(f"{name} == {x}, must be {upper_symbol} {max_val}.")

    return x
|
|
|
|
|
def _check_psd_eigenvalues(lambdas, enable_warnings=False):
    """Validate and clean the eigenvalues of a positive semidefinite matrix.

    The eigenvalues of a PSD matrix should be real and non-negative, but
    numerical error (user-provided Gram matrices, custom kernels, approximate
    decompositions such as randomized SVD, ...) can break this. This helper
    rejects spectra that are clearly not PSD and silently repairs the ones
    that are only slightly off.

    Three conditions are verified, in this order:

    - the imaginary parts must not exceed 1e-5 times the largest absolute real
      part, otherwise a ``ValueError`` is raised. Remaining imaginary parts are
      dropped.

    - the largest eigenvalue must not be negative, otherwise a ``ValueError``
      is raised.

    - no negative eigenvalue may be simultaneously below -1e-10 (-1e-6) and
      below -1e-5 (-5e-3) times the largest eigenvalue, otherwise a
      ``ValueError`` is raised. Remaining negative eigenvalues are set to
      zero.

    Lastly, positive eigenvalues smaller than 1e-12 (2e-7) times the largest
    eigenvalue are set to zero.

    The first figure of each pair is used when the eigenvalues are stored as
    ``float64``; the figure in parentheses is used for every other dtype.
    Each repair is reported through a ``PositiveSpectrumWarning`` when
    ``enable_warnings=True``.

    Parameters
    ----------
    lambdas : array-like of shape (n_eigenvalues,)
        Eigenvalues to check and fix.

    enable_warnings : bool, default=False
        If ``True``, emit a ``PositiveSpectrumWarning`` whenever imaginary
        parts, negative eigenvalues or very small positive eigenvalues are
        zeroed out. The repairs themselves happen regardless of this flag.

    Returns
    -------
    lambdas_fixed : ndarray of shape (n_eigenvalues,)
        A validated and repaired copy of the eigenvalues.

    Examples
    --------
    >>> from sklearn.utils.validation import _check_psd_eigenvalues
    >>> _check_psd_eigenvalues([1, 2])      # nominal case
    array([1, 2])
    >>> _check_psd_eigenvalues([5, 5j])     # significant imag part
    Traceback (most recent call last):
        ...
    ValueError: There are significant imaginary parts in eigenvalues (1
        of the maximum real part). Either the matrix is not PSD, or there was
        an issue while computing the eigendecomposition of the matrix.
    >>> _check_psd_eigenvalues([5, 5e-5j])  # insignificant imag part
    array([5., 0.])
    >>> _check_psd_eigenvalues([-5, -1])    # all negative
    Traceback (most recent call last):
        ...
    ValueError: All eigenvalues are negative (maximum is -1). Either the
        matrix is not PSD, or there was an issue while computing the
        eigendecomposition of the matrix.
    >>> _check_psd_eigenvalues([5, -1])     # significant negative
    Traceback (most recent call last):
        ...
    ValueError: There are significant negative eigenvalues (0.2 of the
        maximum positive). Either the matrix is not PSD, or there was an issue
        while computing the eigendecomposition of the matrix.
    >>> _check_psd_eigenvalues([5, -5e-5])  # insignificant negative
    array([5., 0.])
    >>> _check_psd_eigenvalues([5, 4e-12])  # bad conditioning (too small)
    array([5., 0.])

    """
    # np.array always copies, so the in-place repairs below never touch the
    # caller's data.
    lambdas = np.array(lambdas)

    # Tolerances: the strict set applies to float64 input only, every other
    # dtype gets the looser set.
    significant_imag_ratio = 1e-5
    if lambdas.dtype == np.float64:
        significant_neg_ratio = 1e-5
        significant_neg_value = 1e-10
        small_pos_ratio = 1e-12
    else:
        significant_neg_ratio = 5e-3
        significant_neg_value = 1e-6
        small_pos_ratio = 2e-7

    # Step 1: imaginary parts.
    all_real = np.isreal(lambdas).all()
    if not all_real:
        max_imag_abs = np.abs(np.imag(lambdas)).max()
        max_real_abs = np.abs(np.real(lambdas)).max()
        if max_imag_abs > significant_imag_ratio * max_real_abs:
            raise ValueError(
                "There are significant imaginary parts in eigenvalues (%g "
                "of the maximum real part). Either the matrix is not PSD, or "
                "there was an issue while computing the eigendecomposition "
                "of the matrix." % (max_imag_abs / max_real_abs)
            )

        # The imaginary parts are negligible: report them if requested.
        if enable_warnings:
            warnings.warn(
                "There are imaginary parts in eigenvalues (%g "
                "of the maximum real part). Either the matrix is not"
                " PSD, or there was an issue while computing the "
                "eigendecomposition of the matrix. Only the real "
                "parts will be kept." % (max_imag_abs / max_real_abs),
                PositiveSpectrumWarning,
            )

    # From here on only the real parts are considered.
    lambdas = np.real(lambdas)

    # Step 2: the spectrum must not be entirely negative.
    max_eig = lambdas.max()
    if max_eig < 0:
        raise ValueError(
            "All eigenvalues are negative (maximum is %g). "
            "Either the matrix is not PSD, or there was an "
            "issue while computing the eigendecomposition of "
            "the matrix." % max_eig
        )

    # Step 3: negative eigenvalues are fatal only when they are large both in
    # relative and in absolute terms.
    min_eig = lambdas.min()
    is_significantly_negative = (
        min_eig < -significant_neg_ratio * max_eig
        and min_eig < -significant_neg_value
    )
    if is_significantly_negative:
        raise ValueError(
            "There are significant negative eigenvalues (%g"
            " of the maximum positive). Either the matrix is "
            "not PSD, or there was an issue while computing "
            "the eigendecomposition of the matrix." % (-min_eig / max_eig)
        )
    if min_eig < 0:
        # Small negative values: clip them to zero.
        if enable_warnings:
            warnings.warn(
                "There are negative eigenvalues (%g of the "
                "maximum positive). Either the matrix is not "
                "PSD, or there was an issue while computing the"
                " eigendecomposition of the matrix. Negative "
                "eigenvalues will be replaced with 0." % (-min_eig / max_eig),
                PositiveSpectrumWarning,
            )
        lambdas[lambdas < 0] = 0

    # Step 4: positive eigenvalues that are tiny compared with the largest one
    # indicate bad conditioning and are zeroed.
    too_small_lambdas = (0 < lambdas) & (lambdas < small_pos_ratio * max_eig)
    if too_small_lambdas.any():
        if enable_warnings:
            warnings.warn(
                "Badly conditioned PSD matrix spectrum: the largest "
                "eigenvalue is more than %g times the smallest. "
                "Small eigenvalues will be replaced with 0."
                "" % (1 / small_pos_ratio),
                PositiveSpectrumWarning,
            )
        lambdas[too_small_lambdas] = 0

    return lambdas
|
|
|
|
|
def _check_sample_weight(
    sample_weight, X, dtype=None, copy=False, ensure_non_negative=False
):
    """Validate sample weights and return them as a 1D array.

    ``sample_weight=None`` yields an array of ones. Callers that need to
    distinguish "no weights" from "unit weights" should therefore guard the
    call:

        if sample_weight is not None:
            sample_weight = _check_sample_weight(...)

    Parameters
    ----------
    sample_weight : {ndarray, Number or None}, shape (n_samples,)
        Input sample weights. A scalar is broadcast to every sample.

    X : {ndarray, list, sparse matrix}
        Input data; only its number of samples is used.

    dtype : dtype, default=None
        dtype of the validated `sample_weight`. Any value other than
        `float32`, `float64` or `None` is replaced by `float64`. With `None`,
        array input is validated against `float64` and `float32`, and the
        arrays built for `None` / scalar input are created with `dtype=None`.

    copy : bool, default=False
        If True, a copy of `sample_weight` is requested during validation.

    ensure_non_negative : bool, default=False
        Whether to additionally check that the weights are non-negative.

        .. versionadded:: 1.0

    Returns
    -------
    sample_weight : ndarray of shape (n_samples,)
        Validated sample weights.

    Raises
    ------
    ValueError
        If array-like weights are not 1D or do not have `n_samples` entries.
    """
    n_samples = _num_samples(X)

    # Anything that is not float32 / float64 / None falls back to float64.
    if not (dtype is None or dtype in [np.float32, np.float64]):
        dtype = np.float64

    if sample_weight is None:
        sample_weight = np.ones(n_samples, dtype=dtype)
    elif isinstance(sample_weight, numbers.Number):
        sample_weight = np.full(n_samples, sample_weight, dtype=dtype)
    else:
        # No explicit dtype: let check_array choose among the float dtypes.
        accepted_dtype = [np.float64, np.float32] if dtype is None else dtype
        sample_weight = check_array(
            sample_weight,
            accept_sparse=False,
            ensure_2d=False,
            dtype=accepted_dtype,
            order="C",
            copy=copy,
            input_name="sample_weight",
        )
        if sample_weight.ndim != 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        expected_shape = (n_samples,)
        if sample_weight.shape != expected_shape:
            raise ValueError(
                f"sample_weight.shape == {sample_weight.shape}, "
                f"expected {expected_shape}!"
            )

    if ensure_non_negative:
        check_non_negative(sample_weight, "`sample_weight`")

    return sample_weight
|
|
|
|
|
def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9):
    """Check allclose for sparse and dense data.

    Both x and y need to be either sparse or dense, they
    can't be mixed.

    Parameters
    ----------
    x : {array-like, sparse matrix}
        First array to compare.

    y : {array-like, sparse matrix}
        Second array to compare.

    rtol : float, default=1e-7
        Relative tolerance; see numpy.allclose.

    atol : float, default=1e-9
        Absolute tolerance; see numpy.allclose. Note that the default here is
        more tolerant than the default for numpy.testing.assert_allclose, where
        atol=0.

    Returns
    -------
    close : bool
        True if both inputs are element-wise equal within the tolerances.
        Sparse inputs with different shapes or different sparsity structures
        are never considered close.

    Raises
    ------
    ValueError
        If exactly one of `x` and `y` is sparse.
    """
    x_is_sparse = sp.issparse(x)
    y_is_sparse = sp.issparse(y)

    if x_is_sparse != y_is_sparse:
        raise ValueError(
            "Can only compare two sparse matrices, not a sparse matrix and an array"
        )

    if not x_is_sparse:
        return np.allclose(x, y, rtol=rtol, atol=atol)

    # The CSR buffers compared below do not encode the number of columns, so
    # the shapes have to be compared explicitly; otherwise e.g. a (2, 3) and a
    # (2, 5) matrix holding the same entries would be reported as close.
    if x.shape != y.shape:
        return False

    x = x.tocsr()
    y = y.tocsr()
    # Merge duplicate entries so that both matrices are in canonical form
    # before their index and data buffers are compared.
    x.sum_duplicates()
    y.sum_duplicates()
    return (
        np.array_equal(x.indices, y.indices)
        and np.array_equal(x.indptr, y.indptr)
        and np.allclose(x.data, y.data, rtol=rtol, atol=atol)
    )
|
|
|
|
|
def _check_response_method(estimator, response_method):
    """Check if `response_method` is available in estimator and return it.

    .. versionadded:: 1.3

    Parameters
    ----------
    estimator : estimator instance
        Classifier or regressor to check.

    response_method : {"predict_proba", "predict_log_proba", "decision_function",
            "predict"} or list of such str
        Specifies the response method to use get prediction from an estimator
        (i.e. :term:`predict_proba`, :term:`predict_log_proba`,
        :term:`decision_function` or :term:`predict`). Possible choices are:

        - if `str`, it corresponds to the name to the method to return;
        - if a list of `str`, it provides the method names in order of
          preference. The method returned corresponds to the first method in
          the list and which is implemented by `estimator`.

    Returns
    -------
    prediction_method : callable
        Prediction method of estimator.

    Raises
    ------
    AttributeError
        If `response_method` is not available in `estimator`. This includes
        the case where an empty list of method names is passed.
    """
    if isinstance(response_method, str):
        list_methods = [response_method]
    else:
        # Materialize once so the names can be both searched and reported.
        list_methods = list(response_method)

    # Return the first method the estimator exposes, in order of preference.
    # Stopping at the first hit avoids resolving attributes that are not
    # needed, and an empty list simply falls through to the error below.
    for method_name in list_methods:
        prediction_method = getattr(estimator, method_name, None)
        if prediction_method is not None:
            return prediction_method

    raise AttributeError(
        f"{estimator.__class__.__name__} has none of the following attributes: "
        f"{', '.join(list_methods)}."
    )
|
|
|
|
|
def _check_method_params(X, params, indices=None):
    """Validate the extra parameters passed to a method such as `fit`.

    Values that are array-like or sparse and that have the same number of
    samples as `X` are made indexable and subset with `indices`. Every other
    value is forwarded unchanged.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data array.

    params : dict
        Dictionary containing the parameters passed to the method.

    indices : array-like of shape (n_samples,), default=None
        Indices to be selected if the parameter has the same size as `X`.

    Returns
    -------
    method_params_validated : dict
        Validated parameters. We ensure that the values support indexing.
    """
    from . import _safe_indexing

    validated = {}
    for key, value in params.items():
        # The sample count is only queried for containers, and X is only
        # inspected once such a container has been found.
        is_container = _is_arraylike(value) or sp.issparse(value)
        if is_container and _num_samples(value) == _num_samples(X):
            # Per-sample parameter: make it indexable, then subset it.
            validated[key] = _safe_indexing(_make_indexable(value), indices)
        else:
            # Scalar, or container of unrelated length: pass through as is.
            validated[key] = value

    return validated
|
|
|
|
|
def _is_pandas_df_or_series(X):
    """Return True if the X is a pandas dataframe or series.

    pandas is looked up in ``sys.modules`` rather than imported: if it has
    not been imported yet, `X` cannot be one of its containers.
    """
    pd = sys.modules.get("pandas")
    if pd is None:
        # pandas was never imported, or its entry is the ``None`` placeholder
        # that blocks the import; either way X is not a pandas object.
        return False
    return isinstance(X, (pd.DataFrame, pd.Series))
|
|
|
|
|
def _is_pandas_df(X):
    """Return True if the X is a pandas dataframe.

    pandas is looked up in ``sys.modules`` rather than imported: if it has
    not been imported yet, `X` cannot be a pandas dataframe.
    """
    pd = sys.modules.get("pandas")
    if pd is None:
        # pandas was never imported, or its entry is the ``None`` placeholder
        # that blocks the import; either way X is not a pandas object.
        return False
    return isinstance(X, pd.DataFrame)
|
|
|
|
|
def _is_polars_df_or_series(X):
    """Return True if the X is a polars dataframe or series.

    polars is looked up in ``sys.modules`` rather than imported: if it has
    not been imported yet, `X` cannot be one of its containers.
    """
    pl = sys.modules.get("polars")
    if pl is None:
        # polars was never imported, or its entry is the ``None`` placeholder
        # that blocks the import; either way X is not a polars object.
        return False
    return isinstance(X, (pl.DataFrame, pl.Series))
|
|
|
|
|
def _is_polars_df(X):
    """Return True if the X is a polars dataframe.

    polars is looked up in ``sys.modules`` rather than imported: if it has
    not been imported yet, `X` cannot be a polars dataframe.
    """
    pl = sys.modules.get("polars")
    if pl is None:
        # polars was never imported, or its entry is the ``None`` placeholder
        # that blocks the import; either way X is not a polars object.
        return False
    return isinstance(X, pl.DataFrame)
|
|
|
|
|
def _get_feature_names(X):
    """Extract the feature names carried by `X`, if any.

    Support for other array containers should place its implementation here.

    Parameters
    ----------
    X : {ndarray, dataframe} of shape (n_samples, n_features)
        Array container to extract feature names.

        - pandas dataframe : the columns are taken as feature names.
        - object exposing ``__dataframe__`` : the column names reported by the
          dataframe interchange object are taken as feature names.
        - All other array containers will return `None`.

    Returns
    -------
    names : ndarray or None
        Feature names of `X` when every name is a string. `None` when `X` is
        not a recognized container, has no columns, or has no string name at
        all.

    Raises
    ------
    TypeError
        If the names mix strings with other types.
    """
    # Collect the raw column labels from the supported containers.
    if _is_pandas_df(X):
        column_labels = X.columns
    elif hasattr(X, "__dataframe__"):
        column_labels = list(X.__dataframe__().column_names())
    else:
        return None

    names = np.asarray(column_labels, dtype=object)
    if len(names) == 0:
        return None

    # One entry per distinct label type, identified by its qualified name.
    type_names = sorted(t.__qualname__ for t in {type(label) for label in names})

    # Strings mixed with anything else are ambiguous and rejected.
    if len(type_names) > 1 and "str" in type_names:
        raise TypeError(
            "Feature names are only supported if all input features have string names, "
            f"but your input has {type_names} as feature name / column name types. "
            "If you want feature names to be stored and validated, you must convert "
            "them all to strings, by using X.columns = X.columns.astype(str) for "
            "example. Otherwise you can remove feature / column names from your input "
            "data, or convert them all to a non-string data type."
        )

    # Only all-string names are reported; anything else counts as "no names".
    if type_names == ["str"]:
        return names
    return None
|
|
|
|
|
def _check_feature_names_in(estimator, input_features=None, *, generate_names=True):
    """Validate `input_features` against the estimator, generating names if needed.

    Commonly used in :term:`get_feature_names_out`.

    Parameters
    ----------
    estimator : estimator instance
        Estimator whose `feature_names_in_` and `n_features_in_` attributes,
        when present, are used for validation and name generation.

    input_features : array-like of str or None, default=None
        Input features.

        - If `input_features` is `None`, then `feature_names_in_` is
          used as feature names in. If `feature_names_in_` is not defined,
          then the following input feature names are generated:
          `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
        - If `input_features` is an array-like, then `input_features` must
          match `feature_names_in_` if `feature_names_in_` is defined.

    generate_names : bool, default=True
        Whether to generate names when `input_features` is `None` and
        `estimator.feature_names_in_` is not defined. This is useful for
        transformers that validate `input_features` but do not require them in
        :term:`get_feature_names_out` e.g. `PCA`.

    Returns
    -------
    feature_names_in : ndarray of str or `None`
        Feature names in.

    Raises
    ------
    ValueError
        If `input_features` disagrees with `feature_names_in_` or with
        `n_features_in_`, or if names must be generated but the estimator has
        no `n_features_in_`.
    """
    fitted_names = getattr(estimator, "feature_names_in_", None)
    n_features = getattr(estimator, "n_features_in_", None)

    if input_features is None:
        # Nothing supplied: fall back on what the estimator saw during fit.
        if fitted_names is not None:
            return fitted_names
        if not generate_names:
            return None
        if n_features is None:
            raise ValueError("Unable to generate feature names without n_features_in_")
        return np.asarray([f"x{i}" for i in range(n_features)], dtype=object)

    # Explicit names: they must be consistent with the fitted estimator.
    input_features = np.asarray(input_features, dtype=object)
    names_conflict = fitted_names is not None and not np.array_equal(
        fitted_names, input_features
    )
    if names_conflict:
        raise ValueError("input_features is not equal to feature_names_in_")

    if n_features is not None and len(input_features) != n_features:
        raise ValueError(
            "input_features should have length equal to number of "
            f"features ({n_features}), got {len(input_features)}"
        )
    return input_features
|
|
|
|
|
def _generate_get_feature_names_out(estimator, n_features_out, input_features=None):
    """Generate output feature names prefixed with the lowercased estimator name.

    `input_features` is validated but otherwise unused. This helper is meant
    for estimators that name their outputs from `n_features_out` alone,
    e.g. PCA.

    Parameters
    ----------
    estimator : estimator instance
        Estimator producing output feature names.

    n_features_out : int
        Number of feature names out.

    input_features : array-like of str or None, default=None
        Only used to validate feature names with `estimator.feature_names_in_`.

    Returns
    -------
    feature_names_out : ndarray of str
        Names of the form ``"<estimator class name in lowercase><index>"``,
        one per output feature.
    """
    # Validation only: the returned input names are deliberately discarded.
    _check_feature_names_in(estimator, input_features, generate_names=False)

    prefix = estimator.__class__.__name__.lower()
    names_out = [f"{prefix}{index}" for index in range(n_features_out)]
    return np.asarray(names_out, dtype=object)
|
|
|
|
|
def _check_monotonic_cst(estimator, monotonic_cst=None):
    """Validate monotonic constraints and return them as an int8 array.

    This helper function should be used in the `fit` method of an estimator
    that supports monotonic constraints and called after the estimator has
    introspected input data to set the `n_features_in_` and optionally the
    `feature_names_in_` attributes.

    .. versionadded:: 1.2

    Parameters
    ----------
    estimator : estimator instance
        Fitted-in-progress estimator providing `n_features_in_` and, for dict
        constraints, `feature_names_in_`.

    monotonic_cst : array-like of int, dict of str or None, default=None
        Monotonic constraints for the features.

        - If array-like, it must contain only -1, 0 or 1 and have one entry
          per feature. A value of -1 requires the corresponding feature to be
          monotonically decreasing.
        - If dict, the keys must be feature names occurring in
          `estimator.feature_names_in_` and the values must be -1, 0 or 1.
          Features that are not listed get the constraint 0.
        - If None, an array of 0s is allocated.

    Returns
    -------
    monotonic_cst : ndarray of int
        Monotonic constraints for each feature.

    Raises
    ------
    ValueError
        If a constraint value is not -1, 0 or 1, if the array-like does not
        match `n_features_in_`, if a dict is given but the estimator has no
        `feature_names_in_`, or if the dict contains unknown feature names.
    """
    allowed_values = [-1, 0, 1]
    given_as_dict = isinstance(monotonic_cst, dict)

    # Array-like constraints: validate the values, then the length.
    if monotonic_cst is not None and not given_as_dict:
        invalid_values = np.setdiff1d(monotonic_cst, allowed_values)
        if invalid_values.shape[0]:
            raise ValueError(
                "monotonic_cst must be an array-like of -1, 0 or 1. Observed "
                f"values: {invalid_values.tolist()}."
            )

        constraints = np.asarray(monotonic_cst, dtype=np.int8)
        if constraints.shape[0] != estimator.n_features_in_:
            raise ValueError(
                f"monotonic_cst has shape {constraints.shape} but the input data "
                f"X has {estimator.n_features_in_} features."
            )
        return constraints

    # None or dict: start from "no constraint" on every feature.
    constraints = np.full(
        shape=estimator.n_features_in_,
        fill_value=0,
        dtype=np.int8,
    )
    if not given_as_dict:
        return constraints

    if not hasattr(estimator, "feature_names_in_"):
        raise ValueError(
            f"{estimator.__class__.__name__} was not fitted on data "
            "with feature names. Pass monotonic_cst as an integer "
            "array instead."
        )

    unknown_names = sorted(set(monotonic_cst) - set(estimator.feature_names_in_))
    if unknown_names:
        n_unknown = len(unknown_names)
        # Keep the error message short when many names are unknown.
        if n_unknown > 5:
            unknown_names = unknown_names[:5] + ["..."]
        raise ValueError(
            f"monotonic_cst contains {n_unknown} unexpected feature "
            f"names: {unknown_names}."
        )

    # Place each named constraint at the position of its feature.
    for position, feature_name in enumerate(estimator.feature_names_in_):
        if feature_name not in monotonic_cst:
            continue
        cst = monotonic_cst[feature_name]
        if cst not in allowed_values:
            raise ValueError(
                f"monotonic_cst['{feature_name}'] must be either "
                f"-1, 0 or 1. Got {cst!r}."
            )
        constraints[position] = cst

    return constraints
|
|
|
|
|
def _check_pos_label_consistency(pos_label, y_true):
    """Return `pos_label`, inferring it from `y_true` when it is None.

    In binary classification, `pos_label` defaults to 1 when the labels are
    in the set {-1, 1} or {0, 1} (or a single-class subset of them).
    Otherwise the caller is asked to specify `pos_label` explicitly.

    Parameters
    ----------
    pos_label : int, float, bool, str or None
        The positive label. A value other than None is returned unchanged.
    y_true : ndarray of shape (n_samples,)
        The target vector.

    Returns
    -------
    pos_label : int, float, bool or str
        The given `pos_label`, or 1 when it was None and could be inferred.

    Raises
    ------
    ValueError
        If `pos_label` is None and `y_true` has an object or string dtype, or
        its labels are not in {-1, 1} or {0, 1}.
    """
    if pos_label is not None:
        return pos_label

    classes = np.unique(y_true)

    # Label sets for which 1 is unambiguously the positive class.
    inferable_label_sets = ([0, 1], [-1, 1], [0], [-1], [1])
    # Object / unicode / bytes labels never allow inference.
    can_infer = classes.dtype.kind not in "OUS" and any(
        np.array_equal(classes, label_set) for label_set in inferable_label_sets
    )
    if not can_infer:
        classes_repr = ", ".join([repr(c) for c in classes.tolist()])
        raise ValueError(
            f"y_true takes value in {{{classes_repr}}} and pos_label is not "
            "specified: either make y_true take value in {0, 1} or "
            "{-1, 1} or pass pos_label explicitly."
        )

    return 1
|
|
|
|
|
def _to_object_array(sequence):
    """Convert sequence to a 1-D NumPy array of object dtype.

    The plain ``numpy.array`` constructor is ambiguous for this purpose: a
    ragged input gives a 1-D object array, whereas a list of equal-length
    arrays gives a 2-D array. Filling a pre-allocated 1-D object array
    removes the ambiguity.

    Parameters
    ----------
    sequence : array-like of shape (n_elements,)
        The sequence to be converted. It must support ``len``.

    Returns
    -------
    out : ndarray of shape (n_elements,), dtype=object
        The converted sequence into a 1-D NumPy array of object dtype.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.validation import _to_object_array
    >>> _to_object_array([np.array([0]), np.array([1])])
    array([array([0]), array([1])], dtype=object)
    >>> _to_object_array([np.array([0]), np.array([1, 2])])
    array([array([0]), array([1, 2])], dtype=object)
    >>> _to_object_array([np.array([0, 1]), np.array([2, 3])])
    array([array([0, 1]), array([2, 3])], dtype=object)
    """
    n_elements = len(sequence)
    # Allocate the 1-D container first so that its shape is fixed before the
    # elements are stored.
    as_objects = np.empty(n_elements, dtype=object)
    as_objects[:] = sequence
    return as_objects
|
|
|
|
|
def _check_feature_names(estimator, X, *, reset):
    """Set or check the `feature_names_in_` attribute of an estimator.

    .. versionadded:: 1.0

    .. versionchanged:: 1.6
        Moved from :class:`~sklearn.base.BaseEstimator` to
        :mod:`sklearn.utils.validation`.

    Parameters
    ----------
    estimator : estimator instance
        The estimator to validate the input for.

    X : {ndarray, dataframe} of shape (n_samples, n_features)
        The input samples.

    reset : bool
        Whether to reset the `feature_names_in_` attribute.
        If True, the attribute is set from `X`, or removed when `X` carries no
        feature names. If False, the names of `X` are compared with the ones
        stored when reset was last True.

        .. note::
           It is recommended to call `reset=True` in `fit` and in the first
           call to `partial_fit`. All other methods that validate `X`
           should set `reset=False`.

    Raises
    ------
    ValueError
        If `reset=False` and both the estimator and `X` have feature names
        that differ in content or order.
    """
    if reset:
        new_names = _get_feature_names(X)
        if new_names is not None:
            estimator.feature_names_in_ = new_names
        elif hasattr(estimator, "feature_names_in_"):
            # X has no names: drop the ones left over from an earlier fit so
            # that later checks do not compare against stale names.
            del estimator.feature_names_in_
        return

    fitted_names = getattr(estimator, "feature_names_in_", None)
    incoming_names = _get_feature_names(X)

    if fitted_names is None:
        # Names on only one side cannot be compared: warn and stop. With no
        # names on either side there is nothing to do at all.
        if incoming_names is not None:
            warnings.warn(
                f"X has feature names, but {estimator.__class__.__name__} was fitted "
                "without feature names"
            )
        return

    if incoming_names is None:
        warnings.warn(
            "X does not have valid feature names, but"
            f" {estimator.__class__.__name__} was fitted with feature names"
        )
        return

    # Both sides have names: they must agree in length, content and order.
    names_agree = len(fitted_names) == len(incoming_names) and not np.any(
        fitted_names != incoming_names
    )
    if names_agree:
        return

    def format_names(names, limit=5):
        """Render `names` as a bullet list, eliding entries past `limit`."""
        bullets = [f"- {name}\n" for name in names[:limit]]
        if len(names) > limit:
            bullets.append("- ...\n")
        return "".join(bullets)

    fitted_set = set(fitted_names)
    incoming_set = set(incoming_names)
    unexpected_names = sorted(incoming_set - fitted_set)
    missing_names = sorted(fitted_set - incoming_set)

    parts = ["The feature names should match those that were passed during fit.\n"]
    if unexpected_names:
        parts.append("Feature names unseen at fit time:\n")
        parts.append(format_names(unexpected_names))
    if missing_names:
        parts.append("Feature names seen at fit time, yet now missing:\n")
        parts.append(format_names(missing_names))
    if not missing_names and not unexpected_names:
        # Same set of names on both sides, so only the order can differ.
        parts.append("Feature names must be in the same order as they were in fit.\n")

    raise ValueError("".join(parts))
|
|
|
|
|
def _check_n_features(estimator, X, reset):
    """Set the `n_features_in_` attribute, or check against it on an estimator.

    .. versionchanged:: 1.6
        Moved from :class:`~sklearn.base.BaseEstimator` to
        :mod:`~sklearn.utils.validation`.

    Parameters
    ----------
    estimator : estimator instance
        The estimator to validate the input for.

    X : {ndarray, sparse matrix} of shape (n_samples, n_features)
        The input samples.

    reset : bool
        If True, the `n_features_in_` attribute is set to the number of
        features of `X`.
        If False and the attribute exists, then check that it is equal to
        the number of features of `X`. If False and the attribute does *not*
        exist, then the check is skipped.

        .. note::
           It is recommended to call reset=True in `fit` and in the first
           call to `partial_fit`. All other methods that validate `X`
           should set `reset=False`.

    Raises
    ------
    ValueError
        If `reset=False`, the estimator has `n_features_in_`, and `X` either
        has no countable features or a different number of features.
    """
    try:
        n_features = _num_features(X)
    except TypeError as exc:
        # The number of features of X cannot be determined. This is only an
        # error when there is a fitted value to compare against.
        must_match = not reset and hasattr(estimator, "n_features_in_")
        if not must_match:
            return
        raise ValueError(
            "X does not contain any features, but "
            f"{estimator.__class__.__name__} is expecting "
            f"{estimator.n_features_in_} features"
        ) from exc

    if reset:
        estimator.n_features_in_ = n_features
    elif (
        hasattr(estimator, "n_features_in_")
        and n_features != estimator.n_features_in_
    ):
        # An estimator without n_features_in_ is deliberately left unchecked.
        raise ValueError(
            f"X has {n_features} features, but {estimator.__class__.__name__} "
            f"is expecting {estimator.n_features_in_} features as input."
        )
|
|
|
|
|
def validate_data(
    _estimator,
    /,
    X="no_validation",
    y="no_validation",
    reset=True,
    validate_separately=False,
    skip_check_array=False,
    **check_params,
):
    """Validate `X` and/or `y` and set or check the estimator's input metadata.

    Estimators that need input validation should call this helper. It mutates
    the estimator: with `reset=True` the `n_features_in_` and
    `feature_names_in_` attributes are set from the input; otherwise the input
    is checked for consistency with them.

    .. versionadded:: 1.6

    Parameters
    ----------
    _estimator : estimator instance
        The estimator to validate the input for.

    X : {array-like, sparse matrix, dataframe} of shape \
            (n_samples, n_features), default='no_validation'
        The input samples.
        If `'no_validation'`, no validation is performed on `X`. This is
        useful for meta-estimators which can delegate input validation to
        their underlying estimator(s). In that case `y` must be passed and
        the only accepted `check_params` are `multi_output` and
        `y_numeric`.

    y : array-like of shape (n_samples,), default='no_validation'
        The targets.

        - If `None`, :func:`~sklearn.utils.check_array` is called on `X`. If
          the estimator's `requires_y` tag is True, then an error will be
          raised.
        - If `'no_validation'`, :func:`~sklearn.utils.check_array` is called
          on `X` and the estimator's `requires_y` tag is ignored. This is a
          default placeholder and is never meant to be explicitly set. In
          that case `X` must be passed.
        - Otherwise, only `y` with `_check_y` or both `X` and `y` are checked
          with either :func:`~sklearn.utils.check_array` or
          :func:`~sklearn.utils.check_X_y` depending on
          `validate_separately`.

    reset : bool, default=True
        Whether to reset the `n_features_in_` attribute.
        If False, the input will be checked for consistency with data
        provided when reset was last True.

        .. note::

           It is recommended to call `reset=True` in `fit` and in the first
           call to `partial_fit`. All other methods that validate `X`
           should set `reset=False`.

    validate_separately : False or tuple of dicts, default=False
        Only used if `y` is not `None`.
        If `False`, call :func:`~sklearn.utils.check_X_y`. Else, it must be a
        tuple of kwargs to be used for calling
        :func:`~sklearn.utils.check_array` on `X` and `y` respectively.

        `estimator=self` is automatically added to these dicts to generate
        more informative error message in case of invalid input data.

    skip_check_array : bool, default=False
        If `True`, `X` and `y` are unchanged and only `feature_names_in_` and
        `n_features_in_` are checked. Otherwise,
        :func:`~sklearn.utils.check_array` is called on `X` and `y`.

    **check_params : kwargs
        Parameters passed to :func:`~sklearn.utils.check_array` or
        :func:`~sklearn.utils.check_X_y`. Ignored if validate_separately
        is not False.

        `estimator=self` is automatically added to these params to generate
        more informative error message in case of invalid input data.

    Returns
    -------
    out : {ndarray, sparse matrix} or tuple of these
        The validated input. A tuple is returned if both `X` and `y` are
        validated.

    Raises
    ------
    ValueError
        If `y` is `None` while the estimator's tags mark the target as
        required, or if neither `X` nor `y` is to be validated.
    """
    _check_feature_names(_estimator, X, reset=reset)

    estimator_tags = get_tags(_estimator)
    if y is None and estimator_tags.target_tags.required:
        raise ValueError(
            f"This {_estimator.__class__.__name__} estimator "
            "requires y to be passed, but the target y is None."
        )

    # The isinstance() guard comes first so that array-like inputs are never
    # compared against the sentinel string.
    skip_X = isinstance(X, str) and X == "no_validation"
    skip_y = y is None or (isinstance(y, str) and y == "no_validation")

    if skip_X and skip_y:
        raise ValueError("Validation should be done on X, y or both.")

    estimator_param = {"estimator": _estimator}
    check_params = {**estimator_param, **check_params}

    if skip_y:
        # Only X is provided (both being skipped was rejected above).
        if skip_check_array:
            out = X
        else:
            out = check_array(X, input_name="X", **check_params)
    elif skip_X:
        # Only y is provided.
        if skip_check_array:
            out = y
        else:
            out = _check_y(y, **check_params)
    elif skip_check_array:
        # Both provided, returned untouched.
        out = X, y
    elif validate_separately:
        # X and y each get their own check_array() call with their own
        # keyword arguments instead of a single check_X_y() call.
        X_kwargs, y_kwargs = validate_separately
        # An explicit "estimator" entry supplied by the caller takes
        # precedence over the default one; the caller's dicts are not mutated.
        X = check_array(X, input_name="X", **{**estimator_param, **X_kwargs})
        y = check_array(y, input_name="y", **{**estimator_param, **y_kwargs})
        out = X, y
    else:
        X, y = check_X_y(X, y, **check_params)
        out = X, y

    # The feature count is only meaningful when X was provided and is expected
    # to be 2D.
    if not skip_X and check_params.get("ensure_2d", True):
        _check_n_features(_estimator, X, reset=reset)

    return out
|
|