|
""" |
|
The :mod:`sklearn.model_selection._search` module includes utilities to
fine-tune the parameters of an estimator.
|
""" |
|
|
|
|
|
|
|
|
|
import numbers |
|
import operator |
|
import time |
|
import warnings |
|
from abc import ABCMeta, abstractmethod |
|
from collections import defaultdict |
|
from collections.abc import Iterable, Mapping, Sequence |
|
from copy import deepcopy |
|
from functools import partial, reduce |
|
from itertools import product |
|
|
|
import numpy as np |
|
from numpy.ma import MaskedArray |
|
from scipy.stats import rankdata |
|
|
|
from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier |
|
from ..exceptions import NotFittedError |
|
from ..metrics import check_scoring |
|
from ..metrics._scorer import ( |
|
_check_multimetric_scoring, |
|
_MultimetricScorer, |
|
get_scorer_names, |
|
) |
|
from ..utils import Bunch, check_random_state |
|
from ..utils._estimator_html_repr import _VisualBlock |
|
from ..utils._param_validation import HasMethods, Interval, StrOptions |
|
from ..utils._tags import get_tags |
|
from ..utils.deprecation import _deprecate_Xt_in_inverse_transform |
|
from ..utils.metadata_routing import ( |
|
MetadataRouter, |
|
MethodMapping, |
|
_raise_for_params, |
|
_routing_enabled, |
|
process_routing, |
|
) |
|
from ..utils.metaestimators import available_if |
|
from ..utils.parallel import Parallel, delayed |
|
from ..utils.random import sample_without_replacement |
|
from ..utils.validation import _check_method_params, check_is_fitted, indexable |
|
from ._split import check_cv |
|
from ._validation import ( |
|
_aggregate_score_dicts, |
|
_fit_and_score, |
|
_insert_error_scores, |
|
_normalize_score_results, |
|
_warn_or_raise_about_fit_failures, |
|
) |
|
|
|
__all__ = ["GridSearchCV", "ParameterGrid", "ParameterSampler", "RandomizedSearchCV"] |
|
|
|
|
|
class ParameterGrid: |
|
"""Grid of parameters with a discrete number of values for each. |
|
|
|
Can be used to iterate over parameter value combinations with the |
|
Python built-in function iter. |
|
The order of the generated parameter combinations is deterministic. |
|
|
|
Read more in the :ref:`User Guide <grid_search>`. |
|
|
|
Parameters |
|
---------- |
|
param_grid : dict of str to sequence, or sequence of such |
|
The parameter grid to explore, as a dictionary mapping estimator |
|
parameters to sequences of allowed values. |
|
|
|
An empty dict signifies default parameters. |
|
|
|
A sequence of dicts signifies a sequence of grids to search, and is |
|
useful to avoid exploring parameter combinations that make no sense |
|
or have no effect. See the examples below. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.model_selection import ParameterGrid |
|
>>> param_grid = {'a': [1, 2], 'b': [True, False]} |
|
>>> list(ParameterGrid(param_grid)) == ( |
|
... [{'a': 1, 'b': True}, {'a': 1, 'b': False}, |
|
... {'a': 2, 'b': True}, {'a': 2, 'b': False}]) |
|
True |
|
|
|
>>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}] |
|
>>> list(ParameterGrid(grid)) == [{'kernel': 'linear'}, |
|
... {'kernel': 'rbf', 'gamma': 1}, |
|
... {'kernel': 'rbf', 'gamma': 10}] |
|
True |
|
>>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1} |
|
True |
|
|
|
See Also |
|
-------- |
|
GridSearchCV : Uses :class:`ParameterGrid` to perform a full parallelized |
|
parameter search. |
|
""" |
|
|
|
def __init__(self, param_grid): |
|
if not isinstance(param_grid, (Mapping, Iterable)): |
|
raise TypeError( |
|
f"Parameter grid should be a dict or a list, got: {param_grid!r} of" |
|
f" type {type(param_grid).__name__}" |
|
) |
|
|
|
if isinstance(param_grid, Mapping): |
|
|
|
|
|
            # wrap dictionary in a singleton list to support either dict
            # or list of dicts
            param_grid = [param_grid]
|
|
|
|
|
for grid in param_grid: |
|
if not isinstance(grid, dict): |
|
raise TypeError(f"Parameter grid is not a dict ({grid!r})") |
|
for key, value in grid.items(): |
|
if isinstance(value, np.ndarray) and value.ndim > 1: |
|
raise ValueError( |
|
f"Parameter array for {key!r} should be one-dimensional, got:" |
|
f" {value!r} with shape {value.shape}" |
|
) |
|
if isinstance(value, str) or not isinstance( |
|
value, (np.ndarray, Sequence) |
|
): |
|
raise TypeError( |
|
f"Parameter grid for parameter {key!r} needs to be a list or a" |
|
f" numpy array, but got {value!r} (of type " |
|
f"{type(value).__name__}) instead. Single values " |
|
"need to be wrapped in a list with one element." |
|
) |
|
if len(value) == 0: |
|
raise ValueError( |
|
f"Parameter grid for parameter {key!r} need " |
|
f"to be a non-empty sequence, got: {value!r}" |
|
) |
|
|
|
self.param_grid = param_grid |
|
|
|
def __iter__(self): |
|
"""Iterate over the points in the grid. |
|
|
|
Returns |
|
------- |
|
params : iterator over dict of str to any |
|
Yields dictionaries mapping each estimator parameter to one of its |
|
allowed values. |
|
""" |
|
for p in self.param_grid: |
|
|
|
items = sorted(p.items()) |
|
if not items: |
|
yield {} |
|
else: |
|
keys, values = zip(*items) |
|
for v in product(*values): |
|
params = dict(zip(keys, v)) |
|
yield params |
|
|
|
def __len__(self): |
|
"""Number of points on the grid.""" |
|
|
|
        # Product function that can handle iterables (np.prod can't).
        product = partial(reduce, operator.mul)
|
return sum( |
|
product(len(v) for v in p.values()) if p else 1 for p in self.param_grid |
|
) |
|
|
|
def __getitem__(self, ind): |
|
"""Get the parameters that would be ``ind``th in iteration |
|
|
|
Parameters |
|
---------- |
|
ind : int |
|
The iteration index |
|
|
|
Returns |
|
------- |
|
params : dict of str to any |
|
Equal to list(self)[ind] |
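
        Examples
        --------
        A minimal indexing sketch, consistent with the iteration order:

        >>> from sklearn.model_selection import ParameterGrid
        >>> pg = ParameterGrid({'a': [1, 2], 'b': [True, False]})
        >>> pg[2] == list(pg)[2] == {'a': 2, 'b': True}
        True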
|
""" |
|
|
|
|
|
for sub_grid in self.param_grid: |
|
|
|
if not sub_grid: |
|
if ind == 0: |
|
return {} |
|
else: |
|
ind -= 1 |
|
continue |
|
|
|
|
|
keys, values_lists = zip(*sorted(sub_grid.items())[::-1]) |
|
sizes = [len(v_list) for v_list in values_lists] |
|
total = np.prod(sizes) |
|
|
|
if ind >= total: |
|
|
|
ind -= total |
|
else: |
|
out = {} |
|
for key, v_list, n in zip(keys, values_lists, sizes): |
|
ind, offset = divmod(ind, n) |
|
out[key] = v_list[offset] |
|
return out |
|
|
|
raise IndexError("ParameterGrid index out of range") |
|
|
|
|
|
class ParameterSampler: |
|
"""Generator on parameters sampled from given distributions. |
|
|
|
    Non-deterministic iterable over random candidate combinations for
    hyperparameter search. If all parameters are presented as a list,
|
sampling without replacement is performed. If at least one parameter |
|
is given as a distribution, sampling with replacement is used. |
|
It is highly recommended to use continuous distributions for continuous |
|
parameters. |
|
|
|
Read more in the :ref:`User Guide <grid_search>`. |
|
|
|
Parameters |
|
---------- |
|
param_distributions : dict |
|
        Dictionary with parameter names (`str`) as keys and distributions
|
or lists of parameters to try. Distributions must provide a ``rvs`` |
|
method for sampling (such as those from scipy.stats.distributions). |
|
If a list is given, it is sampled uniformly. |
|
If a list of dicts is given, first a dict is sampled uniformly, and |
|
then a parameter is sampled using that dict as above. |
|
|
|
n_iter : int |
|
Number of parameter settings that are produced. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Pseudo random number generator state used for random uniform sampling |
|
from lists of possible values instead of scipy.stats distributions. |
|
Pass an int for reproducible output across multiple |
|
function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
Returns |
|
------- |
|
params : dict of str to any |
|
**Yields** dictionaries mapping each estimator parameter to |
|
        a sampled value.
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.model_selection import ParameterSampler |
|
>>> from scipy.stats.distributions import expon |
|
>>> import numpy as np |
|
>>> rng = np.random.RandomState(0) |
|
>>> param_grid = {'a':[1, 2], 'b': expon()} |
|
>>> param_list = list(ParameterSampler(param_grid, n_iter=4, |
|
... random_state=rng)) |
|
>>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items()) |
|
... for d in param_list] |
|
>>> rounded_list == [{'b': 0.89856, 'a': 1}, |
|
... {'b': 0.923223, 'a': 1}, |
|
... {'b': 1.878964, 'a': 2}, |
|
... {'b': 1.038159, 'a': 2}] |
|
True |
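
    When all parameters are given as lists, sampling is done without
    replacement, so at most the full grid is produced:

    >>> len(ParameterSampler({'a': [1, 2]}, n_iter=5, random_state=0))
    2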
|
""" |
|
|
|
def __init__(self, param_distributions, n_iter, *, random_state=None): |
|
if not isinstance(param_distributions, (Mapping, Iterable)): |
|
raise TypeError( |
|
"Parameter distribution is not a dict or a list," |
|
f" got: {param_distributions!r} of type " |
|
f"{type(param_distributions).__name__}" |
|
) |
|
|
|
if isinstance(param_distributions, Mapping): |
|
|
|
|
|
            # wrap dictionary in a singleton list to support either dict
            # or list of dicts
            param_distributions = [param_distributions]
|
|
|
for dist in param_distributions: |
|
if not isinstance(dist, dict): |
|
raise TypeError( |
|
"Parameter distribution is not a dict ({!r})".format(dist) |
|
) |
|
for key in dist: |
|
if not isinstance(dist[key], Iterable) and not hasattr( |
|
dist[key], "rvs" |
|
): |
|
raise TypeError( |
|
f"Parameter grid for parameter {key!r} is not iterable " |
|
f"or a distribution (value={dist[key]})" |
|
) |
|
self.n_iter = n_iter |
|
self.random_state = random_state |
|
self.param_distributions = param_distributions |
|
|
|
def _is_all_lists(self): |
|
return all( |
|
all(not hasattr(v, "rvs") for v in dist.values()) |
|
for dist in self.param_distributions |
|
) |
|
|
|
def __iter__(self): |
|
rng = check_random_state(self.random_state) |
|
|
|
|
|
|
|
        # if all distributions are given as lists, we sample without
        # replacement
        if self._is_all_lists():
            # look up sampled parameter settings in the parameter grid
            param_grid = ParameterGrid(self.param_distributions)
|
grid_size = len(param_grid) |
|
n_iter = self.n_iter |
|
|
|
if grid_size < n_iter: |
|
warnings.warn( |
|
"The total space of parameters %d is smaller " |
|
"than n_iter=%d. Running %d iterations. For exhaustive " |
|
"searches, use GridSearchCV." % (grid_size, self.n_iter, grid_size), |
|
UserWarning, |
|
) |
|
n_iter = grid_size |
|
for i in sample_without_replacement(grid_size, n_iter, random_state=rng): |
|
yield param_grid[i] |
|
|
|
else: |
|
for _ in range(self.n_iter): |
|
dist = rng.choice(self.param_distributions) |
|
|
|
items = sorted(dist.items()) |
|
params = dict() |
|
for k, v in items: |
|
if hasattr(v, "rvs"): |
|
params[k] = v.rvs(random_state=rng) |
|
else: |
|
params[k] = v[rng.randint(len(v))] |
|
yield params |
|
|
|
def __len__(self): |
|
"""Number of points that will be sampled.""" |
|
if self._is_all_lists(): |
|
grid_size = len(ParameterGrid(self.param_distributions)) |
|
return min(self.n_iter, grid_size) |
|
else: |
|
return self.n_iter |
|
|
|
|
|
def _check_refit(search_cv, attr): |
|
if not search_cv.refit: |
|
raise AttributeError( |
|
f"This {type(search_cv).__name__} instance was initialized with " |
|
f"`refit=False`. {attr} is available only after refitting on the best " |
|
"parameters. You can refit an estimator manually using the " |
|
"`best_params_` attribute" |
|
) |
|
|
|
|
|
def _search_estimator_has(attr): |
|
"""Check if we can delegate a method to the underlying estimator. |
|
|
|
    Calling a prediction method will only be available if `refit=True`. In
    that case, we first check the fitted best estimator. If it is not
    fitted, we check the unfitted estimator.

    Checking the unfitted estimator allows using `hasattr` on the `SearchCV`
    instance even before calling `fit`.
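
    A minimal usage sketch, mirroring how the prediction methods below are
    guarded::

        @available_if(_search_estimator_has("predict"))
        def predict(self, X):
            ...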
|
""" |
|
|
|
def check(self): |
|
_check_refit(self, attr) |
|
if hasattr(self, "best_estimator_"): |
|
|
|
            # raise an AttributeError if `attr` does not exist
            getattr(self.best_estimator_, attr)
|
return True |
|
|
|
        # raise an AttributeError if `attr` does not exist
        getattr(self.estimator, attr)
|
return True |
|
|
|
return check |
|
|
|
|
|
def _yield_masked_array_for_each_param(candidate_params): |
|
""" |
|
Yield a masked array for each candidate param. |
|
|
|
`candidate_params` is a sequence of params which were used in |
|
a `GridSearchCV`. We use masked arrays for the results, as not |
|
all params are necessarily present in each element of |
|
`candidate_params`. For example, if using `GridSearchCV` with |
|
a `SVC` model, then one might search over params like: |
|
|
|
- kernel=["rbf"], gamma=[0.1, 1] |
|
- kernel=["poly"], degree=[1, 2] |
|
|
|
and then param `'gamma'` would not be present in entries of |
|
`candidate_params` corresponding to `kernel='poly'`. |
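
    A minimal sketch of the yielded values for that search (illustration
    only; this is a private helper):

    >>> candidates = [{"kernel": "rbf", "gamma": 0.1},
    ...               {"kernel": "poly", "degree": 2}]
    >>> out = dict(_yield_masked_array_for_each_param(candidates))
    >>> sorted(out)
    ['param_degree', 'param_gamma', 'param_kernel']
    >>> out["param_degree"].mask.tolist()
    [True, False]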
|
""" |
|
n_candidates = len(candidate_params) |
|
param_results = defaultdict(dict) |
|
|
|
for cand_idx, params in enumerate(candidate_params): |
|
for name, value in params.items(): |
|
param_results["param_%s" % name][cand_idx] = value |
|
|
|
for key, param_result in param_results.items(): |
|
param_list = list(param_result.values()) |
|
try: |
|
arr = np.array(param_list) |
|
except ValueError: |
|
|
|
|
|
|
|
            # `param_list` could not be coerced into an array: fall back to an
            # object dtype so heterogeneous values are stored as-is.
            arr_dtype = np.dtype(object)
|
else: |
|
|
|
|
|
|
|
|
|
|
|
|
|
            # Keep the inferred dtype only for one-dimensional, non-string
            # arrays; strings fall back to object dtype to avoid fixed-width
            # truncation, and ragged entries to preserve each value as-is.
            arr_dtype = arr.dtype if arr.dtype.kind != "U" and arr.ndim == 1 else object
|
|
|
|
|
|
|
ma = MaskedArray(np.empty(n_candidates, dtype=arr_dtype), mask=True) |
|
for index, value in param_result.items(): |
|
|
|
            # Setting the value at an index automatically unmasks that index.
            ma[index] = value
|
yield (key, ma) |
|
|
|
|
|
class BaseSearchCV(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): |
|
"""Abstract base class for hyper parameter search with cross-validation.""" |
|
|
|
_parameter_constraints: dict = { |
|
"estimator": [HasMethods(["fit"])], |
|
"scoring": [ |
|
StrOptions(set(get_scorer_names())), |
|
callable, |
|
list, |
|
tuple, |
|
dict, |
|
None, |
|
], |
|
"n_jobs": [numbers.Integral, None], |
|
"refit": ["boolean", str, callable], |
|
"cv": ["cv_object"], |
|
"verbose": ["verbose"], |
|
"pre_dispatch": [numbers.Integral, str], |
|
"error_score": [StrOptions({"raise"}), numbers.Real], |
|
"return_train_score": ["boolean"], |
|
} |
|
|
|
@abstractmethod |
|
def __init__( |
|
self, |
|
estimator, |
|
*, |
|
scoring=None, |
|
n_jobs=None, |
|
refit=True, |
|
cv=None, |
|
verbose=0, |
|
pre_dispatch="2*n_jobs", |
|
error_score=np.nan, |
|
return_train_score=True, |
|
): |
|
self.scoring = scoring |
|
self.estimator = estimator |
|
self.n_jobs = n_jobs |
|
self.refit = refit |
|
self.cv = cv |
|
self.verbose = verbose |
|
self.pre_dispatch = pre_dispatch |
|
self.error_score = error_score |
|
self.return_train_score = return_train_score |
|
|
|
@property |
|
|
|
def _estimator_type(self): |
|
return self.estimator._estimator_type |
|
|
|
def __sklearn_tags__(self): |
|
tags = super().__sklearn_tags__() |
|
sub_estimator_tags = get_tags(self.estimator) |
|
tags.estimator_type = sub_estimator_tags.estimator_type |
|
tags.classifier_tags = deepcopy(sub_estimator_tags.classifier_tags) |
|
tags.regressor_tags = deepcopy(sub_estimator_tags.regressor_tags) |
|
|
|
tags.input_tags.pairwise = sub_estimator_tags.input_tags.pairwise |
|
tags.input_tags.sparse = sub_estimator_tags.input_tags.sparse |
|
tags.array_api_support = sub_estimator_tags.array_api_support |
|
return tags |
|
|
|
def score(self, X, y=None, **params): |
|
"""Return the score on the given data, if the estimator has been refit. |
|
|
|
This uses the score defined by ``scoring`` where provided, and the |
|
``best_estimator_.score`` method otherwise. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Input data, where `n_samples` is the number of samples and |
|
`n_features` is the number of features. |
|
|
|
y : array-like of shape (n_samples, n_output) \ |
|
or (n_samples,), default=None |
|
Target relative to X for classification or regression; |
|
None for unsupervised learning. |
|
|
|
**params : dict |
|
Parameters to be passed to the underlying scorer(s). |
|
|
|
.. versionadded:: 1.4 |
|
Only available if `enable_metadata_routing=True`. See |
|
:ref:`Metadata Routing User Guide <metadata_routing>` for more |
|
details. |
|
|
|
Returns |
|
------- |
|
score : float |
|
The score defined by ``scoring`` if provided, and the |
|
``best_estimator_.score`` method otherwise. |
|
""" |
|
_check_refit(self, "score") |
|
check_is_fitted(self) |
|
|
|
_raise_for_params(params, self, "score") |
|
|
|
if _routing_enabled(): |
|
score_params = process_routing(self, "score", **params).scorer["score"] |
|
else: |
|
score_params = dict() |
|
|
|
if self.scorer_ is None: |
|
raise ValueError( |
|
"No score function explicitly defined, " |
|
"and the estimator doesn't provide one %s" % self.best_estimator_ |
|
) |
|
if isinstance(self.scorer_, dict): |
|
if self.multimetric_: |
|
scorer = self.scorer_[self.refit] |
|
else: |
|
scorer = self.scorer_ |
|
return scorer(self.best_estimator_, X, y, **score_params) |
|
|
|
|
|
        # callable scorer: may return a single score or a dict of scores
        score = self.scorer_(self.best_estimator_, X, y, **score_params)
|
if self.multimetric_: |
|
score = score[self.refit] |
|
return score |
|
|
|
@available_if(_search_estimator_has("score_samples")) |
|
def score_samples(self, X): |
|
"""Call score_samples on the estimator with the best found parameters. |
|
|
|
Only available if ``refit=True`` and the underlying estimator supports |
|
``score_samples``. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
Parameters |
|
---------- |
|
X : iterable |
|
Data to predict on. Must fulfill input requirements |
|
of the underlying estimator. |
|
|
|
Returns |
|
------- |
|
y_score : ndarray of shape (n_samples,) |
|
            The scores returned by the ``best_estimator_.score_samples`` method.
|
""" |
|
check_is_fitted(self) |
|
return self.best_estimator_.score_samples(X) |
|
|
|
@available_if(_search_estimator_has("predict")) |
|
def predict(self, X): |
|
"""Call predict on the estimator with the best found parameters. |
|
|
|
Only available if ``refit=True`` and the underlying estimator supports |
|
``predict``. |
|
|
|
Parameters |
|
---------- |
|
X : indexable, length n_samples |
|
Must fulfill the input assumptions of the |
|
underlying estimator. |
|
|
|
Returns |
|
------- |
|
y_pred : ndarray of shape (n_samples,) |
|
The predicted labels or values for `X` based on the estimator with |
|
the best found parameters. |
|
""" |
|
check_is_fitted(self) |
|
return self.best_estimator_.predict(X) |
|
|
|
@available_if(_search_estimator_has("predict_proba")) |
|
def predict_proba(self, X): |
|
"""Call predict_proba on the estimator with the best found parameters. |
|
|
|
Only available if ``refit=True`` and the underlying estimator supports |
|
``predict_proba``. |
|
|
|
Parameters |
|
---------- |
|
X : indexable, length n_samples |
|
Must fulfill the input assumptions of the |
|
underlying estimator. |
|
|
|
Returns |
|
------- |
|
y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes) |
|
Predicted class probabilities for `X` based on the estimator with |
|
the best found parameters. The order of the classes corresponds |
|
to that in the fitted attribute :term:`classes_`. |
|
""" |
|
check_is_fitted(self) |
|
return self.best_estimator_.predict_proba(X) |
|
|
|
@available_if(_search_estimator_has("predict_log_proba")) |
|
def predict_log_proba(self, X): |
|
"""Call predict_log_proba on the estimator with the best found parameters. |
|
|
|
Only available if ``refit=True`` and the underlying estimator supports |
|
``predict_log_proba``. |
|
|
|
Parameters |
|
---------- |
|
X : indexable, length n_samples |
|
Must fulfill the input assumptions of the |
|
underlying estimator. |
|
|
|
Returns |
|
------- |
|
y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes) |
|
Predicted class log-probabilities for `X` based on the estimator |
|
with the best found parameters. The order of the classes |
|
corresponds to that in the fitted attribute :term:`classes_`. |
|
""" |
|
check_is_fitted(self) |
|
return self.best_estimator_.predict_log_proba(X) |
|
|
|
@available_if(_search_estimator_has("decision_function")) |
|
def decision_function(self, X): |
|
"""Call decision_function on the estimator with the best found parameters. |
|
|
|
Only available if ``refit=True`` and the underlying estimator supports |
|
``decision_function``. |
|
|
|
Parameters |
|
---------- |
|
X : indexable, length n_samples |
|
Must fulfill the input assumptions of the |
|
underlying estimator. |
|
|
|
Returns |
|
------- |
|
y_score : ndarray of shape (n_samples,) or (n_samples, n_classes) \ |
|
or (n_samples, n_classes * (n_classes-1) / 2) |
|
Result of the decision function for `X` based on the estimator with |
|
the best found parameters. |
|
""" |
|
check_is_fitted(self) |
|
return self.best_estimator_.decision_function(X) |
|
|
|
@available_if(_search_estimator_has("transform")) |
|
def transform(self, X): |
|
"""Call transform on the estimator with the best found parameters. |
|
|
|
Only available if the underlying estimator supports ``transform`` and |
|
``refit=True``. |
|
|
|
Parameters |
|
---------- |
|
X : indexable, length n_samples |
|
Must fulfill the input assumptions of the |
|
underlying estimator. |
|
|
|
Returns |
|
------- |
|
Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
`X` transformed in the new space based on the estimator with |
|
the best found parameters. |
|
""" |
|
check_is_fitted(self) |
|
return self.best_estimator_.transform(X) |
|
|
|
@available_if(_search_estimator_has("inverse_transform")) |
|
def inverse_transform(self, X=None, Xt=None): |
|
"""Call inverse_transform on the estimator with the best found params. |
|
|
|
Only available if the underlying estimator implements |
|
``inverse_transform`` and ``refit=True``. |
|
|
|
Parameters |
|
---------- |
|
X : indexable, length n_samples |
|
Must fulfill the input assumptions of the |
|
underlying estimator. |
|
|
|
Xt : indexable, length n_samples |
|
Must fulfill the input assumptions of the |
|
underlying estimator. |
|
|
|
.. deprecated:: 1.5 |
|
`Xt` was deprecated in 1.5 and will be removed in 1.7. Use `X` instead. |
|
|
|
Returns |
|
------- |
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
Result of the `inverse_transform` function for `Xt` based on the |
|
estimator with the best found parameters. |
|
""" |
|
X = _deprecate_Xt_in_inverse_transform(X, Xt) |
|
check_is_fitted(self) |
|
return self.best_estimator_.inverse_transform(X) |
|
|
|
@property |
|
def n_features_in_(self): |
|
"""Number of features seen during :term:`fit`. |
|
|
|
Only available when `refit=True`. |
|
""" |
|
|
|
|
|
try: |
|
check_is_fitted(self) |
|
except NotFittedError as nfe: |
|
raise AttributeError( |
|
"{} object has no n_features_in_ attribute.".format( |
|
self.__class__.__name__ |
|
) |
|
) from nfe |
|
|
|
return self.best_estimator_.n_features_in_ |
|
|
|
@property |
|
def classes_(self): |
|
"""Class labels. |
|
|
|
Only available when `refit=True` and the estimator is a classifier. |
|
""" |
|
_search_estimator_has("classes_")(self) |
|
return self.best_estimator_.classes_ |
|
|
|
def _run_search(self, evaluate_candidates): |
|
"""Repeatedly calls `evaluate_candidates` to conduct a search. |
|
|
|
This method, implemented in sub-classes, makes it possible to |
|
customize the scheduling of evaluations: GridSearchCV and |
|
RandomizedSearchCV schedule evaluations for their whole parameter |
|
search space at once but other more sequential approaches are also |
|
        possible: for instance, it is possible to iteratively schedule evaluations
|
for new regions of the parameter search space based on previously |
|
collected evaluation results. This makes it possible to implement |
|
Bayesian optimization or more generally sequential model-based |
|
optimization by deriving from the BaseSearchCV abstract base class. |
|
For example, Successive Halving is implemented by calling |
|
`evaluate_candidates` multiples times (once per iteration of the SH |
|
process), each time passing a different set of candidates with `X` |
|
and `y` of increasing sizes. |
|
|
|
Parameters |
|
---------- |
|
evaluate_candidates : callable |
|
This callback accepts: |
|
- a list of candidates, where each candidate is a dict of |
|
parameter settings. |
|
- an optional `cv` parameter which can be used to e.g. |
|
evaluate candidates on different dataset splits, or |
|
evaluate candidates on subsampled data (as done in the |
|
              SuccessiveHalving estimators). By default, the original `cv`
|
parameter is used, and it is available as a private |
|
`_checked_cv_orig` attribute. |
|
- an optional `more_results` dict. Each key will be added to |
|
the `cv_results_` attribute. Values should be lists of |
|
length `n_candidates` |
|
|
|
It returns a dict of all results so far, formatted like |
|
``cv_results_``. |
|
|
|
Important note (relevant whether the default cv is used or not): |
|
in randomized splitters, and unless the random_state parameter of |
|
cv was set to an int, calling cv.split() multiple times will |
|
yield different splits. Since cv.split() is called in |
|
evaluate_candidates, this means that candidates will be evaluated |
|
on different splits each time evaluate_candidates is called. This |
|
might be a methodological issue depending on the search strategy |
|
that you're implementing. To prevent randomized splitters from |
|
being used, you may use _split._yields_constant_splits() |
|
|
|
Examples |
|
-------- |
|
|
|
:: |
|
|
|
def _run_search(self, evaluate_candidates): |
|
'Try C=0.1 only if C=1 is better than C=10' |
|
all_results = evaluate_candidates([{'C': 1}, {'C': 10}]) |
|
score = all_results['mean_test_score'] |
|
                if score[0] > score[1]:
|
evaluate_candidates([{'C': 0.1}]) |
|
""" |
|
raise NotImplementedError("_run_search not implemented.") |
|
|
|
def _check_refit_for_multimetric(self, scores): |
|
"""Check `refit` is compatible with `scores` is valid""" |
|
multimetric_refit_msg = ( |
|
"For multi-metric scoring, the parameter refit must be set to a " |
|
"scorer key or a callable to refit an estimator with the best " |
|
"parameter setting on the whole data and make the best_* " |
|
"attributes available for that metric. If this is not needed, " |
|
f"refit should be set to False explicitly. {self.refit!r} was " |
|
"passed." |
|
) |
|
|
|
valid_refit_dict = isinstance(self.refit, str) and self.refit in scores |
|
|
|
if ( |
|
self.refit is not False |
|
and not valid_refit_dict |
|
and not callable(self.refit) |
|
): |
|
raise ValueError(multimetric_refit_msg) |
|
|
|
@staticmethod |
|
def _select_best_index(refit, refit_metric, results): |
|
"""Select index of the best combination of hyperparemeters.""" |
|
if callable(refit): |
|
|
|
|
|
            # If callable, refit is expected to return the index of the best
            # parameter set.
            best_index = refit(results)
|
if not isinstance(best_index, numbers.Integral): |
|
raise TypeError("best_index_ returned is not an integer") |
|
if best_index < 0 or best_index >= len(results["params"]): |
|
raise IndexError("best_index_ index out of range") |
|
else: |
|
best_index = results[f"rank_test_{refit_metric}"].argmin() |
|
return best_index |
|
|
|
def _get_scorers(self): |
|
"""Get the scorer(s) to be used. |
|
|
|
This is used in ``fit`` and ``get_metadata_routing``. |
|
|
|
Returns |
|
------- |
|
scorers, refit_metric |
|
""" |
|
refit_metric = "score" |
|
|
|
if callable(self.scoring): |
|
scorers = self.scoring |
|
elif self.scoring is None or isinstance(self.scoring, str): |
|
scorers = check_scoring(self.estimator, self.scoring) |
|
else: |
|
scorers = _check_multimetric_scoring(self.estimator, self.scoring) |
|
self._check_refit_for_multimetric(scorers) |
|
refit_metric = self.refit |
|
scorers = _MultimetricScorer( |
|
scorers=scorers, raise_exc=(self.error_score == "raise") |
|
) |
|
|
|
return scorers, refit_metric |
|
|
|
def _get_routed_params_for_fit(self, params): |
|
"""Get the parameters to be used for routing. |
|
|
|
This is a method instead of a snippet in ``fit`` since it's used twice, |
|
here in ``fit``, and in ``HalvingRandomSearchCV.fit``. |
|
""" |
|
if _routing_enabled(): |
|
routed_params = process_routing(self, "fit", **params) |
|
else: |
|
params = params.copy() |
|
groups = params.pop("groups", None) |
|
routed_params = Bunch( |
|
estimator=Bunch(fit=params), |
|
splitter=Bunch(split={"groups": groups}), |
|
scorer=Bunch(score={}), |
|
) |
|
return routed_params |
|
|
|
@_fit_context( |
|
|
|
        # *SearchCV.estimator is not validated yet
        prefer_skip_nested_validation=False
|
) |
|
def fit(self, X, y=None, **params): |
|
"""Run fit with all sets of parameters. |
|
|
|
Parameters |
|
---------- |
|
|
|
X : array-like of shape (n_samples, n_features) or (n_samples, n_samples) |
|
Training vectors, where `n_samples` is the number of samples and |
|
`n_features` is the number of features. For precomputed kernel or |
|
distance matrix, the expected shape of X is (n_samples, n_samples). |
|
|
|
y : array-like of shape (n_samples, n_output) \ |
|
or (n_samples,), default=None |
|
Target relative to X for classification or regression; |
|
None for unsupervised learning. |
|
|
|
**params : dict of str -> object |
|
Parameters passed to the ``fit`` method of the estimator, the scorer, |
|
and the CV splitter. |
|
|
|
            If a fit parameter is an array-like whose length is equal to
            `num_samples`, then it will be split by cross-validation along
            with `X` and `y`. For example, the :term:`sample_weight` parameter
            is split because `len(sample_weight) == len(X)`. However, this
            behavior does not apply to `groups`, which is passed to the
            splitter configured via the `cv` parameter of the constructor.
            Thus, `groups` is used *to perform the split* and determines which
            samples are assigned to each side of a split.
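
            A minimal sketch (assuming the underlying estimator accepts
            ``sample_weight``)::

                search.fit(X, y, sample_weight=weights, groups=groups)

            Here ``sample_weight`` is split along with ``X`` and ``y`` for
            each fold, while ``groups`` is forwarded to the CV splitter.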
|
|
|
Returns |
|
------- |
|
self : object |
|
Instance of fitted estimator. |
|
""" |
|
estimator = self.estimator |
|
scorers, refit_metric = self._get_scorers() |
|
|
|
X, y = indexable(X, y) |
|
params = _check_method_params(X, params=params) |
|
|
|
routed_params = self._get_routed_params_for_fit(params) |
|
|
|
cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator)) |
|
n_splits = cv_orig.get_n_splits(X, y, **routed_params.splitter.split) |
|
|
|
base_estimator = clone(self.estimator) |
|
|
|
parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch) |
|
|
|
fit_and_score_kwargs = dict( |
|
scorer=scorers, |
|
fit_params=routed_params.estimator.fit, |
|
score_params=routed_params.scorer.score, |
|
return_train_score=self.return_train_score, |
|
return_n_test_samples=True, |
|
return_times=True, |
|
return_parameters=False, |
|
error_score=self.error_score, |
|
verbose=self.verbose, |
|
) |
|
results = {} |
|
with parallel: |
|
all_candidate_params = [] |
|
all_out = [] |
|
all_more_results = defaultdict(list) |
|
|
|
def evaluate_candidates(candidate_params, cv=None, more_results=None): |
|
cv = cv or cv_orig |
|
candidate_params = list(candidate_params) |
|
n_candidates = len(candidate_params) |
|
|
|
if self.verbose > 0: |
|
print( |
|
"Fitting {0} folds for each of {1} candidates," |
|
" totalling {2} fits".format( |
|
n_splits, n_candidates, n_candidates * n_splits |
|
) |
|
) |
|
|
|
out = parallel( |
|
delayed(_fit_and_score)( |
|
clone(base_estimator), |
|
X, |
|
y, |
|
train=train, |
|
test=test, |
|
parameters=parameters, |
|
split_progress=(split_idx, n_splits), |
|
candidate_progress=(cand_idx, n_candidates), |
|
**fit_and_score_kwargs, |
|
) |
|
for (cand_idx, parameters), (split_idx, (train, test)) in product( |
|
enumerate(candidate_params), |
|
enumerate(cv.split(X, y, **routed_params.splitter.split)), |
|
) |
|
) |
|
|
|
if len(out) < 1: |
|
raise ValueError( |
|
"No fits were performed. " |
|
"Was the CV iterator empty? " |
|
"Were there no candidates?" |
|
) |
|
elif len(out) != n_candidates * n_splits: |
|
raise ValueError( |
|
"cv.split and cv.get_n_splits returned " |
|
"inconsistent results. Expected {} " |
|
"splits, got {}".format(n_splits, len(out) // n_candidates) |
|
) |
|
|
|
_warn_or_raise_about_fit_failures(out, self.error_score) |
|
|
|
|
|
|
|
|
|
|
|
                # For a callable self.scoring, the return type is only known
                # after calling it. If it is a dictionary, the error scores
                # can now be inserted with the correct keys.
                if callable(self.scoring):
                    _insert_error_scores(out, self.error_score)
|
|
|
all_candidate_params.extend(candidate_params) |
|
all_out.extend(out) |
|
|
|
if more_results is not None: |
|
for key, value in more_results.items(): |
|
all_more_results[key].extend(value) |
|
|
|
nonlocal results |
|
results = self._format_results( |
|
all_candidate_params, n_splits, all_out, all_more_results |
|
) |
|
|
|
return results |
|
|
|
self._run_search(evaluate_candidates) |
|
|
|
|
|
|
|
            # multimetric is determined here because in the case of a callable
            # self.scoring the return type is only known after calling
            first_test_score = all_out[0]["test_scores"]
            self.multimetric_ = isinstance(first_test_score, dict)
|
|
|
|
|
            # check refit_metric now for a callable scorer that is multimetric
            if callable(self.scoring) and self.multimetric_:
                self._check_refit_for_multimetric(first_test_score)
                refit_metric = self.refit
|
|
|
|
|
|
|
|
|
        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names.
        # In single-metric evaluation, refit_metric is "score".
        if self.refit or not self.multimetric_:
|
self.best_index_ = self._select_best_index( |
|
self.refit, refit_metric, results |
|
) |
|
            if not callable(self.refit):
                # With a non-custom callable, we can select the best score
                # based on the best index
                self.best_score_ = results[f"mean_test_{refit_metric}"][
                    self.best_index_
                ]
|
self.best_params_ = results["params"][self.best_index_] |
|
|
|
if self.refit: |
|
|
|
|
|
|
|
|
|
            # we clone the estimator as well as the parameters, since the
            # parameters may themselves be estimators, e.g. when searching
            # over different estimators in a pipeline
            self.best_estimator_ = clone(base_estimator).set_params(
|
**clone(self.best_params_, safe=False) |
|
) |
|
|
|
refit_start_time = time.time() |
|
if y is not None: |
|
self.best_estimator_.fit(X, y, **routed_params.estimator.fit) |
|
else: |
|
self.best_estimator_.fit(X, **routed_params.estimator.fit) |
|
refit_end_time = time.time() |
|
self.refit_time_ = refit_end_time - refit_start_time |
|
|
|
if hasattr(self.best_estimator_, "feature_names_in_"): |
|
self.feature_names_in_ = self.best_estimator_.feature_names_in_ |
|
|
|
|
|
        # Store the scorers as a dict for multi-metric evaluation, and as a
        # single callable otherwise
        if isinstance(scorers, _MultimetricScorer):
|
self.scorer_ = scorers._scorers |
|
else: |
|
self.scorer_ = scorers |
|
|
|
self.cv_results_ = results |
|
self.n_splits_ = n_splits |
|
|
|
return self |
|
|
|
def _format_results(self, candidate_params, n_splits, out, more_results=None): |
|
n_candidates = len(candidate_params) |
|
out = _aggregate_score_dicts(out) |
|
|
|
results = dict(more_results or {}) |
|
for key, val in results.items(): |
|
|
|
|
|
            # each value is a list (per evaluate_candidates' convention); we
            # convert it to an array for consistency with the other keys
            results[key] = np.asarray(val)
|
|
|
def _store(key_name, array, weights=None, splits=False, rank=False): |
|
"""A small helper to store the scores/times to the cv_results_""" |
|
|
|
|
|
            # When out is iterated first by splits, then by parameters, we want
            # `array` to have `n_candidates` rows and `n_splits` columns.
            array = np.array(array, dtype=np.float64).reshape(n_candidates, n_splits)
|
if splits: |
|
for split_idx in range(n_splits): |
|
|
|
results["split%d_%s" % (split_idx, key_name)] = array[:, split_idx] |
|
|
|
array_means = np.average(array, axis=1, weights=weights) |
|
results["mean_%s" % key_name] = array_means |
|
|
|
if key_name.startswith(("train_", "test_")) and np.any( |
|
~np.isfinite(array_means) |
|
): |
|
warnings.warn( |
|
( |
|
f"One or more of the {key_name.split('_')[0]} scores " |
|
f"are non-finite: {array_means}" |
|
), |
|
category=UserWarning, |
|
) |
|
|
|
|
|
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(
|
np.average( |
|
(array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights |
|
) |
|
) |
|
results["std_%s" % key_name] = array_stds |
|
|
|
            if rank:
                # When the fit/scoring fails, `array_means` contains NaNs; we
                # exclude them from the ranking process and consider them tied
                # with the worst performers.
                if np.isnan(array_means).all():
                    # All fit/scoring routines failed.
                    rank_result = np.ones_like(array_means, dtype=np.int32)
|
else: |
|
min_array_means = np.nanmin(array_means) - 1 |
|
array_means = np.nan_to_num(array_means, nan=min_array_means) |
|
rank_result = rankdata(-array_means, method="min").astype( |
|
np.int32, copy=False |
|
) |
|
results["rank_%s" % key_name] = rank_result |
|
|
|
_store("fit_time", out["fit_time"]) |
|
_store("score_time", out["score_time"]) |
|
|
|
for param, ma in _yield_masked_array_for_each_param(candidate_params): |
|
results[param] = ma |
|
results["params"] = candidate_params |
|
|
|
test_scores_dict = _normalize_score_results(out["test_scores"]) |
|
if self.return_train_score: |
|
train_scores_dict = _normalize_score_results(out["train_scores"]) |
|
|
|
for scorer_name in test_scores_dict: |
|
|
|
_store( |
|
"test_%s" % scorer_name, |
|
test_scores_dict[scorer_name], |
|
splits=True, |
|
rank=True, |
|
weights=None, |
|
) |
|
if self.return_train_score: |
|
_store( |
|
"train_%s" % scorer_name, |
|
train_scores_dict[scorer_name], |
|
splits=True, |
|
) |
|
|
|
return results |
|
|
|
def get_metadata_routing(self): |
|
"""Get metadata routing of this object. |
|
|
|
Please check :ref:`User Guide <metadata_routing>` on how the routing |
|
mechanism works. |
|
|
|
.. versionadded:: 1.4 |
|
|
|
Returns |
|
------- |
|
routing : MetadataRouter |
|
A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating |
|
routing information. |
|
""" |
|
router = MetadataRouter(owner=self.__class__.__name__) |
|
router.add( |
|
estimator=self.estimator, |
|
method_mapping=MethodMapping().add(caller="fit", callee="fit"), |
|
) |
|
|
|
scorer, _ = self._get_scorers() |
|
router.add( |
|
scorer=scorer, |
|
method_mapping=MethodMapping() |
|
.add(caller="score", callee="score") |
|
.add(caller="fit", callee="score"), |
|
) |
|
router.add( |
|
splitter=self.cv, |
|
method_mapping=MethodMapping().add(caller="fit", callee="split"), |
|
) |
|
return router |
|
|
|
def _sk_visual_block_(self): |
|
if hasattr(self, "best_estimator_"): |
|
key, estimator = "best_estimator_", self.best_estimator_ |
|
else: |
|
key, estimator = "estimator", self.estimator |
|
|
|
return _VisualBlock( |
|
"parallel", |
|
[estimator], |
|
names=[f"{key}: {estimator.__class__.__name__}"], |
|
name_details=[str(estimator)], |
|
) |
|
|
|
|
|
class GridSearchCV(BaseSearchCV): |
|
"""Exhaustive search over specified parameter values for an estimator. |
|
|
|
Important members are fit, predict. |
|
|
|
GridSearchCV implements a "fit" and a "score" method. |
|
It also implements "score_samples", "predict", "predict_proba", |
|
"decision_function", "transform" and "inverse_transform" if they are |
|
implemented in the estimator used. |
|
|
|
The parameters of the estimator used to apply these methods are optimized |
|
by cross-validated grid-search over a parameter grid. |
|
|
|
Read more in the :ref:`User Guide <grid_search>`. |
|
|
|
Parameters |
|
---------- |
|
estimator : estimator object |
|
This is assumed to implement the scikit-learn estimator interface. |
|
Either estimator needs to provide a ``score`` function, |
|
or ``scoring`` must be passed. |
|
|
|
param_grid : dict or list of dictionaries |
|
        Dictionary with parameter names (`str`) as keys and lists of
|
parameter settings to try as values, or a list of such |
|
dictionaries, in which case the grids spanned by each dictionary |
|
in the list are explored. This enables searching over any sequence |
|
of parameter settings. |
|
|
|
scoring : str, callable, list, tuple or dict, default=None |
|
Strategy to evaluate the performance of the cross-validated model on |
|
the test set. |
|
|
|
If `scoring` represents a single score, one can use: |
|
|
|
- a single string (see :ref:`scoring_parameter`); |
|
- a callable (see :ref:`scoring_callable`) that returns a single value. |
|
|
|
If `scoring` represents multiple scores, one can use: |
|
|
|
- a list or tuple of unique strings; |
|
- a callable returning a dictionary where the keys are the metric |
|
names and the values are the metric scores; |
|
- a dictionary with metric names as keys and callables as values. |
|
|
|
See :ref:`multimetric_grid_search` for an example. |
|
|
|
n_jobs : int, default=None |
|
Number of jobs to run in parallel. |
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. |
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>` |
|
for more details. |
|
|
|
        .. versionchanged:: 0.20
|
`n_jobs` default changed from 1 to None |
|
|
|
refit : bool, str, or callable, default=True |
|
Refit an estimator using the best found parameters on the whole |
|
dataset. |
|
|
|
For multiple metric evaluation, this needs to be a `str` denoting the |
|
scorer that would be used to find the best parameters for refitting |
|
the estimator at the end. |
|
|
|
Where there are considerations other than maximum score in |
|
choosing a best estimator, ``refit`` can be set to a function which |
|
returns the selected ``best_index_`` given ``cv_results_``. In that |
|
case, the ``best_estimator_`` and ``best_params_`` will be set |
|
according to the returned ``best_index_`` while the ``best_score_`` |
|
attribute will not be available. |
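
        A minimal sketch of such a callable (``refit_lowest_std`` is a
        hypothetical helper, for illustration only)::

            import numpy as np

            def refit_lowest_std(cv_results):
                # among the top-ranked candidates, pick the one with the
                # smallest standard deviation of the test score
                best = np.flatnonzero(cv_results["rank_test_score"] == 1)
                return best[np.argmin(cv_results["std_test_score"][best])]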
|
|
|
The refitted estimator is made available at the ``best_estimator_`` |
|
attribute and permits using ``predict`` directly on this |
|
``GridSearchCV`` instance. |
|
|
|
Also for multiple metric evaluation, the attributes ``best_index_``, |
|
``best_score_`` and ``best_params_`` will only be available if |
|
``refit`` is set and all of them will be determined w.r.t this specific |
|
scorer. |
|
|
|
See ``scoring`` parameter to know more about multiple metric |
|
evaluation. |
|
|
|
See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py` |
|
to see how to design a custom selection strategy using a callable |
|
via `refit`. |
|
|
|
.. versionchanged:: 0.20 |
|
Support for callable added. |
|
|
|
cv : int, cross-validation generator or an iterable, default=None |
|
Determines the cross-validation splitting strategy. |
|
Possible inputs for cv are: |
|
|
|
- None, to use the default 5-fold cross validation, |
|
- integer, to specify the number of folds in a `(Stratified)KFold`, |
|
- :term:`CV splitter`, |
|
- An iterable yielding (train, test) splits as arrays of indices. |
|
|
|
For integer/None inputs, if the estimator is a classifier and ``y`` is |
|
either binary or multiclass, :class:`StratifiedKFold` is used. In all |
|
other cases, :class:`KFold` is used. These splitters are instantiated |
|
with `shuffle=False` so the splits will be the same across calls. |
|
|
|
        Refer to the :ref:`User Guide <cross_validation>` for the various
|
cross-validation strategies that can be used here. |
|
|
|
.. versionchanged:: 0.22 |
|
``cv`` default value if None changed from 3-fold to 5-fold. |
|
|
|
verbose : int |
|
Controls the verbosity: the higher, the more messages. |
|
|
|
- >1 : the computation time for each fold and parameter candidate is |
|
displayed; |
|
- >2 : the score is also displayed; |
|
- >3 : the fold and candidate parameter indexes are also displayed |
|
together with the starting time of the computation. |
|
|
|
pre_dispatch : int, or str, default='2*n_jobs' |
|
Controls the number of jobs that get dispatched during parallel |
|
execution. Reducing this number can be useful to avoid an |
|
explosion of memory consumption when more jobs get dispatched |
|
than CPUs can process. This parameter can be: |
|
|
|
- None, in which case all the jobs are immediately created and spawned. Use |
|
this for lightweight and fast-running jobs, to avoid delays due to on-demand |
|
spawning of the jobs |
|
- An int, giving the exact number of total jobs that are spawned |
|
- A str, giving an expression as a function of n_jobs, as in '2*n_jobs' |
|
|
|
error_score : 'raise' or numeric, default=np.nan |
|
Value to assign to the score if an error occurs in estimator fitting. |
|
If set to 'raise', the error is raised. If a numeric value is given, |
|
FitFailedWarning is raised. This parameter does not affect the refit |
|
step, which will always raise the error. |
|
|
|
return_train_score : bool, default=False |
|
If ``False``, the ``cv_results_`` attribute will not include training |
|
scores. |
|
Computing training scores is used to get insights on how different |
|
parameter settings impact the overfitting/underfitting trade-off. |
|
However computing the scores on the training set can be computationally |
|
expensive and is not strictly required to select the parameters that |
|
yield the best generalization performance. |
|
|
|
.. versionadded:: 0.19 |
|
|
|
.. versionchanged:: 0.21 |
|
Default value was changed from ``True`` to ``False`` |
|
|
|
Attributes |
|
---------- |
|
cv_results_ : dict of numpy (masked) ndarrays |
|
A dict with keys as column headers and values as columns, that can be |
|
imported into a pandas ``DataFrame``. |
|
|
|
        For instance, the table below
|
|
|
+------------+-----------+------------+-----------------+---+---------+ |
|
|param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...| |
|
+============+===========+============+=================+===+=========+ |
|
| 'poly' | -- | 2 | 0.80 |...| 2 | |
|
+------------+-----------+------------+-----------------+---+---------+ |
|
| 'poly' | -- | 3 | 0.70 |...| 4 | |
|
+------------+-----------+------------+-----------------+---+---------+ |
|
| 'rbf' | 0.1 | -- | 0.80 |...| 3 | |
|
+------------+-----------+------------+-----------------+---+---------+ |
|
| 'rbf' | 0.2 | -- | 0.93 |...| 1 | |
|
+------------+-----------+------------+-----------------+---+---------+ |
|
|
|
will be represented by a ``cv_results_`` dict of:: |
|
|
|
{ |
|
'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'], |
|
mask = [False False False False]...) |
|
'param_gamma': masked_array(data = [-- -- 0.1 0.2], |
|
mask = [ True True False False]...), |
|
'param_degree': masked_array(data = [2.0 3.0 -- --], |
|
mask = [False False True True]...), |
|
'split0_test_score' : [0.80, 0.70, 0.80, 0.93], |
|
'split1_test_score' : [0.82, 0.50, 0.70, 0.78], |
|
'mean_test_score' : [0.81, 0.60, 0.75, 0.85], |
|
'std_test_score' : [0.01, 0.10, 0.05, 0.08], |
|
'rank_test_score' : [2, 4, 3, 1], |
|
'split0_train_score' : [0.80, 0.92, 0.70, 0.93], |
|
'split1_train_score' : [0.82, 0.55, 0.70, 0.87], |
|
'mean_train_score' : [0.81, 0.74, 0.70, 0.90], |
|
'std_train_score' : [0.01, 0.19, 0.00, 0.03], |
|
'mean_fit_time' : [0.73, 0.63, 0.43, 0.49], |
|
'std_fit_time' : [0.01, 0.02, 0.01, 0.01], |
|
'mean_score_time' : [0.01, 0.06, 0.04, 0.04], |
|
'std_score_time' : [0.00, 0.00, 0.00, 0.01], |
|
'params' : [{'kernel': 'poly', 'degree': 2}, ...], |
|
} |
|
|
|
NOTE |
|
|
|
The key ``'params'`` is used to store a list of parameter |
|
settings dicts for all the parameter candidates. |
|
|
|
The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and |
|
``std_score_time`` are all in seconds. |
|
|
|
For multi-metric evaluation, the scores for all the scorers are |
|
available in the ``cv_results_`` dict at the keys ending with that |
|
scorer's name (``'_<scorer_name>'``) instead of ``'_score'`` shown |
|
above. ('split0_test_precision', 'mean_train_precision' etc.) |
|
|
|
best_estimator_ : estimator |
|
Estimator that was chosen by the search, i.e. estimator |
|
which gave highest score (or smallest loss if specified) |
|
on the left out data. Not available if ``refit=False``. |
|
|
|
See ``refit`` parameter for more information on allowed values. |
|
|
|
best_score_ : float |
|
        Mean cross-validated score of the ``best_estimator_``.
|
|
|
For multi-metric evaluation, this is present only if ``refit`` is |
|
specified. |
|
|
|
This attribute is not available if ``refit`` is a function. |
|
|
|
best_params_ : dict |
|
Parameter setting that gave the best results on the hold out data. |
|
|
|
For multi-metric evaluation, this is present only if ``refit`` is |
|
specified. |
|
|
|
best_index_ : int |
|
The index (of the ``cv_results_`` arrays) which corresponds to the best |
|
candidate parameter setting. |
|
|
|
The dict at ``search.cv_results_['params'][search.best_index_]`` gives |
|
the parameter setting for the best model, that gives the highest |
|
mean score (``search.best_score_``). |
|
|
|
For multi-metric evaluation, this is present only if ``refit`` is |
|
specified. |
|
|
|
scorer_ : function or a dict |
|
Scorer function used on the held out data to choose the best |
|
parameters for the model. |
|
|
|
For multi-metric evaluation, this attribute holds the validated |
|
``scoring`` dict which maps the scorer key to the scorer callable. |
|
|
|
n_splits_ : int |
|
The number of cross-validation splits (folds/iterations). |
|
|
|
refit_time_ : float |
|
Seconds used for refitting the best model on the whole dataset. |
|
|
|
This is present only if ``refit`` is not False. |
|
|
|
.. versionadded:: 0.20 |
|
|
|
multimetric_ : bool |
|
Whether or not the scorers compute several metrics. |
|
|
|
classes_ : ndarray of shape (n_classes,) |
|
        The class labels. This is present only if ``refit`` is specified and
|
the underlying estimator is a classifier. |
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. Only defined if |
|
`best_estimator_` is defined (see the documentation for the `refit` |
|
parameter for more details) and that `best_estimator_` exposes |
|
`n_features_in_` when fit. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Only defined if |
|
`best_estimator_` is defined (see the documentation for the `refit` |
|
parameter for more details) and that `best_estimator_` exposes |
|
`feature_names_in_` when fit. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
See Also |
|
-------- |
|
ParameterGrid : Generates all the combinations of a hyperparameter grid. |
|
train_test_split : Utility function to split the data into a development |
|
set usable for fitting a GridSearchCV instance and an evaluation set |
|
for its final evaluation. |
|
sklearn.metrics.make_scorer : Make a scorer from a performance metric or |
|
loss function. |
|
|
|
Notes |
|
----- |
|
The parameters selected are those that maximize the score of the left out |
|
data, unless an explicit score is passed in which case it is used instead. |
|
|
|
If `n_jobs` was set to a value higher than one, the data is copied for each |
|
point in the grid (and not `n_jobs` times). This is done for efficiency |
|
reasons if individual jobs take very little time, but may raise errors if |
|
the dataset is large and not enough memory is available. A workaround in |
|
this case is to set `pre_dispatch`. Then, the memory is copied only |
|
`pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * |
|
n_jobs`. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn import svm, datasets |
|
>>> from sklearn.model_selection import GridSearchCV |
|
>>> iris = datasets.load_iris() |
|
>>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} |
|
>>> svc = svm.SVC() |
|
>>> clf = GridSearchCV(svc, parameters) |
|
>>> clf.fit(iris.data, iris.target) |
|
GridSearchCV(estimator=SVC(), |
|
param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')}) |
|
>>> sorted(clf.cv_results_.keys()) |
|
['mean_fit_time', 'mean_score_time', 'mean_test_score',... |
|
'param_C', 'param_kernel', 'params',... |
|
'rank_test_score', 'split0_test_score',... |
|
'split2_test_score', ... |
|
'std_fit_time', 'std_score_time', 'std_test_score'] |
|
""" |
|
|
|
_parameter_constraints: dict = { |
|
**BaseSearchCV._parameter_constraints, |
|
"param_grid": [dict, list], |
|
} |
|
|
|
def __init__( |
|
self, |
|
estimator, |
|
param_grid, |
|
*, |
|
scoring=None, |
|
n_jobs=None, |
|
refit=True, |
|
cv=None, |
|
verbose=0, |
|
pre_dispatch="2*n_jobs", |
|
error_score=np.nan, |
|
return_train_score=False, |
|
): |
|
super().__init__( |
|
estimator=estimator, |
|
scoring=scoring, |
|
n_jobs=n_jobs, |
|
refit=refit, |
|
cv=cv, |
|
verbose=verbose, |
|
pre_dispatch=pre_dispatch, |
|
error_score=error_score, |
|
return_train_score=return_train_score, |
|
) |
|
self.param_grid = param_grid |
|
|
|
def _run_search(self, evaluate_candidates): |
|
"""Search all candidates in param_grid""" |
|
evaluate_candidates(ParameterGrid(self.param_grid)) |
|
|
|
|
|
class RandomizedSearchCV(BaseSearchCV): |
|
"""Randomized search on hyper parameters. |
|
|
|
RandomizedSearchCV implements a "fit" and a "score" method. |
|
It also implements "score_samples", "predict", "predict_proba", |
|
"decision_function", "transform" and "inverse_transform" if they are |
|
implemented in the estimator used. |
|
|
|
The parameters of the estimator used to apply these methods are optimized |
|
by cross-validated search over parameter settings. |
|
|
|
In contrast to GridSearchCV, not all parameter values are tried out, but |
|
rather a fixed number of parameter settings is sampled from the specified |
|
distributions. The number of parameter settings that are tried is |
|
given by n_iter. |
|
|
|
If all parameters are presented as a list, |
|
sampling without replacement is performed. If at least one parameter |
|
is given as a distribution, sampling with replacement is used. |
|
It is highly recommended to use continuous distributions for continuous |
|
parameters. |
|
|
|
Read more in the :ref:`User Guide <randomized_parameter_search>`. |
|
|
|
.. versionadded:: 0.14 |
|
|
|
Parameters |
|
---------- |
|
estimator : estimator object |
|
        An object of that type is instantiated for each candidate parameter setting.
|
This is assumed to implement the scikit-learn estimator interface. |
|
Either estimator needs to provide a ``score`` function, |
|
or ``scoring`` must be passed. |
|
|
|
param_distributions : dict or list of dicts |
|
        Dictionary with parameter names (`str`) as keys and distributions
|
or lists of parameters to try. Distributions must provide a ``rvs`` |
|
method for sampling (such as those from scipy.stats.distributions). |
|
If a list is given, it is sampled uniformly. |
|
If a list of dicts is given, first a dict is sampled uniformly, and |
|
then a parameter is sampled using that dict as above. |
|
|
|
n_iter : int, default=10 |
|
Number of parameter settings that are sampled. n_iter trades |
|
off runtime vs quality of the solution. |
|
|
|
scoring : str, callable, list, tuple or dict, default=None |
|
Strategy to evaluate the performance of the cross-validated model on |
|
the test set. |
|
|
|
If `scoring` represents a single score, one can use: |
|
|
|
- a single string (see :ref:`scoring_parameter`); |
|
- a callable (see :ref:`scoring_callable`) that returns a single value. |
|
|
|
If `scoring` represents multiple scores, one can use: |
|
|
|
- a list or tuple of unique strings; |
|
- a callable returning a dictionary where the keys are the metric |
|
names and the values are the metric scores; |
|
- a dictionary with metric names as keys and callables as values. |
|
|
|
See :ref:`multimetric_grid_search` for an example. |
|
|
|
If None, the estimator's score method is used. |
|
|
|
n_jobs : int, default=None |
|
Number of jobs to run in parallel. |
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. |
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>` |
|
for more details. |
|
|
|
        .. versionchanged:: 0.20
|
`n_jobs` default changed from 1 to None |
|
|
|
refit : bool, str, or callable, default=True |
|
Refit an estimator using the best found parameters on the whole |
|
dataset. |
|
|
|
For multiple metric evaluation, this needs to be a `str` denoting the |
|
scorer that would be used to find the best parameters for refitting |
|
the estimator at the end. |
|
|
|
Where there are considerations other than maximum score in |
|
choosing a best estimator, ``refit`` can be set to a function which |
|
returns the selected ``best_index_`` given the ``cv_results_``. In that |
|
case, the ``best_estimator_`` and ``best_params_`` will be set |
|
according to the returned ``best_index_`` while the ``best_score_`` |
|
attribute will not be available. |
|
|
|
The refitted estimator is made available at the ``best_estimator_`` |
|
attribute and permits using ``predict`` directly on this |
|
``RandomizedSearchCV`` instance. |
|
|
|
Also for multiple metric evaluation, the attributes ``best_index_``, |
|
``best_score_`` and ``best_params_`` will only be available if |
|
``refit`` is set and all of them will be determined w.r.t this specific |
|
scorer. |
|
|
|
See ``scoring`` parameter to know more about multiple metric |
|
evaluation. |
|
|
|
.. versionchanged:: 0.20 |
|
Support for callable added. |
|
|
|
cv : int, cross-validation generator or an iterable, default=None |
|
Determines the cross-validation splitting strategy. |
|
Possible inputs for cv are: |
|
|
|
- None, to use the default 5-fold cross validation, |
|
- integer, to specify the number of folds in a `(Stratified)KFold`, |
|
- :term:`CV splitter`, |
|
- An iterable yielding (train, test) splits as arrays of indices. |
|
|
|
For integer/None inputs, if the estimator is a classifier and ``y`` is |
|
either binary or multiclass, :class:`StratifiedKFold` is used. In all |
|
other cases, :class:`KFold` is used. These splitters are instantiated |
|
with `shuffle=False` so the splits will be the same across calls. |
|
|
|
        Refer to the :ref:`User Guide <cross_validation>` for the various
|
cross-validation strategies that can be used here. |
|
|
|
.. versionchanged:: 0.22 |
|
``cv`` default value if None changed from 3-fold to 5-fold. |
|
|
|
verbose : int |
|
Controls the verbosity: the higher, the more messages. |
|
|
|
- >1 : the computation time for each fold and parameter candidate is |
|
displayed; |
|
- >2 : the score is also displayed; |
|
- >3 : the fold and candidate parameter indexes are also displayed |
|
together with the starting time of the computation. |
|
|
|
    pre_dispatch : int, or str, default='2*n_jobs'
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

        - None, in which case all the jobs are immediately created and
          spawned. Use this for lightweight and fast-running jobs, to avoid
          delays due to on-demand spawning of the jobs;
        - an int, giving the exact number of total jobs that are spawned;
        - a str, giving an expression as a function of n_jobs, as in
          '2*n_jobs'.

    random_state : int, RandomState instance or None, default=None
        Pseudo random number generator state used for random uniform sampling
        from lists of possible values instead of scipy.stats distributions.
        Pass an int for reproducible output across multiple
        function calls.
        See :term:`Glossary <random_state>`.

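        For instance, lists and scipy.stats distributions can be mixed, and a
        fixed ``random_state`` makes the drawn candidates reproducible (the
        parameter space below is illustrative)::

            from scipy.stats import loguniform
            from sklearn.model_selection import ParameterSampler

            param_distributions = {
                "C": loguniform(1e-3, 1e3),  # sampled from the distribution
                "penalty": ["l1", "l2"],  # sampled uniformly from the list
            }
            candidates = list(
                ParameterSampler(param_distributions, n_iter=4, random_state=0)
            )
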
    error_score : 'raise' or numeric, default=np.nan
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        a FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    return_train_score : bool, default=False
        If ``False``, the ``cv_results_`` attribute will not include training
        scores.
        Computing training scores is used to get insights on how different
        parameter settings impact the overfitting/underfitting trade-off.
        However, computing the scores on the training set can be
        computationally expensive and is not strictly required to select
        the parameters that yield the best generalization performance.

        .. versionadded:: 0.19

        .. versionchanged:: 0.21
            Default value was changed from ``True`` to ``False``.

    Attributes
    ----------
    cv_results_ : dict of numpy (masked) ndarrays
        A dict with keys as column headers and values as columns, that can be
        imported into a pandas ``DataFrame``.

        For instance, the table below

        +--------------+-------------+-------------------+---+---------------+
        | param_kernel | param_gamma | split0_test_score |...|rank_test_score|
        +==============+=============+===================+===+===============+
        | 'rbf'        | 0.1         | 0.80              |...| 1             |
        +--------------+-------------+-------------------+---+---------------+
        | 'rbf'        | 0.2         | 0.84              |...| 3             |
        +--------------+-------------+-------------------+---+---------------+
        | 'rbf'        | 0.3         | 0.70              |...| 2             |
        +--------------+-------------+-------------------+---+---------------+

        will be represented by a ``cv_results_`` dict of::

            {
            'param_kernel'       : masked_array(data = ['rbf', 'rbf', 'rbf'],
                                                mask = False),
            'param_gamma'        : masked_array(data = [0.1, 0.2, 0.3],
                                                mask = False),
            'split0_test_score'  : [0.80, 0.84, 0.70],
            'split1_test_score'  : [0.82, 0.50, 0.70],
            'mean_test_score'    : [0.81, 0.67, 0.70],
            'std_test_score'     : [0.01, 0.24, 0.00],
            'rank_test_score'    : [1, 3, 2],
            'split0_train_score' : [0.80, 0.92, 0.70],
            'split1_train_score' : [0.82, 0.55, 0.70],
            'mean_train_score'   : [0.81, 0.74, 0.70],
            'std_train_score'    : [0.01, 0.19, 0.00],
            'mean_fit_time'      : [0.73, 0.63, 0.43],
            'std_fit_time'       : [0.01, 0.02, 0.01],
            'mean_score_time'    : [0.01, 0.06, 0.04],
            'std_score_time'     : [0.00, 0.00, 0.00],
            'params'             : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...],
            }

        NOTE

        The key ``'params'`` is used to store a list of parameter
        settings dicts for all the parameter candidates.

        The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
        ``std_score_time`` are all in seconds.

        For multi-metric evaluation, the scores for all the scorers are
        available in the ``cv_results_`` dict at the keys ending with that
        scorer's name (``'_<scorer_name>'``) instead of ``'_score'`` shown
        above (e.g. ``'split0_test_precision'``, ``'mean_train_precision'``).

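        A common pattern, sketched below, is to load ``cv_results_`` into a
        pandas ``DataFrame`` and sort it by rank (``search`` stands for a
        fitted ``RandomizedSearchCV`` instance and the column selection is
        illustrative)::

            import pandas as pd

            results = pd.DataFrame(search.cv_results_)
            cols = ["rank_test_score", "mean_test_score", "std_test_score", "params"]
            print(results.sort_values("rank_test_score")[cols])
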
    best_estimator_ : estimator
        Estimator that was chosen by the search, i.e. the estimator
        which gave the highest score (or smallest loss if specified)
        on the left-out data. Not available if ``refit=False``.

        For multi-metric evaluation, this attribute is present only if
        ``refit`` is specified.

        See the ``refit`` parameter for more information on allowed values.

    best_score_ : float
        Mean cross-validated score of the ``best_estimator_``.

        For multi-metric evaluation, this is not available if ``refit`` is
        ``False``. See the ``refit`` parameter for more information.

        This attribute is not available if ``refit`` is a function.

    best_params_ : dict
        Parameter setting that gave the best results on the held-out data.

        For multi-metric evaluation, this is not available if ``refit`` is
        ``False``. See the ``refit`` parameter for more information.

    best_index_ : int
        The index (of the ``cv_results_`` arrays) which corresponds to the best
        candidate parameter setting.

        The dict at ``search.cv_results_['params'][search.best_index_]`` gives
        the parameter setting for the best model, i.e. the one with the highest
        mean score (``search.best_score_``).

        For multi-metric evaluation, this is not available if ``refit`` is
        ``False``. See the ``refit`` parameter for more information.

    scorer_ : function or a dict
        Scorer function used on the held-out data to choose the best
        parameters for the model.

        For multi-metric evaluation, this attribute holds the validated
        ``scoring`` dict which maps the scorer key to the scorer callable.

    n_splits_ : int
        The number of cross-validation splits (folds/iterations).

    refit_time_ : float
        Seconds used for refitting the best model on the whole dataset.

        This is present only if ``refit`` is not False.

        .. versionadded:: 0.20

    multimetric_ : bool
        Whether or not the scorers compute several metrics.

    classes_ : ndarray of shape (n_classes,)
        The class labels. This is present only if ``refit`` is specified and
        the underlying estimator is a classifier.

    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if
        `best_estimator_` is defined (see the documentation for the `refit`
        parameter for more details) and if `best_estimator_` exposes
        `n_features_in_` when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Only defined if
        `best_estimator_` is defined (see the documentation for the `refit`
        parameter for more details) and if `best_estimator_` exposes
        `feature_names_in_` when fit.

        .. versionadded:: 1.0

    See Also
    --------
    GridSearchCV : Does exhaustive search over a grid of parameters.
    ParameterSampler : A generator over parameter settings, constructed from
        param_distributions.

    Notes
    -----
    The parameters selected are those that maximize the score of the held-out
    data, according to the scoring parameter.

    If `n_jobs` was set to a value higher than one, the data is copied for each
    parameter setting (and not `n_jobs` times). This is done for efficiency
    reasons if individual jobs take very little time, but may raise errors if
    the dataset is large and not enough memory is available. A workaround in
    this case is to set `pre_dispatch`. Then, the memory is copied only
    `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
    n_jobs`.

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.model_selection import RandomizedSearchCV
    >>> from scipy.stats import uniform
    >>> iris = load_iris()
    >>> logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200,
    ...                               random_state=0)
    >>> distributions = dict(C=uniform(loc=0, scale=4),
    ...                      penalty=['l2', 'l1'])
    >>> clf = RandomizedSearchCV(logistic, distributions, random_state=0)
    >>> search = clf.fit(iris.data, iris.target)
    >>> search.best_params_
    {'C': np.float64(2...), 'penalty': 'l1'}
    """

    _parameter_constraints: dict = {
        **BaseSearchCV._parameter_constraints,
        "param_distributions": [dict, list],
        "n_iter": [Interval(numbers.Integral, 1, None, closed="left")],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        estimator,
        param_distributions,
        *,
        n_iter=10,
        scoring=None,
        n_jobs=None,
        refit=True,
        cv=None,
        verbose=0,
        pre_dispatch="2*n_jobs",
        random_state=None,
        error_score=np.nan,
        return_train_score=False,
    ):
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.random_state = random_state
        super().__init__(
            estimator=estimator,
            scoring=scoring,
            n_jobs=n_jobs,
            refit=refit,
            cv=cv,
            verbose=verbose,
            pre_dispatch=pre_dispatch,
            error_score=error_score,
            return_train_score=return_train_score,
        )

    def _run_search(self, evaluate_candidates):
        """Search n_iter candidates from param_distributions"""
        evaluate_candidates(
            ParameterSampler(
                self.param_distributions, self.n_iter, random_state=self.random_state
            )
        )
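        # Note: when every entry in ``param_distributions`` is a finite list,
        # ``ParameterSampler`` samples without replacement, so at most the
        # full grid is evaluated even if ``n_iter`` is larger.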