"""
The :mod:`sklearn.model_selection._split` module includes classes and
functions to split the data based on a preset strategy.
"""

import numbers
import warnings
from abc import ABCMeta, abstractmethod
from collections import defaultdict
from collections.abc import Iterable
from inspect import signature
from itertools import chain, combinations
from math import ceil, floor

import numpy as np
from scipy.special import comb

from ..utils import (
    _safe_indexing,
    check_random_state,
    indexable,
    metadata_routing,
)
from ..utils._array_api import (
    _convert_to_numpy,
    ensure_common_namespace_device,
    get_namespace,
)
from ..utils._param_validation import Interval, RealNotInt, validate_params
from ..utils.extmath import _approximate_mode
from ..utils.metadata_routing import _MetadataRequester
from ..utils.multiclass import type_of_target
from ..utils.validation import _num_samples, check_array, column_or_1d

__all__ = [
    "BaseCrossValidator",
    "KFold",
    "GroupKFold",
    "LeaveOneGroupOut",
    "LeaveOneOut",
    "LeavePGroupsOut",
    "LeavePOut",
    "RepeatedStratifiedKFold",
    "RepeatedKFold",
    "ShuffleSplit",
    "GroupShuffleSplit",
    "StratifiedKFold",
    "StratifiedGroupKFold",
    "StratifiedShuffleSplit",
    "PredefinedSplit",
    "train_test_split",
    "check_cv",
]
|
|
|
|
|
class _UnsupportedGroupCVMixin:
    """Mixin for splitters that do not support ``groups``."""
|
|
|
def split(self, X, y=None, groups=None): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
y : array-like of shape (n_samples,) |
|
The target variable for supervised learning problems. |
|
|
|
groups : object |
|
Always ignored, exists for compatibility. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
""" |
|
if groups is not None: |
|
warnings.warn( |
|
f"The groups parameter is ignored by {self.__class__.__name__}", |
|
UserWarning, |
|
) |
|
return super().split(X, y, groups=groups) |
|
|
|
|
|
class GroupsConsumerMixin(_MetadataRequester):
    """A mixin requesting ``groups`` by default.

    This mixin makes the object request ``groups`` by default, i.e. its
    default metadata request for ``groups`` is set to ``True``.

    .. versionadded:: 1.3
    """
|
|
|
__metadata_request__split = {"groups": True} |
|
|
|
|
|
class BaseCrossValidator(_MetadataRequester, metaclass=ABCMeta): |
|
"""Base class for all cross-validators. |
|
|
|
Implementations must define `_iter_test_masks` or `_iter_test_indices`. |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
__metadata_request__split = {"groups": metadata_routing.UNUSED} |
|
|
|
def split(self, X, y=None, groups=None): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
y : array-like of shape (n_samples,) |
|
The target variable for supervised learning problems. |
|
|
|
groups : array-like of shape (n_samples,), default=None |
|
Group labels for the samples used while splitting the dataset into |
|
train/test set. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
""" |
|
X, y, groups = indexable(X, y, groups) |
|
indices = np.arange(_num_samples(X)) |
|
for test_index in self._iter_test_masks(X, y, groups): |
|
train_index = indices[np.logical_not(test_index)] |
|
test_index = indices[test_index] |
|
yield train_index, test_index |
|
|
|
|
|
|
|
def _iter_test_masks(self, X=None, y=None, groups=None): |
|
"""Generates boolean masks corresponding to test sets. |
|
|
|
By default, delegates to _iter_test_indices(X, y, groups) |
|
""" |
|
for test_index in self._iter_test_indices(X, y, groups): |
|
test_mask = np.zeros(_num_samples(X), dtype=bool) |
|
test_mask[test_index] = True |
|
yield test_mask |
|
|
|
def _iter_test_indices(self, X=None, y=None, groups=None): |
|
"""Generates integer indices corresponding to test sets.""" |
|
raise NotImplementedError |
|
|
|
@abstractmethod |
|
def get_n_splits(self, X=None, y=None, groups=None): |
|
"""Returns the number of splitting iterations in the cross-validator.""" |
|
|
|
def __repr__(self): |
|
return _build_repr(self) |
|
|
|
|
|
class LeaveOneOut(_UnsupportedGroupCVMixin, BaseCrossValidator): |
|
"""Leave-One-Out cross-validator. |
|
|
|
Provides train/test indices to split data in train/test sets. Each |
|
sample is used once as a test set (singleton) while the remaining |
|
samples form the training set. |
|
|
|
Note: ``LeaveOneOut()`` is equivalent to ``KFold(n_splits=n)`` and |
|
``LeavePOut(p=1)`` where ``n`` is the number of samples. |
|
|
|
Due to the high number of test sets (which is the same as the |
|
number of samples) this cross-validation method can be very costly. |
|
For large datasets one should favor :class:`KFold`, :class:`ShuffleSplit` |
|
or :class:`StratifiedKFold`. |
|
|
|
Read more in the :ref:`User Guide <leave_one_out>`. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import LeaveOneOut |
|
>>> X = np.array([[1, 2], [3, 4]]) |
|
>>> y = np.array([1, 2]) |
|
>>> loo = LeaveOneOut() |
|
>>> loo.get_n_splits(X) |
|
2 |
|
>>> print(loo) |
|
LeaveOneOut() |
|
>>> for i, (train_index, test_index) in enumerate(loo.split(X)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}") |
|
... print(f" Test: index={test_index}") |
|
Fold 0: |
|
Train: index=[1] |
|
Test: index=[0] |
|
Fold 1: |
|
Train: index=[0] |
|
Test: index=[1] |
|
|
|
See Also |
|
-------- |
|
LeaveOneGroupOut : For splitting the data according to explicit, |
|
domain-specific stratification of the dataset. |
|
GroupKFold : K-fold iterator variant with non-overlapping groups. |
|
""" |
|
|
|
def _iter_test_indices(self, X, y=None, groups=None): |
|
n_samples = _num_samples(X) |
|
if n_samples <= 1: |
|
raise ValueError( |
|
"Cannot perform LeaveOneOut with n_samples={}.".format(n_samples) |
|
) |
|
return range(n_samples) |
|
|
|
def get_n_splits(self, X, y=None, groups=None): |
|
"""Returns the number of splitting iterations in the cross-validator. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
y : object |
|
Always ignored, exists for compatibility. |
|
|
|
groups : object |
|
Always ignored, exists for compatibility. |
|
|
|
Returns |
|
------- |
|
n_splits : int |
|
Returns the number of splitting iterations in the cross-validator. |
|
""" |
|
if X is None: |
|
raise ValueError("The 'X' parameter should not be None.") |
|
return _num_samples(X) |
|
|
|
|
|
class LeavePOut(_UnsupportedGroupCVMixin, BaseCrossValidator): |
|
"""Leave-P-Out cross-validator. |
|
|
|
Provides train/test indices to split data in train/test sets. This results |
|
in testing on all distinct samples of size p, while the remaining n - p |
|
samples form the training set in each iteration. |
|
|
|
Note: ``LeavePOut(p)`` is NOT equivalent to |
|
``KFold(n_splits=n_samples // p)`` which creates non-overlapping test sets. |
|
|
|
    Due to the high number of iterations, which grows combinatorially with the
    number of samples, this cross-validation method can be very costly. For
|
large datasets one should favor :class:`KFold`, :class:`StratifiedKFold` |
|
or :class:`ShuffleSplit`. |
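
    For instance, the number of generated splits equals ``comb(n_samples, p)``
    and therefore grows very quickly with ``n_samples``; a rough
    order-of-magnitude check:

    >>> from scipy.special import comb
    >>> int(comb(100, 2, exact=True))
    4950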
|
|
|
Read more in the :ref:`User Guide <leave_p_out>`. |
|
|
|
Parameters |
|
---------- |
|
p : int |
|
Size of the test sets. Must be strictly less than the number of |
|
samples. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import LeavePOut |
|
>>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) |
|
>>> y = np.array([1, 2, 3, 4]) |
|
>>> lpo = LeavePOut(2) |
|
>>> lpo.get_n_splits(X) |
|
6 |
|
>>> print(lpo) |
|
LeavePOut(p=2) |
|
>>> for i, (train_index, test_index) in enumerate(lpo.split(X)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}") |
|
... print(f" Test: index={test_index}") |
|
Fold 0: |
|
Train: index=[2 3] |
|
Test: index=[0 1] |
|
Fold 1: |
|
Train: index=[1 3] |
|
Test: index=[0 2] |
|
Fold 2: |
|
Train: index=[1 2] |
|
Test: index=[0 3] |
|
Fold 3: |
|
Train: index=[0 3] |
|
Test: index=[1 2] |
|
Fold 4: |
|
Train: index=[0 2] |
|
Test: index=[1 3] |
|
Fold 5: |
|
Train: index=[0 1] |
|
Test: index=[2 3] |
|
""" |
|
|
|
def __init__(self, p): |
|
self.p = p |
|
|
|
def _iter_test_indices(self, X, y=None, groups=None): |
|
n_samples = _num_samples(X) |
|
if n_samples <= self.p: |
|
raise ValueError( |
|
"p={} must be strictly less than the number of samples={}".format( |
|
self.p, n_samples |
|
) |
|
) |
|
for combination in combinations(range(n_samples), self.p): |
|
yield np.array(combination) |
|
|
|
def get_n_splits(self, X, y=None, groups=None): |
|
"""Returns the number of splitting iterations in the cross-validator. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
y : object |
|
Always ignored, exists for compatibility. |
|
|
|
groups : object |
|
Always ignored, exists for compatibility. |
|
""" |
|
if X is None: |
|
raise ValueError("The 'X' parameter should not be None.") |
|
return int(comb(_num_samples(X), self.p, exact=True)) |
|
|
|
|
|
class _BaseKFold(BaseCrossValidator, metaclass=ABCMeta): |
|
"""Base class for K-Fold cross-validators and TimeSeriesSplit.""" |
|
|
|
@abstractmethod |
|
def __init__(self, n_splits, *, shuffle, random_state): |
|
if not isinstance(n_splits, numbers.Integral): |
|
raise ValueError( |
|
"The number of folds must be of Integral type. " |
|
"%s of type %s was passed." % (n_splits, type(n_splits)) |
|
) |
|
n_splits = int(n_splits) |
|
|
|
if n_splits <= 1: |
|
raise ValueError( |
|
"k-fold cross-validation requires at least one" |
|
" train/test split by setting n_splits=2 or more," |
|
" got n_splits={0}.".format(n_splits) |
|
) |
|
|
|
if not isinstance(shuffle, bool): |
|
raise TypeError("shuffle must be True or False; got {0}".format(shuffle)) |
|
|
|
if not shuffle and random_state is not None: |
|
raise ValueError( |
|
( |
|
"Setting a random_state has no effect since shuffle is " |
|
"False. You should leave " |
|
"random_state to its default (None), or set shuffle=True." |
|
), |
|
) |
|
|
|
self.n_splits = n_splits |
|
self.shuffle = shuffle |
|
self.random_state = random_state |
|
|
|
def split(self, X, y=None, groups=None): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
y : array-like of shape (n_samples,), default=None |
|
The target variable for supervised learning problems. |
|
|
|
groups : array-like of shape (n_samples,), default=None |
|
Group labels for the samples used while splitting the dataset into |
|
train/test set. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
""" |
|
X, y, groups = indexable(X, y, groups) |
|
n_samples = _num_samples(X) |
|
if self.n_splits > n_samples: |
|
raise ValueError( |
|
( |
|
"Cannot have number of splits n_splits={0} greater" |
|
" than the number of samples: n_samples={1}." |
|
).format(self.n_splits, n_samples) |
|
) |
|
|
|
for train, test in super().split(X, y, groups): |
|
yield train, test |
|
|
|
def get_n_splits(self, X=None, y=None, groups=None): |
|
"""Returns the number of splitting iterations in the cross-validator. |
|
|
|
Parameters |
|
---------- |
|
X : object |
|
Always ignored, exists for compatibility. |
|
|
|
y : object |
|
Always ignored, exists for compatibility. |
|
|
|
groups : object |
|
Always ignored, exists for compatibility. |
|
|
|
Returns |
|
------- |
|
n_splits : int |
|
Returns the number of splitting iterations in the cross-validator. |
|
""" |
|
return self.n_splits |
|
|
|
|
|
class KFold(_UnsupportedGroupCVMixin, _BaseKFold): |
|
"""K-Fold cross-validator. |
|
|
|
Provides train/test indices to split data in train/test sets. Split |
|
dataset into k consecutive folds (without shuffling by default). |
|
|
|
    Each fold is then used once as a validation set while the k - 1 remaining
|
folds form the training set. |
|
|
|
Read more in the :ref:`User Guide <k_fold>`. |
|
|
|
For visualisation of cross-validation behaviour and |
|
comparison between common scikit-learn split methods |
|
refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` |
|
|
|
Parameters |
|
---------- |
|
n_splits : int, default=5 |
|
Number of folds. Must be at least 2. |
|
|
|
.. versionchanged:: 0.22 |
|
``n_splits`` default value changed from 3 to 5. |
|
|
|
shuffle : bool, default=False |
|
Whether to shuffle the data before splitting into batches. |
|
Note that the samples within each split will not be shuffled. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
When `shuffle` is True, `random_state` affects the ordering of the |
|
indices, which controls the randomness of each fold. Otherwise, this |
|
parameter has no effect. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import KFold |
|
>>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) |
|
>>> y = np.array([1, 2, 3, 4]) |
|
>>> kf = KFold(n_splits=2) |
|
>>> kf.get_n_splits(X) |
|
2 |
|
>>> print(kf) |
|
KFold(n_splits=2, random_state=None, shuffle=False) |
|
>>> for i, (train_index, test_index) in enumerate(kf.split(X)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}") |
|
... print(f" Test: index={test_index}") |
|
Fold 0: |
|
Train: index=[2 3] |
|
Test: index=[0 1] |
|
Fold 1: |
|
Train: index=[0 1] |
|
Test: index=[2 3] |
|
|
|
Notes |
|
----- |
|
The first ``n_samples % n_splits`` folds have size |
|
``n_samples // n_splits + 1``, other folds have size |
|
``n_samples // n_splits``, where ``n_samples`` is the number of samples. |
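
    As a small illustration of this rule (the sample and split counts are
    arbitrary):

    >>> import numpy as np
    >>> n_samples, n_splits = 10, 3
    >>> sizes = np.full(n_splits, n_samples // n_splits)
    >>> sizes[: n_samples % n_splits] += 1
    >>> sizes
    array([4, 3, 3])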
|
|
|
Randomized CV splitters may return different results for each call of |
|
split. You can make the results identical by setting `random_state` |
|
to an integer. |
|
|
|
See Also |
|
-------- |
|
StratifiedKFold : Takes class information into account to avoid building |
|
folds with imbalanced class distributions (for binary or multiclass |
|
classification tasks). |
|
|
|
GroupKFold : K-fold iterator variant with non-overlapping groups. |
|
|
|
RepeatedKFold : Repeats K-Fold n times. |
|
""" |
|
|
|
def __init__(self, n_splits=5, *, shuffle=False, random_state=None): |
|
super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) |
|
|
|
def _iter_test_indices(self, X, y=None, groups=None): |
|
n_samples = _num_samples(X) |
|
indices = np.arange(n_samples) |
|
if self.shuffle: |
|
check_random_state(self.random_state).shuffle(indices) |
|
|
|
n_splits = self.n_splits |
|
fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int) |
|
fold_sizes[: n_samples % n_splits] += 1 |
|
current = 0 |
|
for fold_size in fold_sizes: |
|
start, stop = current, current + fold_size |
|
yield indices[start:stop] |
|
current = stop |
|
|
|
|
|
class GroupKFold(GroupsConsumerMixin, _BaseKFold): |
|
"""K-fold iterator variant with non-overlapping groups. |
|
|
|
Each group will appear exactly once in the test set across all folds (the |
|
number of distinct groups has to be at least equal to the number of folds). |
|
|
|
    The folds are approximately balanced in the sense that the number of
    samples is approximately the same in each test fold when `shuffle` is
    False. When `shuffle` is True, each fold instead contains approximately
    the same number of distinct groups.
|
|
|
Read more in the :ref:`User Guide <group_k_fold>`. |
|
|
|
For visualisation of cross-validation behaviour and |
|
comparison between common scikit-learn split methods |
|
refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` |
|
|
|
Parameters |
|
---------- |
|
n_splits : int, default=5 |
|
Number of folds. Must be at least 2. |
|
|
|
.. versionchanged:: 0.22 |
|
``n_splits`` default value changed from 3 to 5. |
|
|
|
shuffle : bool, default=False |
|
Whether to shuffle the groups before splitting into batches. |
|
Note that the samples within each split will not be shuffled. |
|
|
|
.. versionadded:: 1.6 |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
When `shuffle` is True, `random_state` affects the ordering of the |
|
indices, which controls the randomness of each fold. Otherwise, this |
|
parameter has no effect. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
.. versionadded:: 1.6 |
|
|
|
Notes |
|
----- |
|
Groups appear in an arbitrary order throughout the folds. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import GroupKFold |
|
>>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]) |
|
>>> y = np.array([1, 2, 3, 4, 5, 6]) |
|
>>> groups = np.array([0, 0, 2, 2, 3, 3]) |
|
>>> group_kfold = GroupKFold(n_splits=2) |
|
>>> group_kfold.get_n_splits(X, y, groups) |
|
2 |
|
>>> print(group_kfold) |
|
GroupKFold(n_splits=2, random_state=None, shuffle=False) |
|
>>> for i, (train_index, test_index) in enumerate(group_kfold.split(X, y, groups)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}, group={groups[train_index]}") |
|
... print(f" Test: index={test_index}, group={groups[test_index]}") |
|
Fold 0: |
|
Train: index=[2 3], group=[2 2] |
|
Test: index=[0 1 4 5], group=[0 0 3 3] |
|
Fold 1: |
|
Train: index=[0 1 4 5], group=[0 0 3 3] |
|
Test: index=[2 3], group=[2 2] |
|
|
|
See Also |
|
-------- |
|
LeaveOneGroupOut : For splitting the data according to explicit |
|
domain-specific stratification of the dataset. |
|
|
|
StratifiedKFold : Takes class information into account to avoid building |
|
folds with imbalanced class proportions (for binary or multiclass |
|
classification tasks). |
|
""" |
|
|
|
def __init__(self, n_splits=5, *, shuffle=False, random_state=None): |
|
super().__init__(n_splits, shuffle=shuffle, random_state=random_state) |
|
|
|
def _iter_test_indices(self, X, y, groups): |
|
if groups is None: |
|
raise ValueError("The 'groups' parameter should not be None.") |
|
groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) |
|
|
|
unique_groups, group_idx = np.unique(groups, return_inverse=True) |
|
n_groups = len(unique_groups) |
|
|
|
if self.n_splits > n_groups: |
|
raise ValueError( |
|
"Cannot have number of splits n_splits=%d greater" |
|
" than the number of groups: %d." % (self.n_splits, n_groups) |
|
) |
|
|
|
if self.shuffle: |
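            # With shuffle=True, permute the unique groups and split them into
            # ``n_splits`` chunks containing (approximately) the same number
            # of groups; each chunk of groups defines one test fold.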
|
|
|
rng = check_random_state(self.random_state) |
|
unique_groups = rng.permutation(unique_groups) |
|
split_groups = np.array_split(unique_groups, self.n_splits) |
|
|
|
for test_group_ids in split_groups: |
|
test_mask = np.isin(groups, test_group_ids) |
|
yield np.where(test_mask)[0] |
|
|
|
else: |
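            # Without shuffling, use a greedy bin-packing strategy: weight
            # each group by its number of samples, iterate over groups from
            # largest to smallest, and assign each group to the fold that
            # currently holds the fewest samples, so that test folds end up
            # approximately balanced in sample count.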
|
|
|
n_samples_per_group = np.bincount(group_idx) |
|
|
|
|
|
indices = np.argsort(n_samples_per_group)[::-1] |
|
n_samples_per_group = n_samples_per_group[indices] |
|
|
|
|
|
n_samples_per_fold = np.zeros(self.n_splits) |
|
|
|
|
|
group_to_fold = np.zeros(len(unique_groups)) |
|
|
|
|
|
for group_index, weight in enumerate(n_samples_per_group): |
|
lightest_fold = np.argmin(n_samples_per_fold) |
|
n_samples_per_fold[lightest_fold] += weight |
|
group_to_fold[indices[group_index]] = lightest_fold |
|
|
|
indices = group_to_fold[group_idx] |
|
|
|
for f in range(self.n_splits): |
|
yield np.where(indices == f)[0] |
|
|
|
def split(self, X, y=None, groups=None): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
y : array-like of shape (n_samples,), default=None |
|
The target variable for supervised learning problems. |
|
|
|
groups : array-like of shape (n_samples,) |
|
Group labels for the samples used while splitting the dataset into |
|
train/test set. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
""" |
|
return super().split(X, y, groups) |
|
|
|
|
|
class StratifiedKFold(_BaseKFold): |
|
"""Stratified K-Fold cross-validator. |
|
|
|
Provides train/test indices to split data in train/test sets. |
|
|
|
This cross-validation object is a variation of KFold that returns |
|
stratified folds. The folds are made by preserving the percentage of |
|
samples for each class. |
|
|
|
Read more in the :ref:`User Guide <stratified_k_fold>`. |
|
|
|
For visualisation of cross-validation behaviour and |
|
comparison between common scikit-learn split methods |
|
refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` |
|
|
|
Parameters |
|
---------- |
|
n_splits : int, default=5 |
|
Number of folds. Must be at least 2. |
|
|
|
.. versionchanged:: 0.22 |
|
``n_splits`` default value changed from 3 to 5. |
|
|
|
shuffle : bool, default=False |
|
Whether to shuffle each class's samples before splitting into batches. |
|
Note that the samples within each split will not be shuffled. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
When `shuffle` is True, `random_state` affects the ordering of the |
|
indices, which controls the randomness of each fold for each class. |
|
Otherwise, leave `random_state` as `None`. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import StratifiedKFold |
|
>>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) |
|
>>> y = np.array([0, 0, 1, 1]) |
|
>>> skf = StratifiedKFold(n_splits=2) |
|
>>> skf.get_n_splits(X, y) |
|
2 |
|
>>> print(skf) |
|
StratifiedKFold(n_splits=2, random_state=None, shuffle=False) |
|
>>> for i, (train_index, test_index) in enumerate(skf.split(X, y)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}") |
|
... print(f" Test: index={test_index}") |
|
Fold 0: |
|
Train: index=[1 3] |
|
Test: index=[0 2] |
|
Fold 1: |
|
Train: index=[0 2] |
|
Test: index=[1 3] |
|
|
|
Notes |
|
----- |
|
The implementation is designed to: |
|
|
|
* Generate test sets such that all contain the same distribution of |
|
classes, or as close as possible. |
|
* Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to |
|
``y = [1, 0]`` should not change the indices generated. |
|
* Preserve order dependencies in the dataset ordering, when |
|
``shuffle=False``: all samples from class k in some test set were |
|
contiguous in y, or separated in y by samples from classes other than k. |
|
* Generate test sets where the smallest and largest differ by at most one |
|
sample. |
|
|
|
.. versionchanged:: 0.22 |
|
The previous implementation did not follow the last constraint. |
|
|
|
See Also |
|
-------- |
|
RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. |
|
""" |
|
|
|
def __init__(self, n_splits=5, *, shuffle=False, random_state=None): |
|
super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) |
|
|
|
def _make_test_folds(self, X, y=None): |
|
rng = check_random_state(self.random_state) |
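        # ``y`` may live in a non-NumPy Array API namespace; convert it to a
        # NumPy array because the stratification logic below is NumPy-only.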
|
|
|
|
|
|
|
|
|
xp, is_array_api = get_namespace(y) |
|
if is_array_api: |
|
y = _convert_to_numpy(y, xp) |
|
else: |
|
y = np.asarray(y) |
|
type_of_target_y = type_of_target(y) |
|
allowed_target_types = ("binary", "multiclass") |
|
if type_of_target_y not in allowed_target_types: |
|
raise ValueError( |
|
"Supported target types are: {}. Got {!r} instead.".format( |
|
allowed_target_types, type_of_target_y |
|
) |
|
) |
|
|
|
y = column_or_1d(y) |
|
|
|
_, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True) |
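        # ``y_inv`` encodes classes in lexicographic order; remapping it
        # through ``class_perm`` re-encodes classes by order of first
        # appearance in ``y`` so that the generated folds do not depend on
        # the particular label values.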
|
|
|
|
|
|
|
_, class_perm = np.unique(y_idx, return_inverse=True) |
|
y_encoded = class_perm[y_inv] |
|
|
|
n_classes = len(y_idx) |
|
y_counts = np.bincount(y_encoded) |
|
min_groups = np.min(y_counts) |
|
if np.all(self.n_splits > y_counts): |
|
raise ValueError( |
|
"n_splits=%d cannot be greater than the" |
|
" number of members in each class." % (self.n_splits) |
|
) |
|
if self.n_splits > min_groups: |
|
warnings.warn( |
|
"The least populated class in y has only %d" |
|
" members, which is less than n_splits=%d." |
|
% (min_groups, self.n_splits), |
|
UserWarning, |
|
) |
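        # Determine how many samples of each class should go into each test
        # fold, using round-robin allocation over the sorted encoded targets.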
|
|
|
|
|
|
|
|
|
y_order = np.sort(y_encoded) |
|
allocation = np.asarray( |
|
[ |
|
np.bincount(y_order[i :: self.n_splits], minlength=n_classes) |
|
for i in range(self.n_splits) |
|
] |
|
) |
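        # Assign fold indices class by class, in contiguous blocks, so that
        # the original data order is preserved as much as possible; when
        # ``shuffle=True`` the block of fold indices for each class is
        # shuffled before being assigned.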
|
|
|
|
|
|
|
|
|
test_folds = np.empty(len(y), dtype="i") |
|
for k in range(n_classes): |
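            # Column ``k`` of ``allocation`` stores how many class-``k``
            # samples each test fold should receive; repeating the fold ids
            # accordingly yields one fold index per class-``k`` sample.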
|
|
|
|
|
|
|
folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k]) |
|
if self.shuffle: |
|
rng.shuffle(folds_for_class) |
|
test_folds[y_encoded == k] = folds_for_class |
|
return test_folds |
|
|
|
def _iter_test_masks(self, X, y=None, groups=None): |
|
test_folds = self._make_test_folds(X, y) |
|
for i in range(self.n_splits): |
|
yield test_folds == i |
|
|
|
def split(self, X, y, groups=None): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
Note that providing ``y`` is sufficient to generate the splits and |
|
hence ``np.zeros(n_samples)`` may be used as a placeholder for |
|
``X`` instead of actual training data. |
|
|
|
y : array-like of shape (n_samples,) |
|
The target variable for supervised learning problems. |
|
Stratification is done based on the y labels. |
|
|
|
groups : object |
|
Always ignored, exists for compatibility. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
|
|
Notes |
|
----- |
|
Randomized CV splitters may return different results for each call of |
|
split. You can make the results identical by setting `random_state` |
|
to an integer. |
|
""" |
|
if groups is not None: |
|
warnings.warn( |
|
f"The groups parameter is ignored by {self.__class__.__name__}", |
|
UserWarning, |
|
) |
|
y = check_array(y, input_name="y", ensure_2d=False, dtype=None) |
|
return super().split(X, y, groups) |
|
|
|
|
|
class StratifiedGroupKFold(GroupsConsumerMixin, _BaseKFold): |
|
"""Stratified K-Fold iterator variant with non-overlapping groups. |
|
|
|
    This cross-validation object is a variation of StratifiedKFold that
    attempts to return stratified folds with non-overlapping groups. The
    folds are made by preserving the percentage of samples for each class.
|
|
|
Each group will appear exactly once in the test set across all folds (the |
|
number of distinct groups has to be at least equal to the number of folds). |
|
|
|
The difference between :class:`GroupKFold` |
|
and `StratifiedGroupKFold` is that |
|
the former attempts to create balanced folds such that the number of |
|
distinct groups is approximately the same in each fold, whereas |
|
`StratifiedGroupKFold` attempts to create folds which preserve the |
|
percentage of samples for each class as much as possible given the |
|
constraint of non-overlapping groups between splits. |
|
|
|
Read more in the :ref:`User Guide <stratified_group_k_fold>`. |
|
|
|
For visualisation of cross-validation behaviour and |
|
comparison between common scikit-learn split methods |
|
refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` |
|
|
|
Parameters |
|
---------- |
|
n_splits : int, default=5 |
|
Number of folds. Must be at least 2. |
|
|
|
shuffle : bool, default=False |
|
Whether to shuffle each class's samples before splitting into batches. |
|
Note that the samples within each split will not be shuffled. |
|
        This implementation can only shuffle groups that have approximately the
        same y distribution; no global shuffle is performed.
|
|
|
random_state : int or RandomState instance, default=None |
|
When `shuffle` is True, `random_state` affects the ordering of the |
|
indices, which controls the randomness of each fold for each class. |
|
Otherwise, leave `random_state` as `None`. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import StratifiedGroupKFold |
|
>>> X = np.ones((17, 2)) |
|
>>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) |
|
>>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8]) |
|
>>> sgkf = StratifiedGroupKFold(n_splits=3) |
|
>>> sgkf.get_n_splits(X, y) |
|
3 |
|
>>> print(sgkf) |
|
StratifiedGroupKFold(n_splits=3, random_state=None, shuffle=False) |
|
>>> for i, (train_index, test_index) in enumerate(sgkf.split(X, y, groups)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}") |
|
... print(f" group={groups[train_index]}") |
|
... print(f" Test: index={test_index}") |
|
... print(f" group={groups[test_index]}") |
|
Fold 0: |
|
Train: index=[ 0 1 2 3 7 8 9 10 11 15 16] |
|
group=[1 1 2 2 4 5 5 5 5 8 8] |
|
Test: index=[ 4 5 6 12 13 14] |
|
group=[3 3 3 6 6 7] |
|
Fold 1: |
|
Train: index=[ 4 5 6 7 8 9 10 11 12 13 14] |
|
group=[3 3 3 4 5 5 5 5 6 6 7] |
|
Test: index=[ 0 1 2 3 15 16] |
|
group=[1 1 2 2 8 8] |
|
Fold 2: |
|
Train: index=[ 0 1 2 3 4 5 6 12 13 14 15 16] |
|
group=[1 1 2 2 3 3 3 6 6 7 8 8] |
|
Test: index=[ 7 8 9 10 11] |
|
group=[4 5 5 5 5] |
|
|
|
Notes |
|
----- |
|
The implementation is designed to: |
|
|
|
* Mimic the behavior of StratifiedKFold as much as possible for trivial |
|
groups (e.g. when each group contains only one sample). |
|
* Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to |
|
``y = [1, 0]`` should not change the indices generated. |
|
* Stratify based on samples as much as possible while keeping |
|
non-overlapping groups constraint. That means that in some cases when |
|
there is a small number of groups containing a large number of samples |
|
the stratification will not be possible and the behavior will be close |
|
to GroupKFold. |
|
|
|
See also |
|
-------- |
|
StratifiedKFold: Takes class information into account to build folds which |
|
retain class distributions (for binary or multiclass classification |
|
tasks). |
|
|
|
GroupKFold: K-fold iterator variant with non-overlapping groups. |
|
""" |
|
|
|
def __init__(self, n_splits=5, shuffle=False, random_state=None): |
|
super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state) |
|
|
|
def _iter_test_indices(self, X, y, groups): |
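        # Implementation sketch: count each group's class distribution, then
        # greedily assign groups to folds, starting with the groups whose
        # class counts have the largest standard deviation. Each group goes to
        # the fold where adding it keeps the per-class proportions across
        # folds as even as possible, with ties broken in favour of the fold
        # that currently holds fewer samples.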
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
rng = check_random_state(self.random_state) |
|
y = np.asarray(y) |
|
type_of_target_y = type_of_target(y) |
|
allowed_target_types = ("binary", "multiclass") |
|
if type_of_target_y not in allowed_target_types: |
|
raise ValueError( |
|
"Supported target types are: {}. Got {!r} instead.".format( |
|
allowed_target_types, type_of_target_y |
|
) |
|
) |
|
|
|
y = column_or_1d(y) |
|
_, y_inv, y_cnt = np.unique(y, return_inverse=True, return_counts=True) |
|
if np.all(self.n_splits > y_cnt): |
|
raise ValueError( |
|
"n_splits=%d cannot be greater than the" |
|
" number of members in each class." % (self.n_splits) |
|
) |
|
n_smallest_class = np.min(y_cnt) |
|
if self.n_splits > n_smallest_class: |
|
warnings.warn( |
|
"The least populated class in y has only %d" |
|
" members, which is less than n_splits=%d." |
|
% (n_smallest_class, self.n_splits), |
|
UserWarning, |
|
) |
|
n_classes = len(y_cnt) |
|
|
|
_, groups_inv, groups_cnt = np.unique( |
|
groups, return_inverse=True, return_counts=True |
|
) |
|
y_counts_per_group = np.zeros((len(groups_cnt), n_classes)) |
|
for class_idx, group_idx in zip(y_inv, groups_inv): |
|
y_counts_per_group[group_idx, class_idx] += 1 |
|
|
|
y_counts_per_fold = np.zeros((self.n_splits, n_classes)) |
|
groups_per_fold = defaultdict(set) |
|
|
|
if self.shuffle: |
|
rng.shuffle(y_counts_per_group) |
|
|
|
|
|
|
|
sorted_groups_idx = np.argsort( |
|
-np.std(y_counts_per_group, axis=1), kind="mergesort" |
|
) |
|
|
|
for group_idx in sorted_groups_idx: |
|
group_y_counts = y_counts_per_group[group_idx] |
|
best_fold = self._find_best_fold( |
|
y_counts_per_fold=y_counts_per_fold, |
|
y_cnt=y_cnt, |
|
group_y_counts=group_y_counts, |
|
) |
|
y_counts_per_fold[best_fold] += group_y_counts |
|
groups_per_fold[best_fold].add(group_idx) |
|
|
|
for i in range(self.n_splits): |
|
test_indices = [ |
|
idx |
|
for idx, group_idx in enumerate(groups_inv) |
|
if group_idx in groups_per_fold[i] |
|
] |
|
yield test_indices |
|
|
|
def _find_best_fold(self, y_counts_per_fold, y_cnt, group_y_counts): |
|
best_fold = None |
|
min_eval = np.inf |
|
min_samples_in_fold = np.inf |
|
for i in range(self.n_splits): |
|
y_counts_per_fold[i] += group_y_counts |
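            # Tentatively add the group to fold ``i``, measure how uneven the
            # per-class proportions across folds would become, then undo the
            # tentative addition below.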
|
|
|
std_per_class = np.std(y_counts_per_fold / y_cnt.reshape(1, -1), axis=0) |
|
y_counts_per_fold[i] -= group_y_counts |
|
fold_eval = np.mean(std_per_class) |
|
samples_in_fold = np.sum(y_counts_per_fold[i]) |
|
is_current_fold_better = ( |
|
fold_eval < min_eval |
|
or np.isclose(fold_eval, min_eval) |
|
and samples_in_fold < min_samples_in_fold |
|
) |
|
if is_current_fold_better: |
|
min_eval = fold_eval |
|
min_samples_in_fold = samples_in_fold |
|
best_fold = i |
|
return best_fold |
|
|
|
|
|
class TimeSeriesSplit(_BaseKFold): |
|
"""Time Series cross-validator. |
|
|
|
Provides train/test indices to split time series data samples |
|
that are observed at fixed time intervals, in train/test sets. |
|
    In each split, test indices must be higher than in the previous split,
    and thus shuffling in the cross-validator is inappropriate.
|
|
|
This cross-validation object is a variation of :class:`KFold`. |
|
    In the kth split, it returns the first k folds as the train set and the
    (k+1)th fold as the test set.
|
|
|
Note that unlike standard cross-validation methods, successive |
|
training sets are supersets of those that come before them. |
|
|
|
Read more in the :ref:`User Guide <time_series_split>`. |
|
|
|
For visualisation of cross-validation behaviour and |
|
comparison between common scikit-learn split methods |
|
refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` |
|
|
|
.. versionadded:: 0.18 |
|
|
|
Parameters |
|
---------- |
|
n_splits : int, default=5 |
|
Number of splits. Must be at least 2. |
|
|
|
.. versionchanged:: 0.22 |
|
``n_splits`` default value changed from 3 to 5. |
|
|
|
max_train_size : int, default=None |
|
Maximum size for a single training set. |
|
|
|
test_size : int, default=None |
|
Used to limit the size of the test set. Defaults to |
|
``n_samples // (n_splits + 1)``, which is the maximum allowed value |
|
with ``gap=0``. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
gap : int, default=0 |
|
Number of samples to exclude from the end of each train set before |
|
the test set. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import TimeSeriesSplit |
|
>>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]]) |
|
>>> y = np.array([1, 2, 3, 4, 5, 6]) |
|
>>> tscv = TimeSeriesSplit() |
|
>>> print(tscv) |
|
TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None) |
|
>>> for i, (train_index, test_index) in enumerate(tscv.split(X)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}") |
|
... print(f" Test: index={test_index}") |
|
Fold 0: |
|
Train: index=[0] |
|
Test: index=[1] |
|
Fold 1: |
|
Train: index=[0 1] |
|
Test: index=[2] |
|
Fold 2: |
|
Train: index=[0 1 2] |
|
Test: index=[3] |
|
Fold 3: |
|
Train: index=[0 1 2 3] |
|
Test: index=[4] |
|
Fold 4: |
|
Train: index=[0 1 2 3 4] |
|
Test: index=[5] |
|
>>> # Fix test_size to 2 with 12 samples |
|
>>> X = np.random.randn(12, 2) |
|
>>> y = np.random.randint(0, 2, 12) |
|
>>> tscv = TimeSeriesSplit(n_splits=3, test_size=2) |
|
>>> for i, (train_index, test_index) in enumerate(tscv.split(X)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}") |
|
... print(f" Test: index={test_index}") |
|
Fold 0: |
|
Train: index=[0 1 2 3 4 5] |
|
Test: index=[6 7] |
|
Fold 1: |
|
Train: index=[0 1 2 3 4 5 6 7] |
|
Test: index=[8 9] |
|
Fold 2: |
|
Train: index=[0 1 2 3 4 5 6 7 8 9] |
|
Test: index=[10 11] |
|
>>> # Add in a 2 period gap |
|
>>> tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=2) |
|
>>> for i, (train_index, test_index) in enumerate(tscv.split(X)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}") |
|
... print(f" Test: index={test_index}") |
|
Fold 0: |
|
Train: index=[0 1 2 3] |
|
Test: index=[6 7] |
|
Fold 1: |
|
Train: index=[0 1 2 3 4 5] |
|
Test: index=[8 9] |
|
Fold 2: |
|
Train: index=[0 1 2 3 4 5 6 7] |
|
Test: index=[10 11] |
|
|
|
For a more extended example see |
|
:ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`. |
|
|
|
Notes |
|
----- |
|
The training set has size ``i * n_samples // (n_splits + 1) |
|
+ n_samples % (n_splits + 1)`` in the ``i`` th split, |
|
with a test set of size ``n_samples//(n_splits + 1)`` by default, |
|
where ``n_samples`` is the number of samples. Note that this |
|
formula is only valid when ``test_size`` and ``max_train_size`` are |
|
left to their default values. |
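
    For instance, with ``n_samples=6`` and the default ``n_splits=5`` the
    formula reproduces the training-set sizes seen in the first example
    above:

    >>> n_samples, n_splits = 6, 5
    >>> [i * n_samples // (n_splits + 1) + n_samples % (n_splits + 1)
    ...  for i in range(1, n_splits + 1)]
    [1, 2, 3, 4, 5]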
|
""" |
|
|
|
def __init__(self, n_splits=5, *, max_train_size=None, test_size=None, gap=0): |
|
super().__init__(n_splits, shuffle=False, random_state=None) |
|
self.max_train_size = max_train_size |
|
self.test_size = test_size |
|
self.gap = gap |
|
|
|
def split(self, X, y=None, groups=None): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
y : array-like of shape (n_samples,) |
|
Always ignored, exists for compatibility. |
|
|
|
groups : array-like of shape (n_samples,) |
|
Always ignored, exists for compatibility. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
""" |
|
if groups is not None: |
|
warnings.warn( |
|
f"The groups parameter is ignored by {self.__class__.__name__}", |
|
UserWarning, |
|
) |
|
return self._split(X) |
|
|
|
def _split(self, X): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
""" |
|
(X,) = indexable(X) |
|
n_samples = _num_samples(X) |
|
n_splits = self.n_splits |
|
n_folds = n_splits + 1 |
|
gap = self.gap |
|
test_size = ( |
|
self.test_size if self.test_size is not None else n_samples // n_folds |
|
) |
|
|
|
|
|
if n_folds > n_samples: |
|
raise ValueError( |
|
f"Cannot have number of folds={n_folds} greater" |
|
f" than the number of samples={n_samples}." |
|
) |
|
if n_samples - gap - (test_size * n_splits) <= 0: |
|
raise ValueError( |
|
f"Too many splits={n_splits} for number of samples" |
|
f"={n_samples} with test_size={test_size} and gap={gap}." |
|
) |
|
|
|
indices = np.arange(n_samples) |
|
test_starts = range(n_samples - n_splits * test_size, n_samples, test_size) |
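        # The last ``n_splits * test_size`` samples are carved into
        # consecutive test windows; for each window, the training set is
        # every sample that comes before it, minus the last ``gap`` samples.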
|
|
|
for test_start in test_starts: |
|
train_end = test_start - gap |
|
if self.max_train_size and self.max_train_size < train_end: |
|
yield ( |
|
indices[train_end - self.max_train_size : train_end], |
|
indices[test_start : test_start + test_size], |
|
) |
|
else: |
|
yield ( |
|
indices[:train_end], |
|
indices[test_start : test_start + test_size], |
|
) |
|
|
|
|
|
class LeaveOneGroupOut(GroupsConsumerMixin, BaseCrossValidator): |
|
"""Leave One Group Out cross-validator. |
|
|
|
Provides train/test indices to split data such that each training set is |
|
comprised of all samples except ones belonging to one specific group. |
|
Arbitrary domain specific group information is provided as an array of integers |
|
that encodes the group of each sample. |
|
|
|
For instance the groups could be the year of collection of the samples |
|
and thus allow for cross-validation against time-based splits. |
|
|
|
Read more in the :ref:`User Guide <leave_one_group_out>`. |
|
|
|
Notes |
|
----- |
|
Splits are ordered according to the index of the group left out. The first |
|
split has testing set consisting of the group whose index in `groups` is |
|
lowest, and so on. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import LeaveOneGroupOut |
|
>>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) |
|
>>> y = np.array([1, 2, 1, 2]) |
|
>>> groups = np.array([1, 1, 2, 2]) |
|
>>> logo = LeaveOneGroupOut() |
|
>>> logo.get_n_splits(X, y, groups) |
|
2 |
|
>>> logo.get_n_splits(groups=groups) # 'groups' is always required |
|
2 |
|
>>> print(logo) |
|
LeaveOneGroupOut() |
|
>>> for i, (train_index, test_index) in enumerate(logo.split(X, y, groups)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}, group={groups[train_index]}") |
|
... print(f" Test: index={test_index}, group={groups[test_index]}") |
|
Fold 0: |
|
Train: index=[2 3], group=[2 2] |
|
Test: index=[0 1], group=[1 1] |
|
Fold 1: |
|
Train: index=[0 1], group=[1 1] |
|
Test: index=[2 3], group=[2 2] |
|
|
|
See also |
|
-------- |
|
GroupKFold: K-fold iterator variant with non-overlapping groups. |
|
""" |
|
|
|
def _iter_test_masks(self, X, y, groups): |
|
if groups is None: |
|
raise ValueError("The 'groups' parameter should not be None.") |
|
|
|
groups = check_array( |
|
groups, input_name="groups", copy=True, ensure_2d=False, dtype=None |
|
) |
|
unique_groups = np.unique(groups) |
|
if len(unique_groups) <= 1: |
|
raise ValueError( |
|
"The groups parameter contains fewer than 2 unique groups " |
|
"(%s). LeaveOneGroupOut expects at least 2." % unique_groups |
|
) |
|
for i in unique_groups: |
|
yield groups == i |
|
|
|
def get_n_splits(self, X=None, y=None, groups=None): |
|
"""Returns the number of splitting iterations in the cross-validator. |
|
|
|
Parameters |
|
---------- |
|
X : object |
|
Always ignored, exists for compatibility. |
|
|
|
y : object |
|
Always ignored, exists for compatibility. |
|
|
|
groups : array-like of shape (n_samples,) |
|
Group labels for the samples used while splitting the dataset into |
|
train/test set. This 'groups' parameter must always be specified to |
|
calculate the number of splits, though the other parameters can be |
|
omitted. |
|
|
|
Returns |
|
------- |
|
n_splits : int |
|
Returns the number of splitting iterations in the cross-validator. |
|
""" |
|
if groups is None: |
|
raise ValueError("The 'groups' parameter should not be None.") |
|
groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) |
|
return len(np.unique(groups)) |
|
|
|
def split(self, X, y=None, groups=None): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
y : array-like of shape (n_samples,), default=None |
|
The target variable for supervised learning problems. |
|
|
|
groups : array-like of shape (n_samples,) |
|
Group labels for the samples used while splitting the dataset into |
|
train/test set. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
""" |
|
return super().split(X, y, groups) |
|
|
|
|
|
class LeavePGroupsOut(GroupsConsumerMixin, BaseCrossValidator): |
|
"""Leave P Group(s) Out cross-validator. |
|
|
|
Provides train/test indices to split data according to a third-party |
|
provided group. This group information can be used to encode arbitrary |
|
domain specific stratifications of the samples as integers. |
|
|
|
For instance the groups could be the year of collection of the samples |
|
and thus allow for cross-validation against time-based splits. |
|
|
|
    The difference between LeavePGroupsOut and LeaveOneGroupOut is that
    the former builds the test sets with all the samples assigned to
    ``p`` different values of the groups while the latter uses all the
    samples assigned to a single group.
|
|
|
Read more in the :ref:`User Guide <leave_p_groups_out>`. |
|
|
|
Parameters |
|
---------- |
|
n_groups : int |
|
Number of groups (``p``) to leave out in the test split. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import LeavePGroupsOut |
|
>>> X = np.array([[1, 2], [3, 4], [5, 6]]) |
|
>>> y = np.array([1, 2, 1]) |
|
>>> groups = np.array([1, 2, 3]) |
|
>>> lpgo = LeavePGroupsOut(n_groups=2) |
|
>>> lpgo.get_n_splits(X, y, groups) |
|
3 |
|
>>> lpgo.get_n_splits(groups=groups) # 'groups' is always required |
|
3 |
|
>>> print(lpgo) |
|
LeavePGroupsOut(n_groups=2) |
|
>>> for i, (train_index, test_index) in enumerate(lpgo.split(X, y, groups)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}, group={groups[train_index]}") |
|
... print(f" Test: index={test_index}, group={groups[test_index]}") |
|
Fold 0: |
|
Train: index=[2], group=[3] |
|
Test: index=[0 1], group=[1 2] |
|
Fold 1: |
|
Train: index=[1], group=[2] |
|
Test: index=[0 2], group=[1 3] |
|
Fold 2: |
|
Train: index=[0], group=[1] |
|
Test: index=[1 2], group=[2 3] |
|
|
|
See Also |
|
-------- |
|
GroupKFold : K-fold iterator variant with non-overlapping groups. |
|
""" |
|
|
|
def __init__(self, n_groups): |
|
self.n_groups = n_groups |
|
|
|
def _iter_test_masks(self, X, y, groups): |
|
if groups is None: |
|
raise ValueError("The 'groups' parameter should not be None.") |
|
groups = check_array( |
|
groups, input_name="groups", copy=True, ensure_2d=False, dtype=None |
|
) |
|
unique_groups = np.unique(groups) |
|
if self.n_groups >= len(unique_groups): |
|
raise ValueError( |
|
"The groups parameter contains fewer than (or equal to) " |
|
"n_groups (%d) numbers of unique groups (%s). LeavePGroupsOut " |
|
"expects that at least n_groups + 1 (%d) unique groups be " |
|
"present" % (self.n_groups, unique_groups, self.n_groups + 1) |
|
) |
|
combi = combinations(range(len(unique_groups)), self.n_groups) |
|
for indices in combi: |
|
test_index = np.zeros(_num_samples(X), dtype=bool) |
|
for l in unique_groups[np.array(indices)]: |
|
test_index[groups == l] = True |
|
yield test_index |
|
|
|
def get_n_splits(self, X=None, y=None, groups=None): |
|
"""Returns the number of splitting iterations in the cross-validator. |
|
|
|
Parameters |
|
---------- |
|
X : object |
|
Always ignored, exists for compatibility. |
|
|
|
y : object |
|
Always ignored, exists for compatibility. |
|
|
|
groups : array-like of shape (n_samples,) |
|
Group labels for the samples used while splitting the dataset into |
|
train/test set. This 'groups' parameter must always be specified to |
|
calculate the number of splits, though the other parameters can be |
|
omitted. |
|
|
|
Returns |
|
------- |
|
n_splits : int |
|
Returns the number of splitting iterations in the cross-validator. |
|
""" |
|
if groups is None: |
|
raise ValueError("The 'groups' parameter should not be None.") |
|
groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) |
|
return int(comb(len(np.unique(groups)), self.n_groups, exact=True)) |
|
|
|
def split(self, X, y=None, groups=None): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
y : array-like of shape (n_samples,), default=None |
|
The target variable for supervised learning problems. |
|
|
|
groups : array-like of shape (n_samples,) |
|
Group labels for the samples used while splitting the dataset into |
|
train/test set. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
""" |
|
return super().split(X, y, groups) |
|
|
|
|
|
class _RepeatedSplits(_MetadataRequester, metaclass=ABCMeta): |
|
"""Repeated splits for an arbitrary randomized CV splitter. |
|
|
|
Repeats splits for cross-validators n times with different randomization |
|
in each repetition. |
|
|
|
Parameters |
|
---------- |
|
cv : callable |
|
Cross-validator class. |
|
|
|
n_repeats : int, default=10 |
|
Number of times cross-validator needs to be repeated. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Passes `random_state` to the arbitrary repeating cross validator. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
**cvargs : additional params |
|
Constructor parameters for cv. Must not contain random_state |
|
and shuffle. |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
__metadata_request__split = {"groups": metadata_routing.UNUSED} |
|
|
|
def __init__(self, cv, *, n_repeats=10, random_state=None, **cvargs): |
|
if not isinstance(n_repeats, numbers.Integral): |
|
raise ValueError("Number of repetitions must be of Integral type.") |
|
|
|
if n_repeats <= 0: |
|
raise ValueError("Number of repetitions must be greater than 0.") |
|
|
|
if any(key in cvargs for key in ("random_state", "shuffle")): |
|
raise ValueError("cvargs must not contain random_state or shuffle.") |
|
|
|
self.cv = cv |
|
self.n_repeats = n_repeats |
|
self.random_state = random_state |
|
self.cvargs = cvargs |
|
|
|
def split(self, X, y=None, groups=None): |
|
"""Generates indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
y : array-like of shape (n_samples,) |
|
The target variable for supervised learning problems. |
|
|
|
groups : array-like of shape (n_samples,), default=None |
|
Group labels for the samples used while splitting the dataset into |
|
train/test set. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
""" |
|
n_repeats = self.n_repeats |
|
rng = check_random_state(self.random_state) |
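        # Passing the same RandomState instance to every repetition means each
        # repetition draws a fresh shuffle, so the n_repeats * n_splits splits
        # are, in general, all different.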
|
|
|
for idx in range(n_repeats): |
|
cv = self.cv(random_state=rng, shuffle=True, **self.cvargs) |
|
for train_index, test_index in cv.split(X, y, groups): |
|
yield train_index, test_index |
|
|
|
def get_n_splits(self, X=None, y=None, groups=None): |
|
"""Returns the number of splitting iterations in the cross-validator. |
|
|
|
Parameters |
|
---------- |
|
X : object |
|
Always ignored, exists for compatibility. |
|
``np.zeros(n_samples)`` may be used as a placeholder. |
|
|
|
y : object |
|
Always ignored, exists for compatibility. |
|
``np.zeros(n_samples)`` may be used as a placeholder. |
|
|
|
groups : array-like of shape (n_samples,), default=None |
|
Group labels for the samples used while splitting the dataset into |
|
train/test set. |
|
|
|
Returns |
|
------- |
|
n_splits : int |
|
Returns the number of splitting iterations in the cross-validator. |
|
""" |
|
rng = check_random_state(self.random_state) |
|
cv = self.cv(random_state=rng, shuffle=True, **self.cvargs) |
|
return cv.get_n_splits(X, y, groups) * self.n_repeats |
|
|
|
def __repr__(self): |
|
return _build_repr(self) |
|
|
|
|
|
class RepeatedKFold(_UnsupportedGroupCVMixin, _RepeatedSplits): |
|
"""Repeated K-Fold cross validator. |
|
|
|
Repeats K-Fold n times with different randomization in each repetition. |
|
|
|
Read more in the :ref:`User Guide <repeated_k_fold>`. |
|
|
|
Parameters |
|
---------- |
|
n_splits : int, default=5 |
|
Number of folds. Must be at least 2. |
|
|
|
n_repeats : int, default=10 |
|
Number of times cross-validator needs to be repeated. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Controls the randomness of each repeated cross-validation instance. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import RepeatedKFold |
|
>>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) |
|
>>> y = np.array([0, 0, 1, 1]) |
|
>>> rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124) |
|
>>> rkf.get_n_splits(X, y) |
|
4 |
|
>>> print(rkf) |
|
RepeatedKFold(n_repeats=2, n_splits=2, random_state=2652124) |
|
>>> for i, (train_index, test_index) in enumerate(rkf.split(X)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}") |
|
... print(f" Test: index={test_index}") |
|
... |
|
Fold 0: |
|
Train: index=[0 1] |
|
Test: index=[2 3] |
|
Fold 1: |
|
Train: index=[2 3] |
|
Test: index=[0 1] |
|
Fold 2: |
|
Train: index=[1 2] |
|
Test: index=[0 3] |
|
Fold 3: |
|
Train: index=[0 3] |
|
Test: index=[1 2] |
|
|
|
Notes |
|
----- |
|
Randomized CV splitters may return different results for each call of |
|
split. You can make the results identical by setting `random_state` |
|
to an integer. |
|
|
|
See Also |
|
-------- |
|
RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. |
|
""" |
|
|
|
def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): |
|
super().__init__( |
|
KFold, n_repeats=n_repeats, random_state=random_state, n_splits=n_splits |
|
) |
|
|
|
|
|
class RepeatedStratifiedKFold(_UnsupportedGroupCVMixin, _RepeatedSplits): |
|
"""Repeated Stratified K-Fold cross validator. |
|
|
|
Repeats Stratified K-Fold n times with different randomization in each |
|
repetition. |
|
|
|
Read more in the :ref:`User Guide <repeated_k_fold>`. |
|
|
|
Parameters |
|
---------- |
|
n_splits : int, default=5 |
|
Number of folds. Must be at least 2. |
|
|
|
n_repeats : int, default=10 |
|
Number of times cross-validator needs to be repeated. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Controls the generation of the random states for each repetition. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import RepeatedStratifiedKFold |
|
>>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) |
|
>>> y = np.array([0, 0, 1, 1]) |
|
>>> rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, |
|
... random_state=36851234) |
|
>>> rskf.get_n_splits(X, y) |
|
4 |
|
>>> print(rskf) |
|
RepeatedStratifiedKFold(n_repeats=2, n_splits=2, random_state=36851234) |
|
>>> for i, (train_index, test_index) in enumerate(rskf.split(X, y)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}") |
|
... print(f" Test: index={test_index}") |
|
... |
|
Fold 0: |
|
Train: index=[1 2] |
|
Test: index=[0 3] |
|
Fold 1: |
|
Train: index=[0 3] |
|
Test: index=[1 2] |
|
Fold 2: |
|
Train: index=[1 3] |
|
Test: index=[0 2] |
|
Fold 3: |
|
Train: index=[0 2] |
|
Test: index=[1 3] |
|
|
|
Notes |
|
----- |
|
Randomized CV splitters may return different results for each call of |
|
split. You can make the results identical by setting `random_state` |
|
to an integer. |
|
|
|
See Also |
|
-------- |
|
RepeatedKFold : Repeats K-Fold n times. |
|
""" |
|
|
|
def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): |
|
super().__init__( |
|
StratifiedKFold, |
|
n_repeats=n_repeats, |
|
random_state=random_state, |
|
n_splits=n_splits, |
|
) |
|
|
|
def split(self, X, y, groups=None): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
Note that providing ``y`` is sufficient to generate the splits and |
|
hence ``np.zeros(n_samples)`` may be used as a placeholder for |
|
``X`` instead of actual training data. |
|
|
|
y : array-like of shape (n_samples,) |
|
The target variable for supervised learning problems. |
|
Stratification is done based on the y labels. |
|
|
|
groups : object |
|
Always ignored, exists for compatibility. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
|
|
Notes |
|
----- |
|
Randomized CV splitters may return different results for each call of |
|
split. You can make the results identical by setting `random_state` |
|
to an integer. |
|
""" |
|
y = check_array(y, input_name="y", ensure_2d=False, dtype=None) |
|
return super().split(X, y, groups=groups) |
|
|
|
|
|
class BaseShuffleSplit(_MetadataRequester, metaclass=ABCMeta): |
|
"""Base class for *ShuffleSplit. |
|
|
|
Parameters |
|
---------- |
|
n_splits : int, default=10 |
|
Number of re-shuffling & splitting iterations. |
|
|
|
test_size : float or int, default=None |
|
If float, should be between 0.0 and 1.0 and represent the proportion |
|
of the dataset to include in the test split. If int, represents the |
|
absolute number of test samples. If None, the value is set to the |
|
complement of the train size. If ``train_size`` is also None, it will |
|
be set to 0.1. |
|
|
|
train_size : float or int, default=None |
|
If float, should be between 0.0 and 1.0 and represent the |
|
proportion of the dataset to include in the train split. If |
|
int, represents the absolute number of train samples. If None, |
|
the value is automatically set to the complement of the test size. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Controls the randomness of the training and testing indices produced. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
    # By default, *ShuffleSplit splitters do not consume ``groups`` metadata;
    # group-aware variants (e.g. GroupShuffleSplit) opt back in by inheriting
    # from ``GroupsConsumerMixin``.
    __metadata_request__split = {"groups": metadata_routing.UNUSED}
|
|
|
def __init__( |
|
self, n_splits=10, *, test_size=None, train_size=None, random_state=None |
|
): |
|
self.n_splits = n_splits |
|
self.test_size = test_size |
|
self.train_size = train_size |
|
self.random_state = random_state |
|
self._default_test_size = 0.1 |
|
|
|
def split(self, X, y=None, groups=None): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
y : array-like of shape (n_samples,) |
|
The target variable for supervised learning problems. |
|
|
|
groups : array-like of shape (n_samples,), default=None |
|
Group labels for the samples used while splitting the dataset into |
|
train/test set. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
|
|
Notes |
|
----- |
|
Randomized CV splitters may return different results for each call of |
|
split. You can make the results identical by setting `random_state` |
|
to an integer. |
|
""" |
|
X, y, groups = indexable(X, y, groups) |
|
for train, test in self._iter_indices(X, y, groups): |
|
yield train, test |
|
|
|
def _iter_indices(self, X, y=None, groups=None): |
|
"""Generate (train, test) indices""" |
|
n_samples = _num_samples(X) |
|
n_train, n_test = _validate_shuffle_split( |
|
n_samples, |
|
self.test_size, |
|
self.train_size, |
|
default_test_size=self._default_test_size, |
|
) |
|
|
|
rng = check_random_state(self.random_state) |
|
for i in range(self.n_splits): |
|
|
|
            # random partition of the sample indices for this iteration
            permutation = rng.permutation(n_samples)
|
ind_test = permutation[:n_test] |
|
ind_train = permutation[n_test : (n_test + n_train)] |
|
yield ind_train, ind_test |
|
|
|
def get_n_splits(self, X=None, y=None, groups=None): |
|
"""Returns the number of splitting iterations in the cross-validator. |
|
|
|
Parameters |
|
---------- |
|
X : object |
|
Always ignored, exists for compatibility. |
|
|
|
y : object |
|
Always ignored, exists for compatibility. |
|
|
|
groups : object |
|
Always ignored, exists for compatibility. |
|
|
|
Returns |
|
------- |
|
n_splits : int |
|
Returns the number of splitting iterations in the cross-validator. |
|
""" |
|
return self.n_splits |
|
|
|
def __repr__(self): |
|
return _build_repr(self) |
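

# Illustrative sketch (not part of the public API; the helper name and the
# example values are assumptions made for this comment): it mirrors what
# ``BaseShuffleSplit._iter_indices`` above does for a single split -- draw one
# random permutation of the sample indices and slice it into a test block
# followed by a train block.
def _demo_shuffle_split_single_iteration(n_samples=10, test_size=0.3, seed=0):
    n_train, n_test = _validate_shuffle_split(
        n_samples, test_size, None, default_test_size=0.1
    )
    rng = check_random_state(seed)
    permutation = rng.permutation(n_samples)
    ind_test = permutation[:n_test]
    ind_train = permutation[n_test : n_test + n_train]
    # With n_samples=10 and test_size=0.3 this yields 3 test and 7 train
    # indices that together cover every sample exactly once.
    return ind_train, ind_test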
|
|
|
|
|
class ShuffleSplit(_UnsupportedGroupCVMixin, BaseShuffleSplit): |
|
"""Random permutation cross-validator. |
|
|
|
Yields indices to split data into training and test sets. |
|
|
|
    Note: contrary to other cross-validation strategies, random splits
    do not guarantee that test sets across all folds will be mutually
    exclusive, and might include overlapping samples. For sizeable
    datasets, some samples are in fact very likely to appear in more
    than one test set.
|
|
|
Read more in the :ref:`User Guide <ShuffleSplit>`. |
|
|
|
For visualisation of cross-validation behaviour and |
|
comparison between common scikit-learn split methods |
|
refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` |
|
|
|
Parameters |
|
---------- |
|
n_splits : int, default=10 |
|
Number of re-shuffling & splitting iterations. |
|
|
|
test_size : float or int, default=None |
|
If float, should be between 0.0 and 1.0 and represent the proportion |
|
of the dataset to include in the test split. If int, represents the |
|
absolute number of test samples. If None, the value is set to the |
|
complement of the train size. If ``train_size`` is also None, it will |
|
be set to 0.1. |
|
|
|
train_size : float or int, default=None |
|
If float, should be between 0.0 and 1.0 and represent the |
|
proportion of the dataset to include in the train split. If |
|
int, represents the absolute number of train samples. If None, |
|
the value is automatically set to the complement of the test size. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Controls the randomness of the training and testing indices produced. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import ShuffleSplit |
|
>>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]]) |
|
>>> y = np.array([1, 2, 1, 2, 1, 2]) |
|
>>> rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0) |
|
>>> rs.get_n_splits(X) |
|
5 |
|
>>> print(rs) |
|
ShuffleSplit(n_splits=5, random_state=0, test_size=0.25, train_size=None) |
|
>>> for i, (train_index, test_index) in enumerate(rs.split(X)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}") |
|
... print(f" Test: index={test_index}") |
|
Fold 0: |
|
Train: index=[1 3 0 4] |
|
Test: index=[5 2] |
|
Fold 1: |
|
Train: index=[4 0 2 5] |
|
Test: index=[1 3] |
|
Fold 2: |
|
Train: index=[1 2 4 0] |
|
Test: index=[3 5] |
|
Fold 3: |
|
Train: index=[3 4 1 0] |
|
Test: index=[5 2] |
|
Fold 4: |
|
Train: index=[3 5 1 0] |
|
Test: index=[2 4] |
|
>>> # Specify train and test size |
|
>>> rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25, |
|
... random_state=0) |
|
>>> for i, (train_index, test_index) in enumerate(rs.split(X)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}") |
|
... print(f" Test: index={test_index}") |
|
Fold 0: |
|
Train: index=[1 3 0] |
|
Test: index=[5 2] |
|
Fold 1: |
|
Train: index=[4 0 2] |
|
Test: index=[1 3] |
|
Fold 2: |
|
Train: index=[1 2 4] |
|
Test: index=[3 5] |
|
Fold 3: |
|
Train: index=[3 4 1] |
|
Test: index=[5 2] |
|
Fold 4: |
|
Train: index=[3 5 1] |
|
Test: index=[2 4] |
|
""" |
|
|
|
def __init__( |
|
self, n_splits=10, *, test_size=None, train_size=None, random_state=None |
|
): |
|
super().__init__( |
|
n_splits=n_splits, |
|
test_size=test_size, |
|
train_size=train_size, |
|
random_state=random_state, |
|
) |
|
self._default_test_size = 0.1 |
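

# Illustrative sketch (not part of the public API; the helper name and values
# are assumptions for this example): unlike KFold, the test sets of different
# ShuffleSplit iterations are drawn independently and may share samples, as
# noted in the docstring above.
def _demo_shuffle_split_test_sets_may_overlap():
    X = np.arange(20).reshape(10, 2)
    splitter = ShuffleSplit(n_splits=4, test_size=0.3, random_state=0)
    test_sets = [set(test) for _, test in splitter.split(X)]
    # 4 splits x 3 test samples = 12 test slots but only 10 distinct samples,
    # so at least one sample necessarily appears in more than one test set.
    n_distinct = len(set().union(*test_sets))
    n_total = sum(len(s) for s in test_sets)
    assert n_distinct < n_total
    return n_distinct, n_total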
|
|
|
|
|
class GroupShuffleSplit(GroupsConsumerMixin, BaseShuffleSplit): |
|
"""Shuffle-Group(s)-Out cross-validation iterator. |
|
|
|
Provides randomized train/test indices to split data according to a |
|
third-party provided group. This group information can be used to encode |
|
arbitrary domain specific stratifications of the samples as integers. |
|
|
|
For instance the groups could be the year of collection of the samples |
|
and thus allow for cross-validation against time-based splits. |
|
|
|
The difference between :class:`LeavePGroupsOut` and ``GroupShuffleSplit`` is that |
|
the former generates splits using all subsets of size ``p`` unique groups, |
|
whereas ``GroupShuffleSplit`` generates a user-determined number of random |
|
test splits, each with a user-determined fraction of unique groups. |
|
|
|
For example, a less computationally intensive alternative to |
|
``LeavePGroupsOut(p=10)`` would be |
|
``GroupShuffleSplit(test_size=10, n_splits=100)``. |
|
|
|
    Contrary to other cross-validation strategies, the random splits
    do not guarantee that test sets across all folds will be mutually
    exclusive, and might include overlapping samples: the same group
    (and hence the same samples) may well appear in the test set of
    more than one split, especially when many splits are drawn.
|
|
|
Note: The parameters ``test_size`` and ``train_size`` refer to groups, and |
|
not to samples as in :class:`ShuffleSplit`. |
|
|
|
Read more in the :ref:`User Guide <group_shuffle_split>`. |
|
|
|
For visualisation of cross-validation behaviour and |
|
comparison between common scikit-learn split methods |
|
refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` |
|
|
|
Parameters |
|
---------- |
|
n_splits : int, default=5 |
|
Number of re-shuffling & splitting iterations. |
|
|
|
    test_size : float or int, default=None
|
If float, should be between 0.0 and 1.0 and represent the proportion |
|
of groups to include in the test split (rounded up). If int, |
|
represents the absolute number of test groups. If None, the value is |
|
set to the complement of the train size. If ``train_size`` is also None, |
|
it will be set to 0.2. |
|
|
|
train_size : float or int, default=None |
|
If float, should be between 0.0 and 1.0 and represent the |
|
proportion of the groups to include in the train split. If |
|
int, represents the absolute number of train groups. If None, |
|
the value is automatically set to the complement of the test size. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Controls the randomness of the training and testing indices produced. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import GroupShuffleSplit |
|
>>> X = np.ones(shape=(8, 2)) |
|
>>> y = np.ones(shape=(8, 1)) |
|
>>> groups = np.array([1, 1, 2, 2, 2, 3, 3, 3]) |
|
>>> print(groups.shape) |
|
(8,) |
|
>>> gss = GroupShuffleSplit(n_splits=2, train_size=.7, random_state=42) |
|
>>> gss.get_n_splits() |
|
2 |
|
>>> print(gss) |
|
GroupShuffleSplit(n_splits=2, random_state=42, test_size=None, train_size=0.7) |
|
>>> for i, (train_index, test_index) in enumerate(gss.split(X, y, groups)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}, group={groups[train_index]}") |
|
... print(f" Test: index={test_index}, group={groups[test_index]}") |
|
Fold 0: |
|
Train: index=[2 3 4 5 6 7], group=[2 2 2 3 3 3] |
|
Test: index=[0 1], group=[1 1] |
|
Fold 1: |
|
Train: index=[0 1 5 6 7], group=[1 1 3 3 3] |
|
Test: index=[2 3 4], group=[2 2 2] |
|
|
|
See Also |
|
-------- |
|
ShuffleSplit : Shuffles samples to create independent test/train sets. |
|
|
|
LeavePGroupsOut : Train set leaves out all possible subsets of `p` groups. |
|
""" |
|
|
|
def __init__( |
|
self, n_splits=5, *, test_size=None, train_size=None, random_state=None |
|
): |
|
super().__init__( |
|
n_splits=n_splits, |
|
test_size=test_size, |
|
train_size=train_size, |
|
random_state=random_state, |
|
) |
|
self._default_test_size = 0.2 |
|
|
|
def _iter_indices(self, X, y, groups): |
|
if groups is None: |
|
raise ValueError("The 'groups' parameter should not be None.") |
|
groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None) |
|
classes, group_indices = np.unique(groups, return_inverse=True) |
|
for group_train, group_test in super()._iter_indices(X=classes): |
|
|
|
|
|
|
|
            # ``group_train``/``group_test`` index into the unique groups;
            # map them back to per-sample indices.
            train = np.flatnonzero(np.isin(group_indices, group_train))
|
test = np.flatnonzero(np.isin(group_indices, group_test)) |
|
|
|
yield train, test |
|
|
|
def split(self, X, y=None, groups=None): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
y : array-like of shape (n_samples,), default=None |
|
The target variable for supervised learning problems. |
|
|
|
groups : array-like of shape (n_samples,) |
|
Group labels for the samples used while splitting the dataset into |
|
train/test set. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
|
|
Notes |
|
----- |
|
Randomized CV splitters may return different results for each call of |
|
split. You can make the results identical by setting `random_state` |
|
to an integer. |
|
""" |
|
return super().split(X, y, groups) |
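

# Illustrative sketch (not part of the public API; the helper name, group
# labels and sizes are assumptions for this example): with GroupShuffleSplit,
# ``test_size`` counts *groups* rather than samples, so whole groups end up
# on one side of each split.
def _demo_group_shuffle_split_holds_out_whole_groups():
    X = np.zeros((8, 2))
    groups = np.array([1, 1, 2, 2, 2, 3, 3, 3])
    splitter = GroupShuffleSplit(n_splits=3, test_size=1, random_state=0)
    for train, test in splitter.split(X, groups=groups):
        # Exactly one of the three groups is held out, and none of its
        # samples leak into the corresponding training set.
        assert len(np.unique(groups[test])) == 1
        assert not set(groups[test]) & set(groups[train])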
|
|
|
|
|
class StratifiedShuffleSplit(BaseShuffleSplit): |
|
"""Stratified ShuffleSplit cross-validator. |
|
|
|
Provides train/test indices to split data in train/test sets. |
|
|
|
This cross-validation object is a merge of :class:`StratifiedKFold` and |
|
:class:`ShuffleSplit`, which returns stratified randomized folds. The folds |
|
are made by preserving the percentage of samples for each class. |
|
|
|
    Note: like the :class:`ShuffleSplit` strategy, stratified random splits
    do not guarantee that test sets across all folds will be mutually
    exclusive, and might include overlapping samples. For sizeable
    datasets, some samples are in fact very likely to appear in more
    than one test set.
|
|
|
Read more in the :ref:`User Guide <stratified_shuffle_split>`. |
|
|
|
For visualisation of cross-validation behaviour and |
|
comparison between common scikit-learn split methods |
|
refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` |
|
|
|
Parameters |
|
---------- |
|
n_splits : int, default=10 |
|
Number of re-shuffling & splitting iterations. |
|
|
|
test_size : float or int, default=None |
|
If float, should be between 0.0 and 1.0 and represent the proportion |
|
of the dataset to include in the test split. If int, represents the |
|
absolute number of test samples. If None, the value is set to the |
|
complement of the train size. If ``train_size`` is also None, it will |
|
be set to 0.1. |
|
|
|
train_size : float or int, default=None |
|
If float, should be between 0.0 and 1.0 and represent the |
|
proportion of the dataset to include in the train split. If |
|
int, represents the absolute number of train samples. If None, |
|
the value is automatically set to the complement of the test size. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Controls the randomness of the training and testing indices produced. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import StratifiedShuffleSplit |
|
>>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]]) |
|
>>> y = np.array([0, 0, 0, 1, 1, 1]) |
|
>>> sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0) |
|
>>> sss.get_n_splits(X, y) |
|
5 |
|
>>> print(sss) |
|
StratifiedShuffleSplit(n_splits=5, random_state=0, ...) |
|
>>> for i, (train_index, test_index) in enumerate(sss.split(X, y)): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}") |
|
... print(f" Test: index={test_index}") |
|
Fold 0: |
|
Train: index=[5 2 3] |
|
Test: index=[4 1 0] |
|
Fold 1: |
|
Train: index=[5 1 4] |
|
Test: index=[0 2 3] |
|
Fold 2: |
|
Train: index=[5 0 2] |
|
Test: index=[4 3 1] |
|
Fold 3: |
|
Train: index=[4 1 0] |
|
Test: index=[2 3 5] |
|
Fold 4: |
|
Train: index=[0 5 1] |
|
Test: index=[3 4 2] |
|
""" |
|
|
|
def __init__( |
|
self, n_splits=10, *, test_size=None, train_size=None, random_state=None |
|
): |
|
super().__init__( |
|
n_splits=n_splits, |
|
test_size=test_size, |
|
train_size=train_size, |
|
random_state=random_state, |
|
) |
|
self._default_test_size = 0.1 |
|
|
|
def _iter_indices(self, X, y, groups=None): |
|
n_samples = _num_samples(X) |
|
y = check_array(y, input_name="y", ensure_2d=False, dtype=None) |
|
n_train, n_test = _validate_shuffle_split( |
|
n_samples, |
|
self.test_size, |
|
self.train_size, |
|
default_test_size=self._default_test_size, |
|
) |
|
|
|
|
|
|
|
|
|
        # ``y`` may live in another array namespace; convert it to numpy since
        # the stratification logic below relies on numpy-only operations.
        xp, _ = get_namespace(y)
|
y = _convert_to_numpy(y, xp=xp) |
|
|
|
if y.ndim == 2: |
|
|
|
|
|
            # For multi-label targets, map each distinct row of y to a string
            # so that ``np.unique`` below treats it as a single class label.
            y = np.array([" ".join(row.astype("str")) for row in y])
|
|
|
classes, y_indices = np.unique(y, return_inverse=True) |
|
n_classes = classes.shape[0] |
|
|
|
class_counts = np.bincount(y_indices) |
|
if np.min(class_counts) < 2: |
|
raise ValueError( |
|
"The least populated class in y has only 1" |
|
" member, which is too few. The minimum" |
|
" number of groups for any class cannot" |
|
" be less than 2." |
|
) |
|
|
|
if n_train < n_classes: |
|
raise ValueError( |
|
"The train_size = %d should be greater or " |
|
"equal to the number of classes = %d" % (n_train, n_classes) |
|
) |
|
if n_test < n_classes: |
|
raise ValueError( |
|
"The test_size = %d should be greater or " |
|
"equal to the number of classes = %d" % (n_test, n_classes) |
|
) |
|
|
|
|
|
|
|
        # Find the sorted list of sample indices for each class
        # (``np.unique`` above already sorted, so this stays O(n log n)).
        class_indices = np.split(
|
np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1] |
|
) |
|
|
|
rng = check_random_state(self.random_state) |
|
|
|
for _ in range(self.n_splits): |
|
|
|
|
|
            # If there are ties in the class counts, break them anew in each
            # iteration by passing the RNG to ``_approximate_mode``.
            n_i = _approximate_mode(class_counts, n_train, rng)
|
class_counts_remaining = class_counts - n_i |
|
t_i = _approximate_mode(class_counts_remaining, n_test, rng) |
|
|
|
train = [] |
|
test = [] |
|
|
|
for i in range(n_classes): |
|
permutation = rng.permutation(class_counts[i]) |
|
perm_indices_class_i = class_indices[i].take(permutation, mode="clip") |
|
|
|
train.extend(perm_indices_class_i[: n_i[i]]) |
|
test.extend(perm_indices_class_i[n_i[i] : n_i[i] + t_i[i]]) |
|
|
|
train = rng.permutation(train) |
|
test = rng.permutation(test) |
|
|
|
yield train, test |
|
|
|
def split(self, X, y, groups=None): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : array-like of shape (n_samples, n_features) |
|
Training data, where `n_samples` is the number of samples |
|
and `n_features` is the number of features. |
|
|
|
Note that providing ``y`` is sufficient to generate the splits and |
|
hence ``np.zeros(n_samples)`` may be used as a placeholder for |
|
``X`` instead of actual training data. |
|
|
|
y : array-like of shape (n_samples,) or (n_samples, n_labels) |
|
The target variable for supervised learning problems. |
|
Stratification is done based on the y labels. |
|
|
|
groups : object |
|
Always ignored, exists for compatibility. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
|
|
Notes |
|
----- |
|
Randomized CV splitters may return different results for each call of |
|
split. You can make the results identical by setting `random_state` |
|
to an integer. |
|
""" |
|
if groups is not None: |
|
warnings.warn( |
|
f"The groups parameter is ignored by {self.__class__.__name__}", |
|
UserWarning, |
|
) |
|
y = check_array(y, input_name="y", ensure_2d=False, dtype=None) |
|
return super().split(X, y, groups) |
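

# Illustrative sketch (not part of the public API; the helper name and the
# toy data are assumptions for this example): every StratifiedShuffleSplit
# test set keeps (approximately) the class proportions of ``y``.
def _demo_stratified_shuffle_split_keeps_proportions():
    X = np.zeros((12, 2))
    y = np.array([0] * 8 + [1] * 4)  # classes in a 2:1 ratio
    splitter = StratifiedShuffleSplit(n_splits=3, test_size=6, random_state=0)
    for _, test in splitter.split(X, y):
        # Each test set of 6 samples holds 4 samples of class 0 and 2 of
        # class 1, matching the 2:1 ratio of y exactly in this case.
        assert np.bincount(y[test]).tolist() == [4, 2]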
|
|
|
|
|
def _validate_shuffle_split(n_samples, test_size, train_size, default_test_size=None): |
|
""" |
|
Validation helper to check if the train/test sizes are meaningful w.r.t. the |
|
size of the data (n_samples). |
|
""" |
|
if test_size is None and train_size is None: |
|
test_size = default_test_size |
|
|
|
test_size_type = np.asarray(test_size).dtype.kind |
|
train_size_type = np.asarray(train_size).dtype.kind |
|
|
|
if ( |
|
test_size_type == "i" |
|
and (test_size >= n_samples or test_size <= 0) |
|
or test_size_type == "f" |
|
and (test_size <= 0 or test_size >= 1) |
|
): |
|
raise ValueError( |
|
"test_size={0} should be either positive and smaller" |
|
" than the number of samples {1} or a float in the " |
|
"(0, 1) range".format(test_size, n_samples) |
|
) |
|
|
|
if ( |
|
train_size_type == "i" |
|
and (train_size >= n_samples or train_size <= 0) |
|
or train_size_type == "f" |
|
and (train_size <= 0 or train_size >= 1) |
|
): |
|
raise ValueError( |
|
"train_size={0} should be either positive and smaller" |
|
" than the number of samples {1} or a float in the " |
|
"(0, 1) range".format(train_size, n_samples) |
|
) |
|
|
|
if train_size is not None and train_size_type not in ("i", "f"): |
|
raise ValueError("Invalid value for train_size: {}".format(train_size)) |
|
if test_size is not None and test_size_type not in ("i", "f"): |
|
raise ValueError("Invalid value for test_size: {}".format(test_size)) |
|
|
|
if train_size_type == "f" and test_size_type == "f" and train_size + test_size > 1: |
|
raise ValueError( |
|
"The sum of test_size and train_size = {}, should be in the (0, 1)" |
|
" range. Reduce test_size and/or train_size.".format(train_size + test_size) |
|
) |
|
|
|
if test_size_type == "f": |
|
n_test = ceil(test_size * n_samples) |
|
elif test_size_type == "i": |
|
n_test = float(test_size) |
|
|
|
if train_size_type == "f": |
|
n_train = floor(train_size * n_samples) |
|
elif train_size_type == "i": |
|
n_train = float(train_size) |
|
|
|
if train_size is None: |
|
n_train = n_samples - n_test |
|
elif test_size is None: |
|
n_test = n_samples - n_train |
|
|
|
if n_train + n_test > n_samples: |
|
raise ValueError( |
|
"The sum of train_size and test_size = %d, " |
|
"should be smaller than the number of " |
|
"samples %d. Reduce test_size and/or " |
|
"train_size." % (n_train + n_test, n_samples) |
|
) |
|
|
|
n_train, n_test = int(n_train), int(n_test) |
|
|
|
if n_train == 0: |
|
raise ValueError( |
|
"With n_samples={}, test_size={} and train_size={}, the " |
|
"resulting train set will be empty. Adjust any of the " |
|
"aforementioned parameters.".format(n_samples, test_size, train_size) |
|
) |
|
|
|
return n_train, n_test |
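

# Illustrative sketch (not part of the public API; the helper name and the
# numbers are assumptions for this example): worked examples of how
# ``_validate_shuffle_split`` above resolves the requested sizes.
def _demo_validate_shuffle_split_resolution():
    # A float test_size is rounded up; train_size=None takes the complement.
    assert _validate_shuffle_split(10, 0.25, None) == (7, 3)
    # Integer sizes are taken as absolute sample counts.
    assert _validate_shuffle_split(10, 3, 5) == (5, 3)
    # With both sizes None, the caller-provided default test size is used.
    assert _validate_shuffle_split(10, None, None, default_test_size=0.1) == (9, 1)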
|
|
|
|
|
class PredefinedSplit(BaseCrossValidator): |
|
"""Predefined split cross-validator. |
|
|
|
Provides train/test indices to split data into train/test sets using a |
|
predefined scheme specified by the user with the ``test_fold`` parameter. |
|
|
|
Read more in the :ref:`User Guide <predefined_split>`. |
|
|
|
.. versionadded:: 0.16 |
|
|
|
Parameters |
|
---------- |
|
test_fold : array-like of shape (n_samples,) |
|
The entry ``test_fold[i]`` represents the index of the test set that |
|
sample ``i`` belongs to. It is possible to exclude sample ``i`` from |
|
any test set (i.e. include sample ``i`` in every training set) by |
|
setting ``test_fold[i]`` equal to -1. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import PredefinedSplit |
|
>>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) |
|
>>> y = np.array([0, 0, 1, 1]) |
|
>>> test_fold = [0, 1, -1, 1] |
|
>>> ps = PredefinedSplit(test_fold) |
|
>>> ps.get_n_splits() |
|
2 |
|
>>> print(ps) |
|
PredefinedSplit(test_fold=array([ 0, 1, -1, 1])) |
|
>>> for i, (train_index, test_index) in enumerate(ps.split()): |
|
... print(f"Fold {i}:") |
|
... print(f" Train: index={train_index}") |
|
... print(f" Test: index={test_index}") |
|
Fold 0: |
|
Train: index=[1 2 3] |
|
Test: index=[0] |
|
Fold 1: |
|
Train: index=[0 2] |
|
Test: index=[1 3] |
|
""" |
|
|
|
def __init__(self, test_fold): |
|
self.test_fold = np.array(test_fold, dtype=int) |
|
self.test_fold = column_or_1d(self.test_fold) |
|
self.unique_folds = np.unique(self.test_fold) |
|
self.unique_folds = self.unique_folds[self.unique_folds != -1] |
|
|
|
def split(self, X=None, y=None, groups=None): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : object |
|
Always ignored, exists for compatibility. |
|
|
|
y : object |
|
Always ignored, exists for compatibility. |
|
|
|
groups : object |
|
Always ignored, exists for compatibility. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
""" |
|
if groups is not None: |
|
warnings.warn( |
|
f"The groups parameter is ignored by {self.__class__.__name__}", |
|
UserWarning, |
|
) |
|
return self._split() |
|
|
|
def _split(self): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
""" |
|
ind = np.arange(len(self.test_fold)) |
|
for test_index in self._iter_test_masks(): |
|
train_index = ind[np.logical_not(test_index)] |
|
test_index = ind[test_index] |
|
yield train_index, test_index |
|
|
|
def _iter_test_masks(self): |
|
"""Generates boolean masks corresponding to test sets.""" |
|
for f in self.unique_folds: |
|
test_index = np.where(self.test_fold == f)[0] |
|
test_mask = np.zeros(len(self.test_fold), dtype=bool) |
|
test_mask[test_index] = True |
|
yield test_mask |
|
|
|
def get_n_splits(self, X=None, y=None, groups=None): |
|
"""Returns the number of splitting iterations in the cross-validator. |
|
|
|
Parameters |
|
---------- |
|
X : object |
|
Always ignored, exists for compatibility. |
|
|
|
y : object |
|
Always ignored, exists for compatibility. |
|
|
|
groups : object |
|
Always ignored, exists for compatibility. |
|
|
|
Returns |
|
------- |
|
n_splits : int |
|
Returns the number of splitting iterations in the cross-validator. |
|
""" |
|
return len(self.unique_folds) |
|
|
|
|
|
class _CVIterableWrapper(BaseCrossValidator): |
|
"""Wrapper class for old style cv objects and iterables.""" |
|
|
|
def __init__(self, cv): |
|
self.cv = list(cv) |
|
|
|
def get_n_splits(self, X=None, y=None, groups=None): |
|
"""Returns the number of splitting iterations in the cross-validator. |
|
|
|
Parameters |
|
---------- |
|
X : object |
|
Always ignored, exists for compatibility. |
|
|
|
y : object |
|
Always ignored, exists for compatibility. |
|
|
|
groups : object |
|
Always ignored, exists for compatibility. |
|
|
|
Returns |
|
------- |
|
n_splits : int |
|
Returns the number of splitting iterations in the cross-validator. |
|
""" |
|
return len(self.cv) |
|
|
|
def split(self, X=None, y=None, groups=None): |
|
"""Generate indices to split data into training and test set. |
|
|
|
Parameters |
|
---------- |
|
X : object |
|
Always ignored, exists for compatibility. |
|
|
|
y : object |
|
Always ignored, exists for compatibility. |
|
|
|
groups : object |
|
Always ignored, exists for compatibility. |
|
|
|
Yields |
|
------ |
|
train : ndarray |
|
The training set indices for that split. |
|
|
|
test : ndarray |
|
The testing set indices for that split. |
|
""" |
|
for train, test in self.cv: |
|
yield train, test |
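

# Illustrative sketch (not part of the public API; the helper name and folds
# are assumptions for this example): any iterable of (train, test) index
# pairs can be wrapped so that it exposes the usual ``split`` and
# ``get_n_splits`` interface.
def _demo_cv_iterable_wrapper():
    folds = [([0, 1], [2, 3]), ([2, 3], [0, 1])]
    wrapped = _CVIterableWrapper(folds)
    assert wrapped.get_n_splits() == 2
    # ``split`` simply replays the stored folds; X, y and groups are ignored.
    assert list(wrapped.split()) == folds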
|
|
|
|
|
def check_cv(cv=5, y=None, *, classifier=False): |
|
"""Input checker utility for building a cross-validator. |
|
|
|
Parameters |
|
---------- |
|
cv : int, cross-validation generator, iterable or None, default=5 |
|
Determines the cross-validation splitting strategy. |
|
Possible inputs for cv are: |
|
- None, to use the default 5-fold cross validation, |
|
- integer, to specify the number of folds. |
|
- :term:`CV splitter`, |
|
- An iterable that generates (train, test) splits as arrays of indices. |
|
|
|
For integer/None inputs, if classifier is True and ``y`` is either |
|
binary or multiclass, :class:`StratifiedKFold` is used. In all other |
|
cases, :class:`KFold` is used. |
|
|
|
Refer :ref:`User Guide <cross_validation>` for the various |
|
cross-validation strategies that can be used here. |
|
|
|
.. versionchanged:: 0.22 |
|
``cv`` default value changed from 3-fold to 5-fold. |
|
|
|
y : array-like, default=None |
|
The target variable for supervised learning problems. |
|
|
|
classifier : bool, default=False |
|
Whether the task is a classification task, in which case |
|
stratified KFold will be used. |
|
|
|
Returns |
|
------- |
|
checked_cv : a cross-validator instance. |
|
The return value is a cross-validator which generates the train/test |
|
splits via the ``split`` method. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.model_selection import check_cv |
|
>>> check_cv(cv=5, y=None, classifier=False) |
|
KFold(...) |
|
>>> check_cv(cv=5, y=[1, 1, 0, 0, 0, 0], classifier=True) |
|
StratifiedKFold(...) |
|
""" |
|
cv = 5 if cv is None else cv |
|
if isinstance(cv, numbers.Integral): |
|
if ( |
|
classifier |
|
and (y is not None) |
|
and (type_of_target(y, input_name="y") in ("binary", "multiclass")) |
|
): |
|
return StratifiedKFold(cv) |
|
else: |
|
return KFold(cv) |
|
|
|
if not hasattr(cv, "split") or isinstance(cv, str): |
|
if not isinstance(cv, Iterable) or isinstance(cv, str): |
|
raise ValueError( |
|
"Expected cv as an integer, cross-validation " |
|
"object (from sklearn.model_selection) " |
|
"or an iterable. Got %s." % cv |
|
) |
|
return _CVIterableWrapper(cv) |
|
|
|
return cv |
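

# Illustrative sketch (not part of the public API; the helper name and toy
# targets are assumptions for this example): how ``check_cv`` dispatches on
# its input.
def _demo_check_cv_dispatch():
    # An integer together with a classification target gives StratifiedKFold,
    assert isinstance(check_cv(3, y=[0, 1, 0, 1], classifier=True), StratifiedKFold)
    # while without stratification it falls back to plain KFold.
    assert isinstance(check_cv(3, y=[0.1, 0.2, 0.3, 0.4], classifier=False), KFold)
    # An iterable of (train, test) pairs is wrapped to expose ``split``.
    folds = [([0, 1], [2, 3]), ([2, 3], [0, 1])]
    assert isinstance(check_cv(folds), _CVIterableWrapper)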
|
|
|
|
|
@validate_params( |
|
{ |
|
"test_size": [ |
|
Interval(RealNotInt, 0, 1, closed="neither"), |
|
Interval(numbers.Integral, 1, None, closed="left"), |
|
None, |
|
], |
|
"train_size": [ |
|
Interval(RealNotInt, 0, 1, closed="neither"), |
|
Interval(numbers.Integral, 1, None, closed="left"), |
|
None, |
|
], |
|
"random_state": ["random_state"], |
|
"shuffle": ["boolean"], |
|
"stratify": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def train_test_split( |
|
*arrays, |
|
test_size=None, |
|
train_size=None, |
|
random_state=None, |
|
shuffle=True, |
|
stratify=None, |
|
): |
|
"""Split arrays or matrices into random train and test subsets. |
|
|
|
    Quick utility that wraps input validation,
    ``next(ShuffleSplit().split(X, y))``, and application of the resulting
    indices to the input data, turning the splitting (and optional
    subsampling) of data into a one-liner.
|
|
|
Read more in the :ref:`User Guide <cross_validation>`. |
|
|
|
Parameters |
|
---------- |
|
*arrays : sequence of indexables with same length / shape[0] |
|
Allowed inputs are lists, numpy arrays, scipy-sparse |
|
matrices or pandas dataframes. |
|
|
|
test_size : float or int, default=None |
|
If float, should be between 0.0 and 1.0 and represent the proportion |
|
of the dataset to include in the test split. If int, represents the |
|
absolute number of test samples. If None, the value is set to the |
|
complement of the train size. If ``train_size`` is also None, it will |
|
be set to 0.25. |
|
|
|
train_size : float or int, default=None |
|
If float, should be between 0.0 and 1.0 and represent the |
|
proportion of the dataset to include in the train split. If |
|
int, represents the absolute number of train samples. If None, |
|
the value is automatically set to the complement of the test size. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Controls the shuffling applied to the data before applying the split. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
shuffle : bool, default=True |
|
Whether or not to shuffle the data before splitting. If shuffle=False |
|
then stratify must be None. |
|
|
|
stratify : array-like, default=None |
|
If not None, data is split in a stratified fashion, using this as |
|
the class labels. |
|
Read more in the :ref:`User Guide <stratification>`. |
|
|
|
Returns |
|
------- |
|
splitting : list, length=2 * len(arrays) |
|
List containing train-test split of inputs. |
|
|
|
.. versionadded:: 0.16 |
|
If the input is sparse, the output will be a |
|
``scipy.sparse.csr_matrix``. Else, output type is the same as the |
|
input type. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.model_selection import train_test_split |
|
>>> X, y = np.arange(10).reshape((5, 2)), range(5) |
|
>>> X |
|
array([[0, 1], |
|
[2, 3], |
|
[4, 5], |
|
[6, 7], |
|
[8, 9]]) |
|
>>> list(y) |
|
[0, 1, 2, 3, 4] |
|
|
|
>>> X_train, X_test, y_train, y_test = train_test_split( |
|
... X, y, test_size=0.33, random_state=42) |
|
... |
|
>>> X_train |
|
array([[4, 5], |
|
[0, 1], |
|
[6, 7]]) |
|
>>> y_train |
|
[2, 0, 3] |
|
>>> X_test |
|
array([[2, 3], |
|
[8, 9]]) |
|
>>> y_test |
|
[1, 4] |
|
|
|
>>> train_test_split(y, shuffle=False) |
|
[[0, 1, 2], [3, 4]] |
|
""" |
|
n_arrays = len(arrays) |
|
if n_arrays == 0: |
|
raise ValueError("At least one array required as input") |
|
|
|
arrays = indexable(*arrays) |
|
|
|
n_samples = _num_samples(arrays[0]) |
|
n_train, n_test = _validate_shuffle_split( |
|
n_samples, test_size, train_size, default_test_size=0.25 |
|
) |
|
|
|
if shuffle is False: |
|
if stratify is not None: |
|
raise ValueError( |
|
"Stratified train/test split is not implemented for shuffle=False" |
|
) |
|
|
|
train = np.arange(n_train) |
|
test = np.arange(n_train, n_train + n_test) |
|
|
|
else: |
|
if stratify is not None: |
|
CVClass = StratifiedShuffleSplit |
|
else: |
|
CVClass = ShuffleSplit |
|
|
|
cv = CVClass(test_size=n_test, train_size=n_train, random_state=random_state) |
|
|
|
train, test = next(cv.split(X=arrays[0], y=stratify)) |
|
|
|
train, test = ensure_common_namespace_device(arrays[0], train, test) |
|
|
|
return list( |
|
chain.from_iterable( |
|
(_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays |
|
) |
|
) |
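

# Illustrative sketch (not part of the public API; the helper name and arrays
# are assumptions for this example): as the docstring above states,
# ``train_test_split`` is essentially input validation plus
# ``next(ShuffleSplit().split(X, y))`` followed by indexing.
def _demo_train_test_split_matches_shuffle_split():
    X = np.arange(20).reshape(10, 2)
    X_train, X_test = train_test_split(X, test_size=0.3, random_state=0)
    cv = ShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
    train_idx, test_idx = next(cv.split(X))
    # Both code paths draw the same permutation for the same random_state.
    assert np.array_equal(X_train, X[train_idx])
    assert np.array_equal(X_test, X[test_idx])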
|
|
|
|
|
|
|
|
|
|
|
# Tell test collectors that, despite its name, ``train_test_split`` is not
# itself a test.
setattr(train_test_split, "__test__", False)
|
|
|
|
|
def _pprint(params, offset=0, printer=repr): |
|
"""Pretty print the dictionary 'params' |
|
|
|
Parameters |
|
---------- |
|
params : dict |
|
The dictionary to pretty print |
|
|
|
offset : int, default=0 |
|
The offset in characters to add at the begin of each line. |
|
|
|
printer : callable, default=repr |
|
The function to convert entries to strings, typically |
|
the builtin str or repr |
|
|
|
""" |
|
|
|
options = np.get_printoptions() |
|
np.set_printoptions(precision=5, threshold=64, edgeitems=2) |
|
params_list = list() |
|
this_line_length = offset |
|
line_sep = ",\n" + (1 + offset // 2) * " " |
|
for i, (k, v) in enumerate(sorted(params.items())): |
|
if isinstance(v, float): |
|
|
|
|
|
|
|
this_repr = "%s=%s" % (k, str(v)) |
|
else: |
|
|
|
this_repr = "%s=%s" % (k, printer(v)) |
|
if len(this_repr) > 500: |
|
this_repr = this_repr[:300] + "..." + this_repr[-100:] |
|
if i > 0: |
|
if this_line_length + len(this_repr) >= 75 or "\n" in this_repr: |
|
params_list.append(line_sep) |
|
this_line_length = len(line_sep) |
|
else: |
|
params_list.append(", ") |
|
this_line_length += 2 |
|
params_list.append(this_repr) |
|
this_line_length += len(this_repr) |
|
|
|
np.set_printoptions(**options) |
|
lines = "".join(params_list) |
|
|
|
lines = "\n".join(l.rstrip(" ") for l in lines.split("\n")) |
|
return lines |
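

# Illustrative sketch (not part of the public API; the helper name and the
# parameters are assumptions for this example): ``_pprint`` renders a
# parameter dict as the comma-separated body of a repr, wrapping long lines
# at the given offset.
def _demo_pprint_params():
    params = {"n_splits": 5, "shuffle": False, "random_state": None}
    body = _pprint(params, offset=len("KFold"))
    # Keys are emitted in sorted order, here
    # "n_splits=5, random_state=None, shuffle=False".
    return "KFold(%s)" % body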
|
|
|
|
|
def _build_repr(self): |
|
|
|
cls = self.__class__ |
|
    # fetch the constructor, or the original constructor before any
    # deprecation wrapping
    init = getattr(cls.__init__, "deprecated_original", cls.__init__)
|
|
|
    # introspect the constructor arguments to find the parameters to represent
    init_signature = signature(init)
|
|
|
    if init is object.__init__:
        # no explicit constructor to introspect
        args = []
|
else: |
|
args = sorted( |
|
[ |
|
p.name |
|
for p in init_signature.parameters.values() |
|
if p.name != "self" and p.kind != p.VAR_KEYWORD |
|
] |
|
) |
|
class_name = self.__class__.__name__ |
|
params = dict() |
|
for key in args: |
|
|
|
|
|
|
|
|
|
        # Deprecation warnings must always be on so that deprecated parameter
        # values can be detected below.
        warnings.simplefilter("always", FutureWarning)
|
try: |
|
with warnings.catch_warnings(record=True) as w: |
|
value = getattr(self, key, None) |
|
if value is None and hasattr(self, "cvargs"): |
|
value = self.cvargs.get(key, None) |
|
            if len(w) and w[0].category is FutureWarning:
                # if the parameter is deprecated, don't show it
                continue
|
finally: |
|
warnings.filters.pop(0) |
|
params[key] = value |
|
|
|
return "%s(%s)" % (class_name, _pprint(params, offset=len(class_name))) |
|
|
|
|
|
def _yields_constant_splits(cv): |
|
|
|
|
|
|
|
|
|
|
|
    # Return True if calling cv.split() repeatedly is guaranteed to yield the
    # same splits: non-shuffling splitters are constant, and shuffling
    # splitters are constant only when seeded with a fixed integer.
    shuffle = getattr(cv, "shuffle", True)
|
random_state = getattr(cv, "random_state", 0) |
|
return isinstance(random_state, numbers.Integral) or not shuffle |
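

# Illustrative sketch (not part of the public API; the helper name is an
# assumption for this example): splitters whose output cannot change between
# calls (no shuffling, or shuffling with an integer seed) are reported as
# constant, while a shuffling splitter seeded with ``None`` is not.
def _demo_yields_constant_splits():
    assert _yields_constant_splits(KFold(n_splits=3))
    assert _yields_constant_splits(KFold(n_splits=3, shuffle=True, random_state=0))
    assert not _yields_constant_splits(
        KFold(n_splits=3, shuffle=True, random_state=None)
    )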
|
|