File size: 40,958 Bytes

7885a28

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
import pytest
from numpy.testing import assert_allclose

from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.calibration import (
    CalibratedClassifierCV,
    CalibrationDisplay,
    _CalibratedClassifier,
    _sigmoid_calibration,
    _SigmoidCalibration,
    calibration_curve,
)
from sklearn.datasets import load_iris, make_blobs, make_classification
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.exceptions import NotFittedError
from sklearn.feature_extraction import DictVectorizer
from sklearn.frozen import FrozenEstimator
from sklearn.impute import SimpleImputer
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import (
    KFold,
    LeaveOneOut,
    check_cv,
    cross_val_predict,
    cross_val_score,
    train_test_split,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils._mocking import CheckingClassifier
from sklearn.utils._testing import (
    _convert_container,
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
    ignore_warnings,
)
from sklearn.utils.extmath import softmax
from sklearn.utils.fixes import CSR_CONTAINERS

# Number of samples generated by the module-scoped `data` fixture; several
# tests split that dataset in two halves of `N_SAMPLES // 2` samples.
N_SAMPLES = 200


@pytest.fixture(scope="module")
def data():
    """Return the `(X, y)` classification dataset shared across this module."""
    return make_classification(n_samples=N_SAMPLES, n_features=6, random_state=42)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("method", ["sigmoid", "isotonic"])
@pytest.mark.parametrize("ensemble", [True, False])
def test_calibration(data, method, csr_container, ensemble):
    """Check isotonic and sigmoid calibration of a naive Bayes classifier.

    For dense and sparse inputs, the calibrated classifier must improve the
    Brier score of the uncalibrated one and behave consistently when the
    binary labels are re-encoded.
    """
    n_samples = N_SAMPLES // 2
    X, y = data
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    # MultinomialNB only allows positive X. Shift into a new array rather than
    # in place: `data` is a module-scoped fixture, so `X -= X.min()` would
    # silently modify the dataset seen by every other test of this module.
    X = X - X.min()

    # split train and test
    X_train, y_train, sw_train = X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes
    clf = MultinomialNB().fit(X_train, y_train, sample_weight=sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    # Requesting more folds than there are samples must be rejected
    cal_clf = CalibratedClassifierCV(clf, cv=y.size + 1, ensemble=ensemble)
    with pytest.raises(ValueError):
        cal_clf.fit(X, y)

    # Naive Bayes with calibration
    for this_X_train, this_X_test in [
        (X_train, X_test),
        (csr_container(X_train), csr_container(X_test)),
    ]:
        cal_clf = CalibratedClassifierCV(clf, method=method, cv=5, ensemble=ensemble)
        # Note that this fit overwrites the fit on the entire training
        # set
        cal_clf.fit(this_X_train, y_train, sample_weight=sw_train)
        prob_pos_cal_clf = cal_clf.predict_proba(this_X_test)[:, 1]

        # Check that brier score has improved after calibration
        assert brier_score_loss(y_test, prob_pos_clf) > brier_score_loss(
            y_test, prob_pos_cal_clf
        )

        # Check invariance against relabeling [0, 1] -> [1, 2]
        cal_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train)
        prob_pos_cal_clf_relabeled = cal_clf.predict_proba(this_X_test)[:, 1]
        assert_array_almost_equal(prob_pos_cal_clf, prob_pos_cal_clf_relabeled)

        # Check invariance against relabeling [0, 1] -> [-1, 1]
        cal_clf.fit(this_X_train, 2 * y_train - 1, sample_weight=sw_train)
        prob_pos_cal_clf_relabeled = cal_clf.predict_proba(this_X_test)[:, 1]
        assert_array_almost_equal(prob_pos_cal_clf, prob_pos_cal_clf_relabeled)

        # Check invariance against relabeling [0, 1] -> [1, 0]
        cal_clf.fit(this_X_train, (y_train + 1) % 2, sample_weight=sw_train)
        prob_pos_cal_clf_relabeled = cal_clf.predict_proba(this_X_test)[:, 1]
        if method == "sigmoid":
            assert_array_almost_equal(prob_pos_cal_clf, 1 - prob_pos_cal_clf_relabeled)
        else:
            # Isotonic calibration is not invariant against relabeling
            # but should improve in both cases
            assert brier_score_loss(y_test, prob_pos_clf) > brier_score_loss(
                (y_test + 1) % 2, prob_pos_cal_clf_relabeled
            )


def test_calibration_default_estimator(data):
    """Check that the estimator defaults to `LinearSVC` when none is given."""
    X, y = data
    calibrated = CalibratedClassifierCV(cv=2).fit(X, y)

    first_calibrated_clf = calibrated.calibrated_classifiers_[0]
    assert isinstance(first_calibrated_clf.estimator, LinearSVC)


@pytest.mark.parametrize("ensemble", [True, False])
def test_calibration_cv_splitter(data, ensemble):
    """Check the behaviour when `cv` is a CV splitter instance."""
    X, y = data

    n_splits = 5
    calib_clf = CalibratedClassifierCV(cv=KFold(n_splits=n_splits), ensemble=ensemble)
    # The splitter passed to the constructor is exposed as the `cv` attribute.
    assert isinstance(calib_clf.cv, KFold)
    assert calib_clf.cv.n_splits == n_splits

    calib_clf.fit(X, y)
    # One calibrated classifier per fold when ensembling, a single one otherwise.
    if ensemble:
        expected_n_clf = n_splits
    else:
        expected_n_clf = 1
    assert len(calib_clf.calibrated_classifiers_) == expected_n_clf


def test_calibration_cv_nfold(data):
    """Check the errors raised for splitters that are incompatible with the data."""
    X, y = data

    # Each entry pairs a splitter with the pattern expected in the error message.
    failing_splitters = [
        (KFold(n_splits=101), "Requesting 101-fold cross-validation"),
        (LeaveOneOut(), "LeaveOneOut cross-validation does"),
    ]
    for splitter, err_msg in failing_splitters:
        calib_clf = CalibratedClassifierCV(cv=splitter, ensemble=True)
        with pytest.raises(ValueError, match=err_msg):
            calib_clf.fit(X, y)


@pytest.mark.parametrize("method", ["sigmoid", "isotonic"])
@pytest.mark.parametrize("ensemble", [True, False])
def test_sample_weight(data, method, ensemble):
    """Check that passing `sample_weight` changes the predicted probabilities."""
    X, y = data
    n_train = N_SAMPLES // 2

    rng = np.random.RandomState(seed=42)
    weights = rng.uniform(size=len(y))
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, sw_train = y[:n_train], weights[:n_train]

    calibrated_clf = CalibratedClassifierCV(
        LinearSVC(random_state=42), method=method, ensemble=ensemble
    )

    def fit_predict_proba(**fit_params):
        # Refit the same meta-estimator and predict on the held-out half.
        return calibrated_clf.fit(X_train, y_train, **fit_params).predict_proba(X_test)

    probs_with_sw = fit_predict_proba(sample_weight=sw_train)
    # As the weights are used for the calibration, they should still yield
    # different predictions
    probs_without_sw = fit_predict_proba()

    assert np.linalg.norm(probs_with_sw - probs_without_sw) > 0.1


@pytest.mark.parametrize("method", ["sigmoid", "isotonic"])
@pytest.mark.parametrize("ensemble", [True, False])
def test_parallel_execution(data, method, ensemble):
    """Check that `n_jobs=2` and `n_jobs=1` give the same probabilities."""
    X, y = data
    X_train, X_test, y_train, _ = train_test_split(X, y, random_state=42)

    estimator = make_pipeline(StandardScaler(), LinearSVC(random_state=42))

    # Fit with two jobs first, then with a single one, on the same training data.
    probs_by_n_jobs = {}
    for n_jobs in (2, 1):
        cal_clf = CalibratedClassifierCV(
            estimator, method=method, n_jobs=n_jobs, ensemble=ensemble
        )
        cal_clf.fit(X_train, y_train)
        probs_by_n_jobs[n_jobs] = cal_clf.predict_proba(X_test)

    assert_allclose(probs_by_n_jobs[2], probs_by_n_jobs[1])


@pytest.mark.parametrize("method", ["sigmoid", "isotonic"])
@pytest.mark.parametrize("ensemble", [True, False])
# increase the number of RNG seeds to assess the statistical stability of this
# test:
@pytest.mark.parametrize("seed", range(2))
def test_calibration_multiclass(method, ensemble, seed):
    """Check multiclass calibration of `LinearSVC` and `RandomForestClassifier`.

    The calibrated probabilities must sum to one, the accuracy must stay close
    to the one of the uncalibrated classifier and the multiclass Brier score
    must not exceed 1.1 times the uncalibrated one.
    """

    def multiclass_brier(y_true, proba_pred, n_classes):
        # Sum of squared differences between the one-hot encoded targets and
        # the predicted probabilities, divided by the number of samples.
        Y_onehot = np.eye(n_classes)[y_true]
        return np.sum((Y_onehot - proba_pred) ** 2) / Y_onehot.shape[0]

    # Test calibration for multiclass with classifier that implements
    # only decision function.
    clf = LinearSVC(random_state=7)
    X, y = make_blobs(
        n_samples=500, n_features=100, random_state=seed, centers=10, cluster_std=15.0
    )

    # Use an unbalanced dataset by collapsing 8 clusters into one class
    # to make the naive calibration based on a softmax more unlikely
    # to work.
    y[y > 2] = 2
    n_classes = np.unique(y).shape[0]
    # Even-indexed samples are used for training, odd-indexed ones for testing.
    X_train, y_train = X[::2], y[::2]
    X_test, y_test = X[1::2], y[1::2]

    clf.fit(X_train, y_train)

    cal_clf = CalibratedClassifierCV(clf, method=method, cv=5, ensemble=ensemble)
    cal_clf.fit(X_train, y_train)
    probas = cal_clf.predict_proba(X_test)
    # Check probabilities sum to 1
    assert_allclose(np.sum(probas, axis=1), np.ones(len(X_test)))

    # Check that the dataset is not too trivial, otherwise it's hard
    # to get interesting calibration data during the internal
    # cross-validation loop.
    assert 0.65 < clf.score(X_test, y_test) < 0.95

    # Check that the accuracy of the calibrated model is never degraded
    # too much compared to the original classifier.
    assert cal_clf.score(X_test, y_test) > 0.95 * clf.score(X_test, y_test)

    # Check that the Brier loss of the calibrated classifier is at most 10%
    # larger than the loss obtained by naively turning the OvR decision
    # function into probabilities via a softmax
    uncalibrated_brier = multiclass_brier(
        y_test, softmax(clf.decision_function(X_test)), n_classes=n_classes
    )
    calibrated_brier = multiclass_brier(y_test, probas, n_classes=n_classes)

    assert calibrated_brier < 1.1 * uncalibrated_brier

    # Check that calibrating a multiclass RandomForestClassifier does not
    # degrade its Brier loss by more than 10%
    clf = RandomForestClassifier(n_estimators=30, random_state=42)
    clf.fit(X_train, y_train)
    clf_probs = clf.predict_proba(X_test)
    uncalibrated_brier = multiclass_brier(y_test, clf_probs, n_classes=n_classes)

    cal_clf = CalibratedClassifierCV(clf, method=method, cv=5, ensemble=ensemble)
    cal_clf.fit(X_train, y_train)
    cal_clf_probs = cal_clf.predict_proba(X_test)
    calibrated_brier = multiclass_brier(y_test, cal_clf_probs, n_classes=n_classes)
    assert calibrated_brier < 1.1 * uncalibrated_brier


def test_calibration_zero_probability():
    """Check the fallback to uniform probabilities when calibrators output zeros.

    Edge case where `_CalibratedClassifier` must avoid numerical errors in the
    multiclass normalization step when all the calibrators output zero at
    once for a given sample.
    """

    class ZeroCalibrator:
        # This function is called from _CalibratedClassifier.predict_proba.
        def predict(self, X):
            n_rows = X.shape[0]
            return np.zeros(n_rows)

    X, y = make_blobs(
        n_samples=50, n_features=10, random_state=7, centers=10, cluster_std=15.0
    )
    dummy = DummyClassifier().fit(X, y)
    cal_clf = _CalibratedClassifier(
        estimator=dummy, calibrators=[ZeroCalibrator()], classes=dummy.classes_
    )

    # Every entry is expected to equal 1. / n_classes_
    uniform_proba = 1.0 / dummy.n_classes_
    assert_allclose(cal_clf.predict_proba(X), uniform_proba)


@ignore_warnings(category=FutureWarning)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_calibration_prefit(csr_container):
    """Test calibration for prefitted classifiers

    The same fitted `MultinomialNB` is calibrated twice, once through
    `cv="prefit"` and once wrapped in a `FrozenEstimator`, and the predictions
    of both variants are compared.
    """
    # TODO(1.8): Remove cv="prefit" options here and the @ignore_warnings of the test
    n_samples = 50
    X, y = make_classification(n_samples=3 * n_samples, n_features=6, random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split into three consecutive chunks of `n_samples`: train, calibration, test
    X_train, y_train, sw_train = X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_calib, y_calib, sw_calib = (
        X[n_samples : 2 * n_samples],
        y[n_samples : 2 * n_samples],
        sample_weight[n_samples : 2 * n_samples],
    )
    X_test, y_test = X[2 * n_samples :], y[2 * n_samples :]

    # Naive-Bayes
    clf = MultinomialNB()
    # Check error if clf not prefit
    unfit_clf = CalibratedClassifierCV(clf, cv="prefit")
    with pytest.raises(NotFittedError):
        unfit_clf.fit(X_calib, y_calib)

    clf.fit(X_train, y_train, sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    # Naive Bayes with calibration, on dense then sparse inputs
    for this_X_calib, this_X_test in [
        (X_calib, X_test),
        (csr_container(X_calib), csr_container(X_test)),
    ]:
        for method in ["isotonic", "sigmoid"]:
            cal_clf_prefit = CalibratedClassifierCV(clf, method=method, cv="prefit")
            cal_clf_frozen = CalibratedClassifierCV(FrozenEstimator(clf), method=method)

            # Calibrate with and without sample weights
            for sw in [sw_calib, None]:
                cal_clf_prefit.fit(this_X_calib, y_calib, sample_weight=sw)
                cal_clf_frozen.fit(this_X_calib, y_calib, sample_weight=sw)

                y_prob_prefit = cal_clf_prefit.predict_proba(this_X_test)
                y_prob_frozen = cal_clf_frozen.predict_proba(this_X_test)
                y_pred_prefit = cal_clf_prefit.predict(this_X_test)
                y_pred_frozen = cal_clf_frozen.predict(this_X_test)
                # NOTE(review): `prob_pos_cal_clf_prefit` is computed but never
                # used in an assertion below; only the frozen variant is scored.
                prob_pos_cal_clf_prefit = y_prob_prefit[:, 1]
                prob_pos_cal_clf_frozen = y_prob_frozen[:, 1]
                # Both variants must predict the same labels
                assert_array_equal(y_pred_prefit, y_pred_frozen)
                # `predict` must agree with the argmax of `predict_proba`
                assert_array_equal(
                    y_pred_prefit, np.array([0, 1])[np.argmax(y_prob_prefit, axis=1)]
                )
                # Calibration must improve the Brier score of the raw classifier
                assert brier_score_loss(y_test, prob_pos_clf) > brier_score_loss(
                    y_test, prob_pos_cal_clf_frozen
                )


@pytest.mark.parametrize("method", ["sigmoid", "isotonic"])
def test_calibration_ensemble_false(data, method):
    """Check `ensemble=False` against a manual `cross_val_predict` calibration.

    The calibrator is expected to be trained on the out-of-fold decision
    values and then applied to the classifier fitted on all the data.
    """
    X, y = data
    svc = LinearSVC(random_state=7)

    cal_clf = CalibratedClassifierCV(svc, method=method, cv=3, ensemble=False)
    cal_probas = cal_clf.fit(X, y).predict_proba(X)

    # Reproduce the probabilities by hand
    oof_decisions = cross_val_predict(svc, X, y, cv=3, method="decision_function")
    calibrator = (
        IsotonicRegression(out_of_bounds="clip")
        if method == "isotonic"
        else _SigmoidCalibration()
    )
    calibrator.fit(oof_decisions, y)
    # Use the classifier fit on all data
    svc.fit(X, y)
    manual_probas = calibrator.predict(svc.decision_function(X))
    assert_allclose(cal_probas[:, 1], manual_probas)


def test_sigmoid_calibration():
    """Test calibration values with Platt sigmoid model"""
    decision_values = np.array([5, -4, 1.0])
    targets = np.array([1, -1, -1])
    # computed from my python port of the C++ code in LibSVM
    AB_lin_libsvm = np.array([-0.20261354391187855, 0.65236314980010512])
    fitted_coefs = _sigmoid_calibration(decision_values, targets)
    assert_array_almost_equal(AB_lin_libsvm, fitted_coefs, 3)

    # Probabilities from the reference coefficients must match the calibrator.
    slope, intercept = AB_lin_libsvm
    lin_prob = 1.0 / (1.0 + np.exp(slope * decision_values + intercept))
    calibrator = _SigmoidCalibration().fit(decision_values, targets)
    sk_prob = calibrator.predict(decision_values)
    assert_array_almost_equal(lin_prob, sk_prob, 6)

    # check that _SigmoidCalibration().fit only accepts 1d array or 2d column
    # arrays
    stacked = np.vstack((decision_values, decision_values))
    with pytest.raises(ValueError):
        _SigmoidCalibration().fit(stacked, targets)


def test_calibration_curve():
    """Check calibration_curve function"""
    labels = np.array([0, 0, 0, 1, 1, 1])
    probas = np.array([0.0, 0.1, 0.2, 0.8, 0.9, 1.0])
    prob_true, prob_pred = calibration_curve(labels, probas, n_bins=2)
    assert len(prob_true) == len(prob_pred) == 2
    assert_almost_equal(prob_true, [0, 1])
    assert_almost_equal(prob_pred, [0.1, 0.9])

    # Probabilities outside [0, 1] should not be accepted at all.
    with pytest.raises(ValueError):
        calibration_curve([1], [-0.1])

    # test that quantiles work as expected
    labels_quantile = np.array([0, 0, 0, 0, 1, 1])
    probas_quantile = np.array([0.0, 0.1, 0.2, 0.5, 0.9, 1.0])
    prob_true_quantile, prob_pred_quantile = calibration_curve(
        labels_quantile, probas_quantile, n_bins=2, strategy="quantile"
    )

    assert len(prob_true_quantile) == len(prob_pred_quantile) == 2
    assert_almost_equal(prob_true_quantile, [0, 2 / 3])
    assert_almost_equal(prob_pred_quantile, [0.1, 0.8])

    # Check that error is raised when invalid strategy is selected
    with pytest.raises(ValueError):
        calibration_curve(labels_quantile, probas_quantile, strategy="percentile")


@pytest.mark.parametrize("ensemble", [True, False])
def test_calibration_nan_imputer(ensemble):
    """Test that calibration can accept nan"""
    X, y = make_classification(
        n_samples=10, n_features=2, n_informative=2, n_redundant=0, random_state=42
    )
    X[0, 0] = np.nan

    # The imputer inside the pipeline is in charge of the missing value.
    pipe = Pipeline(
        [
            ("imputer", SimpleImputer()),
            ("rf", RandomForestClassifier(n_estimators=1)),
        ]
    )
    calibrated = CalibratedClassifierCV(
        pipe, cv=2, method="isotonic", ensemble=ensemble
    )
    # Smoke test: fitting and predicting should not raise.
    calibrated.fit(X, y)
    calibrated.predict(X)


@pytest.mark.parametrize("ensemble", [True, False])
def test_calibration_prob_sum(ensemble):
    """Check that the calibrated probabilities of each sample sum to 1.

    Non-regression test for issue #7796 - when test has fewer classes than
    train.
    """
    # Seed the data generation so that the test runs on the same dataset at
    # every invocation and failures can be reproduced.
    X, _ = make_classification(n_samples=10, n_features=5, n_classes=2, random_state=7)
    y = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    clf = LinearSVC(C=1.0, random_state=7)
    # In the first and last fold, test will have 1 class while train will have 2
    clf_prob = CalibratedClassifierCV(
        clf, method="sigmoid", cv=KFold(n_splits=3), ensemble=ensemble
    )
    clf_prob.fit(X, y)
    assert_allclose(clf_prob.predict_proba(X).sum(axis=1), 1.0)


@pytest.mark.parametrize("ensemble", [True, False])
def test_calibration_less_classes(ensemble):
    """Check calibration when a train split does not contain all the classes.

    With the 3-fold split used below:
    - in the 1st split, train is missing class 0;
    - in the 3rd split, train is missing class 3.
    """
    # Use a dedicated seeded generator instead of the global NumPy RNG so that
    # the test data is reproducible and independent of the test execution order.
    X = np.random.RandomState(7).randn(12, 5)
    y = [0, 0, 0, 1] + [1, 1, 2, 2] + [2, 3, 3, 3]
    clf = DecisionTreeClassifier(random_state=7)
    cal_clf = CalibratedClassifierCV(
        clf, method="sigmoid", cv=KFold(3), ensemble=ensemble
    )
    cal_clf.fit(X, y)

    if ensemble:
        classes = np.arange(4)
        # Pairs of (index of the calibrated classifier, class unseen in its train)
        for calib_i, class_i in zip([0, 2], [0, 3]):
            proba = cal_clf.calibrated_classifiers_[calib_i].predict_proba(X)
            # Check that the unobserved class has proba=0
            assert_array_equal(proba[:, class_i], np.zeros(len(y)))
            # Check for all other classes proba>0
            assert np.all(proba[:, classes != class_i] > 0)

    # When `ensemble=False`, `cross_val_predict` is used to compute predictions
    # to fit only one `calibrated_classifiers_`
    else:
        proba = cal_clf.calibrated_classifiers_[0].predict_proba(X)
        assert_array_almost_equal(proba.sum(axis=1), np.ones(proba.shape[0]))


@pytest.mark.parametrize(
    "X",
    [
        np.random.RandomState(42).randn(*shape)
        for shape in [(15, 5, 2), (15, 5, 2, 6)]
    ],
)
def test_calibration_accepts_ndarray(X):
    """Test that calibration accepts n-dimensional arrays as input"""
    y = [1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0]

    class MockTensorClassifier(ClassifierMixin, BaseEstimator):
        """A toy estimator that accepts tensor inputs"""

        def fit(self, X, y):
            self.classes_ = np.unique(y)
            return self

        def decision_function(self, X):
            # toy decision function that just needs to have the right shape:
            # flatten every sample and sum its entries.
            n_rows = X.shape[0]
            flattened = X.reshape(n_rows, -1)
            return flattened.sum(axis=1)

    # we should be able to fit this classifier with no error
    CalibratedClassifierCV(MockTensorClassifier()).fit(X, y)


@pytest.fixture
def dict_data():
    """Return a small list of feature dicts together with binary labels."""
    states = ["NY", "TX", "VT", "CT", "BR"]
    ages = ["adult", "adult", "child", "adult", "child"]
    records = [{"state": state, "age": age} for state, age in zip(states, ages)]
    labels = [1, 0, 1, 1, 0]
    return records, labels


@pytest.fixture
def dict_data_pipeline(dict_data):
    """Return a vectorizer + random forest pipeline fitted on `dict_data`."""
    X, y = dict_data
    steps = [("vectorizer", DictVectorizer()), ("clf", RandomForestClassifier())]
    return Pipeline(steps).fit(X, y)


def test_calibration_dict_pipeline(dict_data, dict_data_pipeline):
    """Test that calibration works in prefit pipeline with transformer

    `X` is not array-like, sparse matrix or dataframe at the start.
    See https://github.com/scikit-learn/scikit-learn/issues/8710

    Also test it can predict without running into validation errors.
    See https://github.com/scikit-learn/scikit-learn/issues/19637
    """
    X, y = dict_data
    fitted_pipeline = dict_data_pipeline
    calib_clf = CalibratedClassifierCV(FrozenEstimator(fitted_pipeline), cv=2)
    calib_clf.fit(X, y)
    # Check attributes are obtained from fitted estimator
    assert_array_equal(calib_clf.classes_, fitted_pipeline.classes_)

    # Neither the pipeline nor the calibration meta-estimator
    # expose the n_features_in_ check on this kind of data.
    for estimator in (fitted_pipeline, calib_clf):
        assert not hasattr(estimator, "n_features_in_")

    # Ensure that no error is thrown with predict and predict_proba
    calib_clf.predict(X)
    calib_clf.predict_proba(X)


@pytest.mark.parametrize(
    "clf, cv",
    [
        pytest.param(LinearSVC(C=1), 2),
        pytest.param(LinearSVC(C=1), "prefit"),
    ],
)
def test_calibration_attributes(clf, cv):
    """Check that `n_features_in_` and `classes_` attributes are created properly."""
    X, y = make_classification(n_samples=10, n_features=5, n_classes=2, random_state=7)
    is_prefit = cv == "prefit"
    if is_prefit:
        clf = clf.fit(X, y)
    calib_clf = CalibratedClassifierCV(clf, cv=cv)
    calib_clf.fit(X, y)

    if is_prefit:
        # Attributes are compared with those of the prefitted estimator.
        expected_classes = clf.classes_
        expected_n_features = clf.n_features_in_
    else:
        # Attributes are compared with values derived from the training data.
        expected_classes = LabelEncoder().fit(y).classes_
        expected_n_features = X.shape[1]

    assert_array_equal(calib_clf.classes_, expected_classes)
    assert calib_clf.n_features_in_ == expected_n_features


def test_calibration_inconsistent_prefit_n_features_in():
    """Check the error raised when `X` has fewer features than the frozen estimator."""
    X, y = make_classification(n_samples=10, n_features=5, n_classes=2, random_state=7)
    frozen_svc = FrozenEstimator(LinearSVC(C=1).fit(X, y))
    calib_clf = CalibratedClassifierCV(frozen_svc)

    # Keep only the first 3 of the 5 features used to fit the estimator.
    X_truncated = X[:, :3]
    msg = "X has 3 features, but LinearSVC is expecting 5 features as input."
    with pytest.raises(ValueError, match=msg):
        calib_clf.fit(X_truncated, y)


def test_calibration_votingclassifier():
    """Check that `CalibratedClassifierCV` can be fitted on a `VotingClassifier`.

    The method `predict_proba` from `VotingClassifier` is dynamically
    defined via a property that only works when voting="soft".
    """
    X, y = make_classification(n_samples=10, n_features=5, n_classes=2, random_state=7)
    named_estimators = [(f"lr{i}", LogisticRegression()) for i in range(3)]
    vote = VotingClassifier(estimators=named_estimators, voting="soft")
    vote.fit(X, y)

    # smoke test: should not raise an error
    CalibratedClassifierCV(estimator=FrozenEstimator(vote)).fit(X, y)


@pytest.fixture(scope="module")
def iris_data():
    """Return the iris dataset as an `(X, y)` pair, created once per module."""
    X, y = load_iris(return_X_y=True)
    return X, y


@pytest.fixture(scope="module")
def iris_data_binary(iris_data):
    """Return the samples of `iris_data` whose target is lower than 2."""
    X, y = iris_data
    keep = y < 2
    return X[keep], y[keep]


@pytest.mark.parametrize("n_bins", [5, 10])
@pytest.mark.parametrize("strategy", ["uniform", "quantile"])
def test_calibration_display_compute(pyplot, iris_data_binary, n_bins, strategy):
    """Check `CalibrationDisplay.from_estimator` against `calibration_curve`.

    Both must compute the same results. The attributes of the
    `CalibrationDisplay` object are checked as well.
    """
    X, y = iris_data_binary
    lr = LogisticRegression().fit(X, y)

    curve_params = {"n_bins": n_bins, "strategy": strategy}
    viz = CalibrationDisplay.from_estimator(lr, X, y, alpha=0.8, **curve_params)

    y_prob = lr.predict_proba(X)[:, 1]
    prob_true, prob_pred = calibration_curve(y, y_prob, **curve_params)

    expected_arrays = {"prob_true": prob_true, "prob_pred": prob_pred, "y_prob": y_prob}
    for attr_name, expected in expected_arrays.items():
        assert_allclose(getattr(viz, attr_name), expected)

    assert viz.estimator_name == "LogisticRegression"

    # cannot fail thanks to pyplot fixture
    import matplotlib as mpl  # noqa

    assert isinstance(viz.line_, mpl.lines.Line2D)
    assert viz.line_.get_alpha() == 0.8
    assert isinstance(viz.ax_, mpl.axes.Axes)
    assert isinstance(viz.figure_, mpl.figure.Figure)

    assert viz.ax_.get_xlabel() == "Mean predicted probability (Positive class: 1)"
    assert viz.ax_.get_ylabel() == "Fraction of positives (Positive class: 1)"

    expected_legend_labels = ["LogisticRegression", "Perfectly calibrated"]
    legend_texts = [text.get_text() for text in viz.ax_.get_legend().get_texts()]
    assert len(legend_texts) == len(expected_legend_labels)
    assert all(text in expected_legend_labels for text in legend_texts)


def test_plot_calibration_curve_pipeline(pyplot, iris_data_binary):
    """Ensure pipelines are supported by `CalibrationDisplay.from_estimator`."""
    X, y = iris_data_binary
    pipe = make_pipeline(StandardScaler(), LogisticRegression())
    pipe.fit(X, y)
    viz = CalibrationDisplay.from_estimator(pipe, X, y)

    expected_legend_labels = [viz.estimator_name, "Perfectly calibrated"]
    legend_texts = [text.get_text() for text in viz.ax_.get_legend().get_texts()]
    assert len(legend_texts) == len(expected_legend_labels)
    assert all(text in expected_legend_labels for text in legend_texts)


@pytest.mark.parametrize(
    "name, expected_label", [(None, "_line1"), ("my_est", "my_est")]
)
def test_calibration_display_default_labels(pyplot, name, expected_label):
    """Check the legend entries when `estimator_name` is or is not provided."""
    # NOTE(review): `expected_label` is part of the parametrization but is not
    # used by any assertion of this test.
    prob_true = np.array([0, 1, 1, 0])
    prob_pred = np.array([0.2, 0.8, 0.8, 0.4])
    y_prob = np.array([])

    viz = CalibrationDisplay(prob_true, prob_pred, y_prob, estimator_name=name)
    viz.plot()

    # The reference line is always expected; the estimator entry only when named.
    expected_legend_labels = ["Perfectly calibrated"]
    if name is not None:
        expected_legend_labels.insert(0, name)

    legend_texts = [text.get_text() for text in viz.ax_.get_legend().get_texts()]
    assert len(legend_texts) == len(expected_legend_labels)
    assert all(text in expected_legend_labels for text in legend_texts)


def test_calibration_display_label_class_plot(pyplot):
    """Check that the `name` passed to `plot` is the one shown in the legend.

    The display is first instantiated with another `estimator_name`.
    """
    prob_true = np.array([0, 1, 1, 0])
    prob_pred = np.array([0.2, 0.8, 0.8, 0.4])
    y_prob = np.array([])

    constructor_name = "name one"
    viz = CalibrationDisplay(
        prob_true, prob_pred, y_prob, estimator_name=constructor_name
    )
    assert viz.estimator_name == constructor_name

    plot_name = "name two"
    viz.plot(name=plot_name)

    expected_legend_labels = [plot_name, "Perfectly calibrated"]
    legend_texts = [text.get_text() for text in viz.ax_.get_legend().get_texts()]
    assert len(legend_texts) == len(expected_legend_labels)
    assert all(text in expected_legend_labels for text in legend_texts)


@pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
def test_calibration_display_name_multiple_calls(
    constructor_name, pyplot, iris_data_binary
):
    """Check the legend name across several `CalibrationDisplay.plot` calls.

    The `name` given to `CalibrationDisplay.from_predictions` or
    `CalibrationDisplay.from_estimator` must be used by a later `plot()` call,
    and a `name` passed explicitly to `plot` must be the one shown in the
    legend of that new plot.
    """
    X, y = iris_data_binary
    clf_name = "my hand-crafted name"
    clf = LogisticRegression().fit(X, y)
    y_prob = clf.predict_proba(X)[:, 1]

    constructor = getattr(CalibrationDisplay, constructor_name)
    params = (clf, X, y) if constructor_name == "from_estimator" else (y, y_prob)

    def check_legend(expected_name):
        # Read the legend from the *current* axes of the display: every call
        # to `plot` below draws on new axes, so the texts must be fetched again.
        expected_legend_labels = [expected_name, "Perfectly calibrated"]
        legend_labels = viz.ax_.get_legend().get_texts()
        assert len(legend_labels) == len(expected_legend_labels)
        for labels in legend_labels:
            assert labels.get_text() in expected_legend_labels

    viz = constructor(*params, name=clf_name)
    assert viz.estimator_name == clf_name
    pyplot.close("all")
    viz.plot()
    check_legend(clf_name)

    pyplot.close("all")
    clf_name = "another_name"
    viz.plot(name=clf_name)
    # The original assertions reused the legend texts and expected labels of
    # the first plot, hence never verified the effect of `name=` here.
    check_legend(clf_name)


def test_calibration_display_ref_line(pyplot, iris_data_binary):
    """Check that `ref_line` only appears once when two displays share an axes."""
    X, y = iris_data_binary
    lr = LogisticRegression().fit(X, y)
    dt = DecisionTreeClassifier().fit(X, y)

    first_viz = CalibrationDisplay.from_estimator(lr, X, y)
    # Draw the second curve on the axes already holding the first one.
    second_viz = CalibrationDisplay.from_estimator(dt, X, y, ax=first_viz.ax_)

    _, labels = second_viz.ax_.get_legend_handles_labels()
    assert labels.count("Perfectly calibrated") == 1


@pytest.mark.parametrize("dtype_y_str", [str, object])
def test_calibration_curve_pos_label_error_str(dtype_y_str):
    """Check error message when a `pos_label` is not specified with `str` targets."""
    rng = np.random.RandomState(42)
    str_targets = np.array(["spam"] * 3 + ["eggs"] * 2, dtype=dtype_y_str)
    scores = rng.randint(0, 2, size=str_targets.size)

    err_msg = (
        "y_true takes value in {'eggs', 'spam'} and pos_label is not "
        "specified: either make y_true take value in {0, 1} or {-1, 1} or "
        "pass pos_label explicitly"
    )
    with pytest.raises(ValueError, match=err_msg):
        calibration_curve(str_targets, scores)


@pytest.mark.parametrize("dtype_y_str", [str, object])
def test_calibration_curve_pos_label(dtype_y_str):
    """Check the behaviour when passing explicitly `pos_label`."""
    y_true = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1])
    classes = np.array(["spam", "egg"], dtype=dtype_y_str)
    y_true_str = classes[y_true]
    y_pred = np.array([0.1, 0.2, 0.3, 0.4, 0.65, 0.7, 0.8, 0.9, 1.0])

    # Each case: (targets, scores, extra keyword arguments, expected prob_true)
    cases = [
        # default case
        (y_true, y_pred, {}, [0, 0.5, 1, 1]),
        # if `y_true` contains `str`, then `pos_label` is required
        (y_true_str, y_pred, {"pos_label": "egg"}, [0, 0.5, 1, 1]),
        # the other class is the positive one: the scores are flipped as well
        (y_true, 1 - y_pred, {"pos_label": 0}, [0, 0, 0.5, 1]),
        (y_true_str, 1 - y_pred, {"pos_label": "spam"}, [0, 0, 0.5, 1]),
    ]
    for targets, scores, extra_kwargs, expected in cases:
        prob_true, _ = calibration_curve(targets, scores, n_bins=4, **extra_kwargs)
        assert_allclose(prob_true, expected)


@pytest.mark.parametrize(
    "kwargs",
    [
        {"c": "red", "lw": 2, "ls": "-."},
        {"color": "red", "linewidth": 2, "linestyle": "-."},
    ],
)
def test_calibration_display_kwargs(pyplot, iris_data_binary, kwargs):
    """Check that matplotlib aliases are handled."""
    X, y = iris_data_binary

    lr = LogisticRegression().fit(X, y)
    viz = CalibrationDisplay.from_estimator(lr, X, y, **kwargs)

    # Short and long spellings must lead to the same line properties.
    expected_properties = [
        ("get_color", "red"),
        ("get_linewidth", 2),
        ("get_linestyle", "-."),
    ]
    for getter_name, expected in expected_properties:
        assert getattr(viz.line_, getter_name)() == expected


@pytest.mark.parametrize("pos_label, expected_pos_label", [(None, 1), (0, 0), (1, 1)])
def test_calibration_display_pos_label(
    pyplot, iris_data_binary, pos_label, expected_pos_label
):
    """Check the behaviour of `pos_label` in the `CalibrationDisplay`."""
    X, y = iris_data_binary

    lr = LogisticRegression().fit(X, y)
    viz = CalibrationDisplay.from_estimator(lr, X, y, pos_label=pos_label)

    # Reference values computed from the column of the expected positive class.
    y_prob = lr.predict_proba(X)[:, expected_pos_label]
    prob_true, prob_pred = calibration_curve(y, y_prob, pos_label=pos_label)

    expected_arrays = {"prob_true": prob_true, "prob_pred": prob_pred, "y_prob": y_prob}
    for attr_name, expected in expected_arrays.items():
        assert_allclose(getattr(viz, attr_name), expected)

    pos_label_info = f"(Positive class: {expected_pos_label})"
    assert viz.ax_.get_xlabel() == f"Mean predicted probability {pos_label_info}"
    assert viz.ax_.get_ylabel() == f"Fraction of positives {pos_label_info}"

    expected_legend_labels = [lr.__class__.__name__, "Perfectly calibrated"]
    legend_texts = [text.get_text() for text in viz.ax_.get_legend().get_texts()]
    assert len(legend_texts) == len(expected_legend_labels)
    assert all(text in expected_legend_labels for text in legend_texts)


@pytest.mark.parametrize("method", ["sigmoid", "isotonic"])
@pytest.mark.parametrize("ensemble", [True, False])
def test_calibrated_classifier_cv_double_sample_weights_equivalence(method, ensemble):
    """Check that duplicating every sample matches a `sample_weight` of 2.

    Fitting on a dataset in which each row appears twice must give the same
    underlying coefficients and the same calibrated probabilities as fitting
    on the original dataset with all sample weights set to 2.
    """
    X, y = load_iris(return_X_y=True)
    # Scale the data to avoid any convergence issue
    X = StandardScaler().fit_transform(X)
    # Only use 2 classes
    X, y = X[:100], y[:100]
    sample_weight = np.full_like(y, 2)

    # Interlace the data (each sample immediately followed by its copy) such
    # that a 2-fold cross-validation will be equivalent to using the original
    # dataset with a sample weights of 2
    X_twice = np.repeat(X, 2, axis=0)
    y_twice = np.repeat(y, 2)

    calibrated_clf_without_weights = CalibratedClassifierCV(
        LogisticRegression(), method=method, ensemble=ensemble, cv=2
    )
    calibrated_clf_with_weights = clone(calibrated_clf_without_weights)

    calibrated_clf_with_weights.fit(X, y, sample_weight=sample_weight)
    calibrated_clf_without_weights.fit(X_twice, y_twice)

    # Check that the underlying fitted estimators have the same coefficients
    fitted_pairs = zip(
        calibrated_clf_with_weights.calibrated_classifiers_,
        calibrated_clf_without_weights.calibrated_classifiers_,
    )
    for weighted, duplicated in fitted_pairs:
        assert_allclose(weighted.estimator.coef_, duplicated.estimator.coef_)

    # Check that the predictions are the same
    assert_allclose(
        calibrated_clf_with_weights.predict_proba(X),
        calibrated_clf_without_weights.predict_proba(X),
    )


@pytest.mark.parametrize("fit_params_type", ["list", "array"])
def test_calibration_with_fit_params(fit_params_type, data):
    """Check that fit parameters reach the underlying base estimator.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12384
    """
    X, y = data
    param_names = ["a", "b"]
    # Every extra fit parameter is a copy of `y` in the requested container.
    extra_fit_params = {
        name: _convert_container(y, fit_params_type) for name in param_names
    }

    checker = CheckingClassifier(expected_fit_params=param_names)
    CalibratedClassifierCV(checker).fit(X, y, **extra_fit_params)


@pytest.mark.parametrize(
    "sample_weight",
    [
        [1.0] * N_SAMPLES,
        np.ones(N_SAMPLES),
    ],
)
def test_calibration_with_sample_weight_estimator(sample_weight, data):
    """Check that `sample_weight` is forwarded to the underlying base
    estimator, whether it is given as a list or as an array.
    """
    X, y = data
    # The checking classifier is configured to expect a `sample_weight`.
    checker = CheckingClassifier(expected_sample_weight=True)
    CalibratedClassifierCV(checker).fit(X, y, sample_weight=sample_weight)


def test_calibration_without_sample_weight_estimator(data):
    """Check that fitting with `sample_weight` still works when the
    estimator does not support it.

    A warning is expected, because the weights are not handed over to the
    estimator.
    """

    class ClfWithoutSampleWeight(CheckingClassifier):
        # `fit` deliberately has no explicit `sample_weight` parameter.
        def fit(self, X, y, **fit_params):
            assert "sample_weight" not in fit_params
            return super().fit(X, y, **fit_params)

    X, y = data
    unit_weights = np.ones_like(y)
    calibrated = CalibratedClassifierCV(ClfWithoutSampleWeight())

    with pytest.warns(UserWarning):
        calibrated.fit(X, y, sample_weight=unit_weights)


def test_calibration_with_non_sample_aligned_fit_param(data):
    """Check that fit parameters whose length differs from the number of
    samples are accepted by CalibratedClassifierCV."""

    class TestClassifier(LogisticRegression):
        def fit(self, X, y, sample_weight=None, fit_param=None):
            # The extra parameter must have been forwarded to the estimator.
            assert fit_param is not None
            return super().fit(X, y, sample_weight=sample_weight)

    X, y = data
    # One element more than there are samples: cannot be sample-aligned.
    misaligned_param = np.ones(len(y) + 1)

    calibrated = CalibratedClassifierCV(estimator=TestClassifier())
    calibrated.fit(X, y, fit_param=misaligned_param)


def test_calibrated_classifier_cv_works_with_large_confidence_scores(
    global_random_seed,
):
    """Check that the `sigmoid` method copes with large confidence scores.

    :class:`CalibratedClassifierCV` is fitted around an
    :class:`SGDClassifier` whose decision function returns large values, and
    the cross-validated AUC of the `sigmoid` method is compared with the one
    of the `isotonic` method.

    Non-regression test for issue #26766.
    """
    prob = 0.67
    n = 1000
    n_positive = int(n * prob)
    random_noise = np.random.default_rng(global_random_seed).normal(size=n)

    y = np.array([1] * n_positive + [0] * (n - n_positive))
    # NOTE(review): a column of shape (n, 1) is added to noise of shape (n,),
    # so broadcasting makes `X` of shape (n, n) -- confirm this is intended.
    X = 1e5 * y.reshape((-1, 1)) + random_noise

    def make_sgd():
        # A fresh, identically configured classifier for every use.
        return SGDClassifier(loss="squared_hinge", random_state=global_random_seed)

    def cross_validated_auc(method):
        # AUC scores of the classifier calibrated with `method`.
        calibrated = CalibratedClassifierCV(make_sgd(), method=method)
        return cross_val_score(calibrated, X, y, scoring="roc_auc")

    # Check that the decision function of SGDClassifier produces predicted
    # values that are quite large, for the data under consideration.
    splitter = check_cv(cv=None, y=y, classifier=True)
    for train_idx, test_idx in splitter.split(X, y):
        fold_clf = make_sgd().fit(X[train_idx], y[train_idx])
        decision_values = fold_clf.decision_function(X[test_idx])
        assert np.any(decision_values > 1e4)

    # Compare the sigmoid method with the isotonic method. The isotonic
    # method is used for comparison because it is numerically stable.
    score_sigmoid = cross_validated_auc("sigmoid")
    score_isotonic = cross_validated_auc("isotonic")

    # The AUC score should be the same because it is invariant under
    # strictly monotonic conditions
    assert_allclose(score_sigmoid, score_isotonic)


def test_sigmoid_calibration_max_abs_prediction_threshold(global_random_seed):
    """Check that `max_abs_prediction_threshold` does not alter the fit on
    small predictions.

    `_sigmoid_calibration` is run three times on the same predictions drawn
    from [-2, 2]: with a threshold of 0.1, with a threshold of 10 and with
    the default threshold. The returned `(a, b)` pairs must agree up to a
    small absolute tolerance.
    """
    random_state = np.random.RandomState(seed=global_random_seed)
    n = 100
    y = random_state.randint(0, 2, size=n)

    # Check that for small enough predictions ranging from -2 to 2, the
    # threshold value has no impact on the outcome.
    # Draw exactly `n` predictions so that they stay aligned with `y` even if
    # `n` is changed.
    predictions_small = random_state.uniform(low=-2, high=2, size=n)

    # Using a threshold lower than the maximum absolute value of the
    # predictions enables internal re-scaling by max(abs(predictions_small)).
    threshold_1 = 0.1
    a1, b1 = _sigmoid_calibration(
        predictions=predictions_small,
        y=y,
        max_abs_prediction_threshold=threshold_1,
    )

    # Using a larger threshold disables rescaling.
    threshold_2 = 10
    a2, b2 = _sigmoid_calibration(
        predictions=predictions_small,
        y=y,
        max_abs_prediction_threshold=threshold_2,
    )

    # Using default threshold of 30 also disables the scaling.
    a3, b3 = _sigmoid_calibration(
        predictions=predictions_small,
        y=y,
    )

    # Depends on the tolerance of the underlying quasi-Newton solver which is
    # not too strict by default.
    atol = 1e-6
    assert_allclose(a1, a2, atol=atol)
    assert_allclose(a2, a3, atol=atol)
    assert_allclose(b1, b2, atol=atol)
    assert_allclose(b2, b3, atol=atol)


def test_float32_predict_proba(data):
    """Check that CalibratedClassifierCV can be fitted on an estimator whose
    `predict_proba` returns float32 values.

    Non-regression test for gh-28245.
    """

    class Float32DummyClassifier(DummyClassifier):
        def predict_proba(self, X):
            # Same probabilities as the parent class, cast to float32.
            probabilities = super().predict_proba(X)
            return probabilities.astype(np.float32)

    X, y = data
    calibrator = CalibratedClassifierCV(Float32DummyClassifier())
    # Does not raise an error
    calibrator.fit(X, y)


def test_error_less_class_samples_than_folds():
    """Check that CalibratedClassifierCV works with string targets.

    Non-regression test for issue #28841.
    """
    # Use a seeded generator instead of the global NumPy random state so that
    # the features are the same on every run.
    rng = np.random.RandomState(42)
    X = rng.normal(size=(20, 3))
    # Two string-valued classes with 10 samples each.
    y = ["a"] * 10 + ["b"] * 10

    # Does not raise an error
    CalibratedClassifierCV(cv=3).fit(X, y)