Sam Chaudry
Upload folder using huggingface_hub
7885a28 verified
raw
history blame
22.1 kB
import warnings
import numpy as np
import pytest
import scipy.sparse as sp
from sklearn.base import clone
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.exceptions import NotFittedError
from sklearn.utils._testing import (
assert_almost_equal,
assert_array_almost_equal,
assert_array_equal,
)
from sklearn.utils.fixes import CSC_CONTAINERS
from sklearn.utils.stats import _weighted_percentile
def _check_predict_proba(clf, X, y):
    """Validate the probability outputs of a fitted classifier.

    For each output column of ``y``, assert that ``clf.predict_proba(X)``
    has one row per sample of ``X`` and one column per distinct label of
    that column, that every row sums to one, and that
    ``clf.predict_log_proba(X)`` matches the log of the probabilities.
    """
    proba = clf.predict_proba(X)

    # Zero probabilities trigger a divide-by-zero warning inside the log.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", "divide by zero encountered in log")
        log_proba = clf.predict_log_proba(X)

    targets = np.atleast_1d(y)
    if targets.ndim == 1:
        targets = np.reshape(targets, (-1, 1))

    n_samples = len(X)
    n_outputs = targets.shape[1]

    # With a single output, wrap the results so that they can be indexed
    # per output in the loop below.
    if n_outputs == 1:
        proba, log_proba = [proba], [log_proba]

    for k in range(n_outputs):
        proba_k = proba[k]
        assert proba_k.shape[0] == n_samples
        assert proba_k.shape[1] == len(np.unique(targets[:, k]))
        assert_array_almost_equal(proba_k.sum(axis=1), np.ones(n_samples))
        # Same divide-by-zero situation as above when taking the log here.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", "divide by zero encountered in log")
            assert_array_almost_equal(np.log(proba_k), log_proba[k])
def _check_behavior_2d(clf):
    """Check that predictions have the same shape as 1d and 2d targets.

    A clone of ``clf`` is fitted on a 1d target and another clone on a
    two-column target; in both cases the predicted array must have the
    shape of the target it was fitted on.
    """
    X = np.array([[0]] * 4)  # ignored
    y_1d = np.array([1, 2, 1, 1])
    y_2d = np.array([[1, 0], [2, 0], [1, 0], [1, 3]])

    for y in (y_1d, y_2d):
        est = clone(clf)
        est.fit(X, y)
        y_pred = est.predict(X)
        assert y.shape == y_pred.shape
def _check_behavior_2d_for_constant(clf):
    """Check that predictions match the shape of a five-column 2d target.

    Only the 2d case is exercised: a clone of ``clf`` is fitted on a
    ``(4, 5)`` target and its predictions must have that same shape.
    """
    X = np.array([[0]] * 4)  # ignored
    y = np.array(
        [
            [1, 0, 5, 4, 3],
            [2, 0, 1, 2, 5],
            [1, 0, 4, 5, 2],
            [1, 3, 3, 2, 0],
        ]
    )

    est = clone(clf)
    est.fit(X, y)
    y_pred = est.predict(X)
    assert y.shape == y_pred.shape
def _check_equality_regressor(statistic, y_learn, y_pred_learn, y_test, y_pred_test):
    """Check that train and test predictions both repeat ``statistic``.

    ``statistic`` is tiled once per row of ``y_learn`` (resp. ``y_test``)
    and compared, up to numerical precision, with ``y_pred_learn``
    (resp. ``y_pred_test``).
    """
    for y_true, y_pred in ((y_learn, y_pred_learn), (y_test, y_pred_test)):
        expected = np.tile(statistic, (y_true.shape[0], 1))
        assert_array_almost_equal(expected, y_pred)
def test_feature_names_in_and_n_features_in_(global_random_seed, n_samples=10):
    """Fitting on a DataFrame sets `feature_names_in_` and `n_features_in_`."""
    pd = pytest.importorskip("pandas")

    rng = np.random.RandomState(seed=global_random_seed)
    X = pd.DataFrame([[0]] * n_samples, columns=["feature_1"])
    y = rng.rand(n_samples)

    # Both dummy estimators must expose the two fitted attributes.
    for estimator_class in (DummyRegressor, DummyClassifier):
        est = estimator_class().fit(X, y)
        assert hasattr(est, "feature_names_in_")
        assert hasattr(est, "n_features_in_")
def test_most_frequent_and_prior_strategy():
    """Both strategies predict the majority label; only the probas differ."""
    X = [[0]] * 4  # ignored
    y = [1, 2, 1, 1]

    for strategy in ("most_frequent", "prior"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)
        assert_array_equal(clf.predict(X), np.ones(len(X)))
        _check_predict_proba(clf, X, y)

        prior_row = clf.class_prior_.reshape((1, -1))
        if strategy == "prior":
            # "prior" reports the class prior itself.
            expected_proba = prior_row
        else:
            # "most_frequent" reports an indicator of the dominant class.
            expected_proba = prior_row > 0.5
        assert_array_almost_equal(clf.predict_proba([X[0]]), expected_proba)
def test_most_frequent_and_prior_strategy_with_2d_column_y():
    """A single-column 2d target yields the same predictions as a 1d target.

    Non-regression test added in
    https://github.com/scikit-learn/scikit-learn/pull/13545
    """
    X = [[0]] * 4
    y_1d = [1, 2, 1, 1]
    y_2d = [[1], [2], [1], [1]]

    for strategy in ("most_frequent", "prior"):
        clf_1d = DummyClassifier(strategy=strategy, random_state=0)
        clf_2d = DummyClassifier(strategy=strategy, random_state=0)
        clf_1d.fit(X, y_1d)
        clf_2d.fit(X, y_2d)

        pred_1d = clf_1d.predict(X)
        pred_2d = clf_2d.predict(X)
        assert_array_equal(pred_1d, pred_2d)
def test_most_frequent_and_prior_strategy_multioutput():
    """Each output column is predicted as its own most frequent label."""
    X = [[0]] * 4  # ignored
    y = np.array([[1, 0], [2, 0], [1, 0], [1, 3]])

    n_samples = len(X)
    # The majority label is 1 in the first column and 0 in the second.
    expected = np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])

    for strategy in ("prior", "most_frequent"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)
        assert_array_equal(clf.predict(X), expected)
        _check_predict_proba(clf, X, y)
        _check_behavior_2d(clf)
def test_stratified_strategy(global_random_seed):
    """Stratified predictions follow the label frequencies seen in training."""
    X_train = [[0]] * 5  # ignored
    y = [1, 2, 1, 1, 2]
    clf = DummyClassifier(strategy="stratified", random_state=global_random_seed)
    clf.fit(X_train, y)

    X_test = [[0]] * 500
    y_pred = clf.predict(X_test)

    # Empirical label frequencies over the 500 predictions.
    frequencies = np.bincount(y_pred) / len(X_test)
    assert_almost_equal(frequencies[1], 3.0 / 5, decimal=1)
    assert_almost_equal(frequencies[2], 2.0 / 5, decimal=1)
    _check_predict_proba(clf, X_test, y)
def test_stratified_strategy_multioutput(global_random_seed):
    """Stratified predictions follow per-column label frequencies."""
    X_train = [[0]] * 5  # ignored
    y = np.array([[2, 1], [2, 2], [1, 1], [1, 2], [1, 1]])
    clf = DummyClassifier(strategy="stratified", random_state=global_random_seed)
    clf.fit(X_train, y)

    X_test = [[0]] * 500
    y_pred = clf.predict(X_test)
    n_test = len(X_test)

    for k in range(y.shape[1]):
        # Empirical label frequencies of the k-th predicted column.
        frequencies = np.bincount(y_pred[:, k]) / n_test
        assert_almost_equal(frequencies[1], 3.0 / 5, decimal=1)
        assert_almost_equal(frequencies[2], 2.0 / 5, decimal=1)
        _check_predict_proba(clf, X_test, y)

    _check_behavior_2d(clf)
def test_uniform_strategy(global_random_seed):
    """Uniform predictions pick each seen label about half of the time."""
    X_train = [[0]] * 4  # ignored
    y = [1, 2, 1, 1]
    clf = DummyClassifier(strategy="uniform", random_state=global_random_seed)
    clf.fit(X_train, y)

    X_test = [[0]] * 500
    y_pred = clf.predict(X_test)

    # Empirical label frequencies over the 500 predictions.
    frequencies = np.bincount(y_pred) / len(X_test)
    assert_almost_equal(frequencies[1], 0.5, decimal=1)
    assert_almost_equal(frequencies[2], 0.5, decimal=1)
    _check_predict_proba(clf, X_test, y)
def test_uniform_strategy_multioutput(global_random_seed):
    """Uniform predictions are balanced within every output column."""
    X_train = [[0]] * 4  # ignored
    y = np.array([[2, 1], [2, 2], [1, 2], [1, 1]])
    clf = DummyClassifier(strategy="uniform", random_state=global_random_seed)
    clf.fit(X_train, y)

    X_test = [[0]] * 500
    y_pred = clf.predict(X_test)
    n_test = len(X_test)

    for k in range(y.shape[1]):
        # Empirical label frequencies of the k-th predicted column.
        frequencies = np.bincount(y_pred[:, k]) / n_test
        assert_almost_equal(frequencies[1], 0.5, decimal=1)
        assert_almost_equal(frequencies[2], 0.5, decimal=1)
        _check_predict_proba(clf, X_test, y)

    _check_behavior_2d(clf)
def test_string_labels():
    """The most frequent strategy also works with string labels."""
    n_samples = 5
    X = [[0]] * n_samples
    y = ["paris", "paris", "tokyo", "amsterdam", "berlin"]

    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(X, y)

    expected = ["paris"] * n_samples
    assert_array_equal(clf.predict(X), expected)
@pytest.mark.parametrize(
    "y,y_test",
    [
        ([2, 1, 1, 1], [2, 2, 1, 1]),
        (
            np.array([[2, 2], [1, 1], [1, 1], [1, 1]]),
            np.array([[2, 2], [2, 2], [1, 1], [1, 1]]),
        ),
    ],
)
def test_classifier_score_with_None(y, y_test):
    """`fit` and `score` accept ``X=None`` for 1d and 2d targets."""
    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(None, y)

    # Half of the test targets equal the majority label seen in training.
    accuracy = clf.score(None, y_test)
    assert accuracy == 0.5
@pytest.mark.parametrize(
    "strategy", ["stratified", "most_frequent", "prior", "uniform", "constant"]
)
def test_classifier_prediction_independent_of_X(strategy, global_random_seed):
    """Identically configured classifiers predict the same for different X."""
    y = [0, 2, 1, 1]

    predictions = []
    # Fit and predict twice, with feature values 0 and then 1.
    for feature_value in (0, 1):
        X = [[feature_value]] * 4
        clf = DummyClassifier(
            strategy=strategy, random_state=global_random_seed, constant=0
        )
        clf.fit(X, y)
        predictions.append(clf.predict(X))

    assert_array_equal(predictions[0], predictions[1])
def test_mean_strategy_regressor(global_random_seed):
    """The default regressor predicts the mean of the training targets."""
    rng = np.random.RandomState(seed=global_random_seed)

    X = [[0]] * 4  # ignored
    y = rng.randn(4)

    reg = DummyRegressor()
    reg.fit(X, y)

    expected = [np.mean(y)] * len(X)
    assert_array_equal(reg.predict(X), expected)
def test_mean_strategy_multioutput_regressor(global_random_seed):
    """With a 2d target the regressor predicts the column-wise mean."""
    rng = np.random.RandomState(seed=global_random_seed)

    # The draw order (learn set first, then test set) is kept as is so the
    # generated data stay the same for a given seed.
    X_learn = rng.randn(10, 10)
    y_learn = rng.randn(10, 5)
    X_test = rng.randn(20, 10)
    y_test = rng.randn(20, 5)

    expected_mean = np.mean(y_learn, axis=0).reshape((1, -1))

    # Correctness oracle
    est = DummyRegressor()
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(
        expected_mean, y_learn, y_pred_learn, y_test, y_pred_test
    )
    _check_behavior_2d(est)
def test_regressor_exceptions():
    """Predicting with an unfitted regressor raises `NotFittedError`."""
    unfitted = DummyRegressor()
    with pytest.raises(NotFittedError):
        unfitted.predict([])
def test_median_strategy_regressor(global_random_seed):
    """The median strategy predicts the median of the training targets."""
    rng = np.random.RandomState(seed=global_random_seed)

    X = [[0]] * 5  # ignored
    y = rng.randn(5)

    reg = DummyRegressor(strategy="median")
    reg.fit(X, y)

    expected = [np.median(y)] * len(X)
    assert_array_equal(reg.predict(X), expected)
def test_median_strategy_multioutput_regressor(global_random_seed):
    """With a 2d target the median strategy predicts column-wise medians."""
    rng = np.random.RandomState(seed=global_random_seed)

    # The draw order (learn set first, then test set) is kept as is so the
    # generated data stay the same for a given seed.
    X_learn = rng.randn(10, 10)
    y_learn = rng.randn(10, 5)
    X_test = rng.randn(20, 10)
    y_test = rng.randn(20, 5)

    expected_median = np.median(y_learn, axis=0).reshape((1, -1))

    # Correctness oracle
    est = DummyRegressor(strategy="median")
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(
        expected_median, y_learn, y_pred_learn, y_test, y_pred_test
    )
    _check_behavior_2d(est)
def test_quantile_strategy_regressor(global_random_seed):
    """The quantile strategy matches median, min, max and a percentile."""
    rng = np.random.RandomState(seed=global_random_seed)

    X = [[0]] * 5  # ignored
    y = rng.randn(5)

    # Each requested quantile is paired with the statistic it must reproduce.
    cases = [
        (0.5, np.median(y)),
        (0, np.min(y)),
        (1, np.max(y)),
        (0.3, np.percentile(y, q=30)),
    ]

    for quantile, expected_value in cases:
        reg = DummyRegressor(strategy="quantile", quantile=quantile)
        reg.fit(X, y)
        assert_array_equal(reg.predict(X), [expected_value] * len(X))
def test_quantile_strategy_multioutput_regressor(global_random_seed):
    """With a 2d target the quantile strategy works column by column."""
    rng = np.random.RandomState(seed=global_random_seed)

    # The draw order (learn set first, then test set) is kept as is so the
    # generated data stay the same for a given seed.
    X_learn = rng.randn(10, 10)
    y_learn = rng.randn(10, 5)
    X_test = rng.randn(20, 10)
    y_test = rng.randn(20, 5)

    expected_median = np.median(y_learn, axis=0).reshape((1, -1))
    expected_q80 = np.percentile(y_learn, axis=0, q=80).reshape((1, -1))

    # Correctness oracle for the 0.5 quantile, then for the 0.8 quantile.
    for quantile, expected in ((0.5, expected_median), (0.8, expected_q80)):
        est = DummyRegressor(strategy="quantile", quantile=quantile)
        est.fit(X_learn, y_learn)
        y_pred_learn = est.predict(X_learn)
        y_pred_test = est.predict(X_test)

        _check_equality_regressor(
            expected, y_learn, y_pred_learn, y_test, y_pred_test
        )
        _check_behavior_2d(est)
def test_quantile_invalid():
    """Fitting the quantile strategy with ``quantile=None`` raises ValueError."""
    n_samples = 5
    X = [[0]] * n_samples  # ignored
    y = [0] * n_samples  # ignored

    est = DummyRegressor(strategy="quantile", quantile=None)
    expected_message = (
        "When using `strategy='quantile', you have to specify the desired quantile"
    )
    with pytest.raises(ValueError, match=expected_message):
        est.fit(X, y)
def test_quantile_strategy_empty_train():
    """Fitting the quantile strategy on empty data raises `IndexError`."""
    regressor = DummyRegressor(strategy="quantile", quantile=0.4)
    with pytest.raises(IndexError):
        regressor.fit([], [])
def test_constant_strategy_regressor(global_random_seed):
    """The constant strategy accepts the constant as a list or as a scalar."""
    rng = np.random.RandomState(seed=global_random_seed)

    X = [[0]] * 5  # ignored
    y = rng.randn(5)
    expected = [43] * len(X)

    # Constant given as a one-element list.
    reg_from_list = DummyRegressor(strategy="constant", constant=[43])
    reg_from_list.fit(X, y)
    assert_array_equal(reg_from_list.predict(X), expected)

    # Constant given as a plain scalar.
    reg_from_scalar = DummyRegressor(strategy="constant", constant=43)
    reg_from_scalar.fit(X, y)
    assert_array_equal(reg_from_scalar.predict(X), expected)

    # non-regression test for #22478
    assert not isinstance(reg_from_scalar.constant, np.ndarray)
def test_constant_strategy_multioutput_regressor(global_random_seed):
    """With a 2d target the constant strategy predicts one value per column."""
    rng = np.random.RandomState(seed=global_random_seed)

    # The draw order below is kept as is so the generated data stay the
    # same for a given seed.
    X_learn = rng.randn(10, 10)
    y_learn = rng.randn(10, 5)

    # One constant per output column of the 2d target.
    constants = rng.randn(5)

    X_test = rng.randn(20, 10)
    y_test = rng.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="constant", constant=constants)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(constants, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d_for_constant(est)
def test_y_mean_attribute_regressor():
    """With ``strategy='mean'`` the fitted `constant_` equals the target mean."""
    y = [1, 2, 4, 6, 8]
    X = [[0]] * len(y)

    est = DummyRegressor(strategy="mean")
    est.fit(X, y)

    assert est.constant_ == np.mean(y)
def test_constants_not_specified_regressor():
    """The constant strategy without a constant raises `TypeError` on fit."""
    y = [1, 2, 4, 6, 8]
    X = [[0]] * len(y)

    est = DummyRegressor(strategy="constant")
    expected_message = "Constant target value has to be specified"
    with pytest.raises(TypeError, match=expected_message):
        est.fit(X, y)
def test_constant_size_multioutput_regressor(global_random_seed):
    """A constant with four values is rejected for a five-column target."""
    rng = np.random.RandomState(seed=global_random_seed)
    X = rng.randn(10, 10)
    y = rng.randn(10, 5)

    est = DummyRegressor(strategy="constant", constant=[1, 2, 3, 4])
    expected_message = r"Constant target value should have shape \(5, 1\)."
    with pytest.raises(ValueError, match=expected_message):
        est.fit(X, y)
def test_constant_strategy():
    """The constant strategy always predicts the given label.

    Checked once with integer labels and once with string labels.
    """
    X = [[0]] * 4  # ignored

    # Integer labels.
    y = [2, 1, 2, 2]
    clf = DummyClassifier(strategy="constant", random_state=0, constant=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), np.ones(len(X)))
    _check_predict_proba(clf, X, y)

    # String labels.
    y = ["two", "one", "two", "two"]
    clf = DummyClassifier(strategy="constant", random_state=0, constant="one")
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), np.array(["one"] * 4))
    _check_predict_proba(clf, X, y)
def test_constant_strategy_multioutput():
    """With a 2d target each column is predicted as its own constant."""
    X = [[0]] * 4  # ignored
    y = np.array([[2, 3], [1, 3], [2, 3], [2, 0]])

    clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0])
    clf.fit(X, y)

    n_samples = len(X)
    # First column is all ones, second column is all zeros.
    expected = np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])
    assert_array_equal(clf.predict(X), expected)
    _check_predict_proba(clf, X, y)
@pytest.mark.parametrize(
    "y, params, err_msg",
    [
        ([2, 1, 2, 2], {"random_state": 0}, "Constant.*has to be specified"),
        ([2, 1, 2, 2], {"constant": [2, 0]}, "Constant.*should have shape"),
        (
            np.transpose([[2, 1, 2, 2], [2, 1, 2, 2]]),
            {"constant": 2},
            "Constant.*should have shape",
        ),
        (
            [2, 1, 2, 2],
            {"constant": "my-constant"},
            "constant=my-constant.*Possible values.*\\[1, 2]",
        ),
        (
            np.transpose([[2, 1, 2, 2], [2, 1, 2, 2]]),
            {"constant": [2, "unknown"]},
            "constant=\\[2, 'unknown'].*Possible values.*\\[1, 2]",
        ),
    ],
    ids=[
        "no-constant",
        "too-many-constant",
        "not-enough-output",
        "single-output",
        "multi-output",
    ],
)
def test_constant_strategy_exceptions(y, params, err_msg):
    """Invalid `constant` settings raise `ValueError` with a matching message."""
    X = [[0]] * 4
    classifier = DummyClassifier(strategy="constant", **params)

    # The error is expected at fit time, not at construction.
    with pytest.raises(ValueError, match=err_msg):
        classifier.fit(X, y)
def test_classification_sample_weight():
    """`class_prior_` is computed from the sample weights of each class."""
    X = [[0], [0], [1]]
    y = [0, 1, 0]
    sample_weight = [0.1, 1.0, 0.1]

    clf = DummyClassifier(strategy="stratified").fit(X, y, sample_weight)

    # Class 0 carries weight 0.2 and class 1 carries 1.0, out of 1.2 total.
    expected_prior = [0.2 / 1.2, 1.0 / 1.2]
    assert_array_almost_equal(clf.class_prior_, expected_prior)
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_constant_strategy_sparse_target(csc_container):
    """With a sparse target the constant strategy predicts a sparse matrix."""
    X = [[0]] * 5  # ignored
    y = csc_container(np.array([[0, 1], [4, 0], [1, 1], [1, 4], [1, 1]]))

    clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0])
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert sp.issparse(y_pred)

    n_samples = len(X)
    # First column is all ones, second column is all zeros.
    expected = np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])
    assert_array_equal(y_pred.toarray(), expected)
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_uniform_strategy_sparse_target_warning(global_random_seed, csc_container):
    """Fitting the uniform strategy on a sparse target warns, then predicts."""
    X_train = [[0]] * 5  # ignored
    y = csc_container(np.array([[2, 1], [2, 2], [1, 4], [4, 2], [1, 1]]))

    clf = DummyClassifier(strategy="uniform", random_state=global_random_seed)
    with pytest.warns(UserWarning, match="the uniform strategy would not save memory"):
        clf.fit(X_train, y)

    X_test = [[0]] * 500
    y_pred = clf.predict(X_test)
    n_test = len(X_test)

    for k in range(y.shape[1]):
        # Each of the three labels should come up about a third of the time.
        frequencies = np.bincount(y_pred[:, k]) / n_test
        assert_almost_equal(frequencies[1], 1 / 3, decimal=1)
        assert_almost_equal(frequencies[2], 1 / 3, decimal=1)
        assert_almost_equal(frequencies[4], 1 / 3, decimal=1)
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_stratified_strategy_sparse_target(global_random_seed, csc_container):
    """With a sparse target, stratified predictions are sparse and stratified."""
    X_train = [[0]] * 5  # ignored
    y = csc_container(np.array([[4, 1], [0, 0], [1, 1], [1, 4], [1, 1]]))

    clf = DummyClassifier(strategy="stratified", random_state=global_random_seed)
    clf.fit(X_train, y)

    X_test = [[0]] * 500
    y_pred = clf.predict(X_test)
    assert sp.issparse(y_pred)
    y_pred_dense = y_pred.toarray()
    n_test = len(X_test)

    for k in range(y.shape[1]):
        # Empirical label frequencies of the k-th predicted column.
        frequencies = np.bincount(y_pred_dense[:, k]) / n_test
        assert_almost_equal(frequencies[1], 3.0 / 5, decimal=1)
        assert_almost_equal(frequencies[0], 1.0 / 5, decimal=1)
        assert_almost_equal(frequencies[4], 1.0 / 5, decimal=1)
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_most_frequent_and_prior_strategy_sparse_target(csc_container):
    """With a sparse target both strategies predict a sparse majority label."""
    X = [[0]] * 5  # ignored
    y = csc_container(np.array([[1, 0], [1, 3], [4, 0], [0, 1], [1, 0]]))

    n_samples = len(X)
    # The majority label is 1 in the first column and 0 in the second.
    y_expected = np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])

    for strategy in ("most_frequent", "prior"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)

        y_pred = clf.predict(X)
        assert sp.issparse(y_pred)
        assert_array_equal(y_pred.toarray(), y_expected)
def test_dummy_regressor_sample_weight(global_random_seed, n_samples=10):
    """The mean, median and quantile strategies honour `sample_weight`."""
    rng = np.random.RandomState(seed=global_random_seed)

    X = [[0]] * n_samples
    y = rng.rand(n_samples)
    sample_weight = rng.rand(n_samples)

    # Weighted mean.
    est = DummyRegressor(strategy="mean").fit(X, y, sample_weight)
    expected_mean = np.average(y, weights=sample_weight)
    assert est.constant_ == expected_mean

    # Weighted median, i.e. the 50th weighted percentile.
    est = DummyRegressor(strategy="median").fit(X, y, sample_weight)
    expected_median = _weighted_percentile(y, sample_weight, 50.0)
    assert est.constant_ == expected_median

    # Weighted 0.95 quantile, i.e. the 95th weighted percentile.
    est = DummyRegressor(strategy="quantile", quantile=0.95).fit(X, y, sample_weight)
    expected_quantile = _weighted_percentile(y, sample_weight, 95.0)
    assert est.constant_ == expected_quantile
def test_dummy_regressor_on_3D_array():
    """The regressor fits and predicts on a 3D array of strings as X."""
    X = np.array([[["foo"]], [["bar"]], [["baz"]]])
    y = np.array([2, 2, 2])
    y_expected = np.array([2, 2, 2])

    reg = DummyRegressor()
    reg.fit(X, y)

    assert_array_equal(reg.predict(X), y_expected)
def test_dummy_classifier_on_3D_array():
    """The classifier fits and predicts on a 3D array of strings as X."""
    X = np.array([[["foo"]], [["bar"]], [["baz"]]])
    y = [2, 2, 2]
    y_expected = [2, 2, 2]
    # With a single label, the only probability column is all ones.
    y_proba_expected = [[1], [1], [1]]

    clf = DummyClassifier(strategy="stratified")
    clf.fit(X, y)
    y_pred = clf.predict(X)
    y_pred_proba = clf.predict_proba(X)

    assert_array_equal(y_pred, y_expected)
    assert_array_equal(y_pred_proba, y_proba_expected)
def test_dummy_regressor_return_std():
    """``predict(..., return_std=True)`` returns a pair with a zero std."""
    X = [[0]] * 3  # ignored
    y = np.array([2, 2, 2])
    y_std_expected = np.array([0, 0, 0])

    reg = DummyRegressor()
    reg.fit(X, y)
    result = reg.predict(X, return_std=True)

    # there should be two elements when return_std is True
    assert len(result) == 2
    # the second element should be all zeros
    assert_array_equal(result[1], y_std_expected)
@pytest.mark.parametrize(
    "y,y_test",
    [
        ([1, 1, 1, 2], [1.25] * 4),
        (np.array([[2, 2], [1, 1], [1, 1], [1, 1]]), [[1.25, 1.25]] * 4),
    ],
)
def test_regressor_score_with_None(y, y_test):
    """`fit` and `score` accept ``X=None`` for 1d and 2d targets."""
    reg = DummyRegressor()
    reg.fit(None, y)

    # The test targets equal the training mean (1.25), so the score is 1.
    score = reg.score(None, y_test)
    assert score == 1.0
@pytest.mark.parametrize("strategy", ["mean", "median", "quantile", "constant"])
def test_regressor_prediction_independent_of_X(strategy):
    """Identically configured regressors predict the same for different X."""
    y = [0, 2, 1, 1]

    predictions = []
    # Fit and predict twice, with feature values 0 and then 1.
    for feature_value in (0, 1):
        X = [[feature_value]] * 4
        reg = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
        reg.fit(X, y)
        predictions.append(reg.predict(X))

    assert_array_equal(predictions[0], predictions[1])
@pytest.mark.parametrize(
    "strategy", ["stratified", "most_frequent", "prior", "uniform", "constant"]
)
def test_dtype_of_classifier_probas(strategy):
    """`predict_proba` returns float64 for every classifier strategy."""
    X = np.zeros(4)
    y = [0, 2, 1, 1]

    model = DummyClassifier(strategy=strategy, random_state=0, constant=0)
    model.fit(X, y)
    probas = model.predict_proba(X)

    assert probas.dtype == np.float64