import numpy as np
import pytest
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm._bounds import l1_min_c
from sklearn.svm._newrand import bounded_rand_int_wrap, set_seed_wrap
from sklearn.utils.fixes import CSR_CONTAINERS

dense_X = [[-1, 0], [0, 1], [1, 1], [1, 1]]
Y1 = [0, 1, 1, 1]
Y2 = [2, 1, 0, 0]
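
# Context for the l1_min_c tests below: l1_min_c returns the lowest bound for C
# such that, for any C above it, an L1-penalized linear model (liblinear
# LogisticRegression or LinearSVC) is guaranteed to have at least one non-zero
# coefficient; at or below the bound, the penalty zeroes the model out entirely.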
@pytest.mark.parametrize("X_container", CSR_CONTAINERS + [np.array])
@pytest.mark.parametrize("loss", ["squared_hinge", "log"])
@pytest.mark.parametrize("Y_label", ["two-classes", "multi-class"])
@pytest.mark.parametrize("intercept_label", ["no-intercept", "fit-intercept"])
def test_l1_min_c(X_container, loss, Y_label, intercept_label):
    Ys = {"two-classes": Y1, "multi-class": Y2}
    intercepts = {
        "no-intercept": {"fit_intercept": False},
        "fit-intercept": {"fit_intercept": True, "intercept_scaling": 10},
    }

    X = X_container(dense_X)
    Y = Ys[Y_label]
    intercept_params = intercepts[intercept_label]

    check_l1_min_c(X, Y, loss, **intercept_params)


def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=1.0):
    min_c = l1_min_c(
        X,
        y,
        loss=loss,
        fit_intercept=fit_intercept,
        intercept_scaling=intercept_scaling,
    )

    clf = {
        "log": LogisticRegression(penalty="l1", solver="liblinear"),
        "squared_hinge": LinearSVC(loss="squared_hinge", penalty="l1", dual=False),
    }[loss]

    clf.fit_intercept = fit_intercept
    clf.intercept_scaling = intercept_scaling
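
    # At C = min_c, the L1 penalty should still force every coefficient and the
    # intercept to exactly zero.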
    clf.C = min_c
    clf.fit(X, y)
    assert (np.asarray(clf.coef_) == 0).all()
    assert (np.asarray(clf.intercept_) == 0).all()
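
    # Just above min_c, at least one coefficient or the intercept should become
    # non-zero.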
    clf.C = min_c * 1.01
    clf.fit(X, y)
    assert (np.asarray(clf.coef_) != 0).any() or (np.asarray(clf.intercept_) != 0).any()
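

# Illustrative usage sketch (not part of the original suite): l1_min_c is
# typically used to start a regularization path just above the all-zeros
# solution, along the lines of:
#
#     cs = l1_min_c(X, y, loss="log") * np.logspace(0, 3, 16)
#     for c in cs:
#         LogisticRegression(penalty="l1", solver="liblinear", C=c).fit(X, y)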


def test_ill_posed_min_c():
    X = [[0, 0], [0, 0]]
    y = [0, 1]
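    # With X identically zero, max |y^T X| is 0, so a finite lower bound on C
    # (which, roughly, scales like 1 / max |y^T X|) does not exist and
    # l1_min_c should raise.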
    with pytest.raises(ValueError):
        l1_min_c(X, y)


_MAX_UNSIGNED_INT = 4294967295  # 2**32 - 1


def test_newrand_default():
    """Test that `bounded_rand_int_wrap` without seeding respects the range.

    Note: this test should pass whether it is executed alone or together with
    other tests that call `set_seed_wrap` explicitly, in any order, because it
    checks invariants of the RNG rather than specific values.
    """
    generated = [bounded_rand_int_wrap(100) for _ in range(10)]
    assert all(0 <= x < 100 for x in generated)
    assert not all(x == generated[0] for x in generated)
@pytest.mark.parametrize("seed, expected", [(0, 54), (_MAX_UNSIGNED_INT, 9)])
def test_newrand_set_seed(seed, expected):
"""Test that `set_seed` produces deterministic results"""
set_seed_wrap(seed)
generated = bounded_rand_int_wrap(100)
assert generated == expected
@pytest.mark.parametrize("seed", [-1, _MAX_UNSIGNED_INT + 1])
def test_newrand_set_seed_overflow(seed):
"""Test that `set_seed_wrap` is defined for unsigned 32bits ints"""
with pytest.raises(OverflowError):
set_seed_wrap(seed)
@pytest.mark.parametrize("range_, n_pts", [(_MAX_UNSIGNED_INT, 10000), (100, 25)])
def test_newrand_bounded_rand_int(range_, n_pts):
"""Test that `bounded_rand_int` follows a uniform distribution"""
# XXX: this test is very seed sensitive: either it is wrong (too strict?)
# or the wrapped RNG is not uniform enough, at least on some platforms.
set_seed_wrap(42)
n_iter = 100
ks_pvals = []
uniform_dist = stats.uniform(loc=0, scale=range_)
# perform multiple samplings to make chance of outlier sampling negligible
for _ in range(n_iter):
# Deterministic random sampling
sample = [bounded_rand_int_wrap(range_) for _ in range(n_pts)]
res = stats.kstest(sample, uniform_dist.cdf)
ks_pvals.append(res.pvalue)
# Null hypothesis = samples come from an uniform distribution.
# Under the null hypothesis, p-values should be uniformly distributed
# and not concentrated on low values
# (this may seem counter-intuitive but is backed by multiple refs)
# So we can do two checks:
# (1) check uniformity of p-values
uniform_p_vals_dist = stats.uniform(loc=0, scale=1)
res_pvals = stats.kstest(ks_pvals, uniform_p_vals_dist.cdf)
assert res_pvals.pvalue > 0.05, (
"Null hypothesis rejected: generated random numbers are not uniform."
" Details: the (meta) p-value of the test of uniform distribution"
f" of p-values is {res_pvals.pvalue} which is not > 0.05"
)
# (2) (safety belt) check that 90% of p-values are above 0.05
min_10pct_pval = np.percentile(ks_pvals, q=10)
# lower 10th quantile pvalue <= 0.05 means that the test rejects the
# null hypothesis that the sample came from the uniform distribution
assert min_10pct_pval > 0.05, (
"Null hypothesis rejected: generated random numbers are not uniform. "
f"Details: lower 10th quantile p-value of {min_10pct_pval} not > 0.05."
)
@pytest.mark.parametrize("range_", [-1, _MAX_UNSIGNED_INT + 1])
def test_newrand_bounded_rand_int_limits(range_):
"""Test that `bounded_rand_int_wrap` is defined for unsigned 32bits ints"""
with pytest.raises(OverflowError):
bounded_rand_int_wrap(range_)