|
""" |
|
Test the pipeline module. |
|
""" |
|
|
|
import itertools |
|
import re |
|
import shutil |
|
import time |
|
import warnings |
|
from tempfile import mkdtemp |
|
|
|
import joblib |
|
import numpy as np |
|
import pytest |
|
|
|
from sklearn import config_context |
|
from sklearn.base import ( |
|
BaseEstimator, |
|
ClassifierMixin, |
|
TransformerMixin, |
|
clone, |
|
is_classifier, |
|
is_regressor, |
|
) |
|
from sklearn.cluster import KMeans |
|
from sklearn.datasets import load_iris |
|
from sklearn.decomposition import PCA, TruncatedSVD |
|
from sklearn.dummy import DummyRegressor |
|
from sklearn.ensemble import ( |
|
HistGradientBoostingClassifier, |
|
RandomForestClassifier, |
|
RandomTreesEmbedding, |
|
) |
|
from sklearn.exceptions import NotFittedError, UnsetMetadataPassedError |
|
from sklearn.feature_extraction.text import CountVectorizer |
|
from sklearn.feature_selection import SelectKBest, f_classif |
|
from sklearn.impute import SimpleImputer |
|
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression |
|
from sklearn.metrics import accuracy_score, r2_score |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.neighbors import LocalOutlierFactor |
|
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline, make_union |
|
from sklearn.preprocessing import FunctionTransformer, StandardScaler |
|
from sklearn.svm import SVC |
|
from sklearn.tests.metadata_routing_common import ( |
|
ConsumingNoFitTransformTransformer, |
|
ConsumingTransformer, |
|
_Registry, |
|
check_recorded_metadata, |
|
) |
|
from sklearn.utils import get_tags |
|
from sklearn.utils._metadata_requests import COMPOSITE_METHODS, METHODS |
|
from sklearn.utils._testing import ( |
|
MinimalClassifier, |
|
MinimalRegressor, |
|
MinimalTransformer, |
|
assert_allclose, |
|
assert_array_almost_equal, |
|
assert_array_equal, |
|
) |
|
from sklearn.utils.fixes import CSR_CONTAINERS |
|
from sklearn.utils.validation import _check_feature_names, check_is_fitted |
|
|
|
|
|
|
|
|
|
iris = load_iris() |
|
iris.data.flags.writeable = False |
|
iris.target.flags.writeable = False |
|
|
|
|
|
JUNK_FOOD_DOCS = ( |
|
"the pizza pizza beer copyright", |
|
"the pizza burger beer copyright", |
|
"the the pizza beer beer copyright", |
|
"the burger beer beer copyright", |
|
"the coke burger coke copyright", |
|
"the coke burger burger", |
|
) |
|
|
|
|
|
class NoFit(BaseEstimator): |
|
"""Small class to test parameter dispatching.""" |
|
|
|
def __init__(self, a=None, b=None): |
|
self.a = a |
|
self.b = b |
|
|
|
|
|
class NoTrans(NoFit): |
|
def fit(self, X, y=None): |
|
return self |
|
|
|
def get_params(self, deep=False): |
|
return {"a": self.a, "b": self.b} |
|
|
|
def set_params(self, **params): |
|
self.a = params["a"] |
|
return self |
|
|
|
|
|
class NoInvTransf(TransformerMixin, NoTrans): |
|
def transform(self, X): |
|
return X |
|
|
|
|
|
class Transf(NoInvTransf): |
|
def transform(self, X): |
|
return X |
|
|
|
def inverse_transform(self, X): |
|
return X |
|
|
|
|
|
class TransfFitParams(Transf): |
|
def fit(self, X, y=None, **fit_params): |
|
self.fit_params = fit_params |
|
return self |
|
|
|
|
|
class Mult(TransformerMixin, BaseEstimator): |
|
def __init__(self, mult=1): |
|
self.mult = mult |
|
|
|
def __sklearn_is_fitted__(self): |
|
return True |
|
|
|
def fit(self, X, y=None): |
|
return self |
|
|
|
def transform(self, X): |
|
return np.asarray(X) * self.mult |
|
|
|
def inverse_transform(self, X): |
|
return np.asarray(X) / self.mult |
|
|
|
def predict(self, X): |
|
return (np.asarray(X) * self.mult).sum(axis=1) |
|
|
|
predict_proba = predict_log_proba = decision_function = predict |
|
|
|
def score(self, X, y=None): |
|
return np.sum(X) |
|
|
|
|
|
class FitParamT(BaseEstimator): |
|
"""Mock classifier""" |
|
|
|
def __init__(self): |
|
self.successful = False |
|
|
|
def fit(self, X, y, should_succeed=False): |
|
self.successful = should_succeed |
|
self.fitted_ = True |
|
|
|
def predict(self, X): |
|
return self.successful |
|
|
|
def fit_predict(self, X, y, should_succeed=False): |
|
self.fit(X, y, should_succeed=should_succeed) |
|
return self.predict(X) |
|
|
|
def score(self, X, y=None, sample_weight=None): |
|
if sample_weight is not None: |
|
X = X * sample_weight |
|
return np.sum(X) |
|
|
|
|
|
class DummyTransf(Transf): |
|
"""Transformer which store the column means""" |
|
|
|
def fit(self, X, y): |
|
self.means_ = np.mean(X, axis=0) |
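        # Store a fit timestamp so the caching tests below can detect
        # whether fit was re-run or served from the cache.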
|
|
|
|
|
self.timestamp_ = time.time() |
|
return self |
|
|
|
|
|
class DummyEstimatorParams(BaseEstimator): |
|
"""Mock classifier that takes params on predict""" |
|
|
|
def __sklearn_is_fitted__(self): |
|
return True |
|
|
|
def fit(self, X, y): |
|
return self |
|
|
|
def predict(self, X, got_attribute=False): |
|
self.got_attribute = got_attribute |
|
return self |
|
|
|
def predict_proba(self, X, got_attribute=False): |
|
self.got_attribute = got_attribute |
|
return self |
|
|
|
def predict_log_proba(self, X, got_attribute=False): |
|
self.got_attribute = got_attribute |
|
return self |
|
|
|
|
|
def test_pipeline_invalid_parameters(): |
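    """Check the error messages raised for invalid steps and parameters."""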
|
|
|
|
|
pipeline = Pipeline([(1, 1)]) |
|
with pytest.raises(TypeError): |
|
pipeline.fit([[1]], [1]) |
|
|
|
|
|
|
|
msg = ( |
|
"Last step of Pipeline should implement fit " |
|
"or be the string 'passthrough'" |
|
".*NoFit.*" |
|
) |
|
pipeline = Pipeline([("clf", NoFit())]) |
|
with pytest.raises(TypeError, match=msg): |
|
pipeline.fit([[1]], [1]) |
|
|
|
|
|
clf = NoTrans() |
|
pipe = Pipeline([("svc", clf)]) |
|
assert pipe.get_params(deep=True) == dict( |
|
svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False) |
|
) |
|
|
|
|
|
pipe.set_params(svc__a=0.1) |
|
assert clf.a == 0.1 |
|
assert clf.b is None |
|
|
|
repr(pipe) |
|
|
|
|
|
clf = SVC() |
|
filter1 = SelectKBest(f_classif) |
|
pipe = Pipeline([("anova", filter1), ("svc", clf)]) |
|
|
|
|
|
assert pipe.named_steps["anova"] is filter1 |
|
assert pipe.named_steps["svc"] is clf |
|
|
|
|
|
|
|
msg = "All intermediate steps should be transformers.*\\bNoTrans\\b.*" |
|
pipeline = Pipeline([("t", NoTrans()), ("svc", clf)]) |
|
with pytest.raises(TypeError, match=msg): |
|
pipeline.fit([[1]], [1]) |
|
|
|
|
|
pipe.set_params(svc__C=0.1) |
|
assert clf.C == 0.1 |
|
|
|
repr(pipe) |
|
|
|
|
|
msg = re.escape( |
|
"Invalid parameter 'C' for estimator SelectKBest(). Valid parameters are: ['k'," |
|
" 'score_func']." |
|
) |
|
with pytest.raises(ValueError, match=msg): |
|
pipe.set_params(anova__C=0.1) |
|
|
|
|
|
pipe2 = clone(pipe) |
|
assert pipe.named_steps["svc"] is not pipe2.named_steps["svc"] |
|
|
|
|
|
params = pipe.get_params(deep=True) |
|
params2 = pipe2.get_params(deep=True) |
|
|
|
for x in pipe.get_params(deep=False): |
|
params.pop(x) |
|
|
|
for x in pipe2.get_params(deep=False): |
|
params2.pop(x) |
|
|
|
|
|
params.pop("svc") |
|
params.pop("anova") |
|
params2.pop("svc") |
|
params2.pop("anova") |
|
assert params == params2 |
|
|
|
|
|
def test_pipeline_init_tuple(): |
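    """Pipeline accepts its steps as a tuple and still supports set_params."""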
|
|
|
X = np.array([[1, 2]]) |
|
pipe = Pipeline((("transf", Transf()), ("clf", FitParamT()))) |
|
pipe.fit(X, y=None) |
|
pipe.score(X) |
|
|
|
pipe.set_params(transf="passthrough") |
|
pipe.fit(X, y=None) |
|
pipe.score(X) |
|
|
|
|
|
def test_pipeline_methods_anova(): |
|
|
|
X = iris.data |
|
y = iris.target |
|
|
|
clf = LogisticRegression() |
|
filter1 = SelectKBest(f_classif, k=2) |
|
pipe = Pipeline([("anova", filter1), ("logistic", clf)]) |
|
pipe.fit(X, y) |
|
pipe.predict(X) |
|
pipe.predict_proba(X) |
|
pipe.predict_log_proba(X) |
|
pipe.score(X, y) |
|
|
|
|
|
def test_pipeline_fit_params(): |
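    """Fit params prefixed with a step name are routed to that step's fit."""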
|
|
|
pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())]) |
|
pipe.fit(X=None, y=None, clf__should_succeed=True) |
|
|
|
assert pipe.predict(None) |
|
|
|
assert pipe.named_steps["transf"].a is None |
|
assert pipe.named_steps["transf"].b is None |
|
|
|
|
|
msg = re.escape("fit() got an unexpected keyword argument 'bad'") |
|
with pytest.raises(TypeError, match=msg): |
|
pipe.fit(None, None, clf__bad=True) |
|
|
|
|
|
def test_pipeline_sample_weight_supported(): |
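    """`sample_weight` passed to `score` is forwarded to the final step."""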
|
|
|
X = np.array([[1, 2]]) |
|
pipe = Pipeline([("transf", Transf()), ("clf", FitParamT())]) |
|
pipe.fit(X, y=None) |
|
assert pipe.score(X) == 3 |
|
assert pipe.score(X, y=None) == 3 |
|
assert pipe.score(X, y=None, sample_weight=None) == 3 |
|
assert pipe.score(X, sample_weight=np.array([2, 3])) == 8 |
|
|
|
|
|
def test_pipeline_sample_weight_unsupported(): |
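    """A clear error is raised when the final step's `score` does not accept
    `sample_weight`."""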
|
|
|
X = np.array([[1, 2]]) |
|
pipe = Pipeline([("transf", Transf()), ("clf", Mult())]) |
|
pipe.fit(X, y=None) |
|
assert pipe.score(X) == 3 |
|
assert pipe.score(X, sample_weight=None) == 3 |
|
|
|
msg = re.escape("score() got an unexpected keyword argument 'sample_weight'") |
|
with pytest.raises(TypeError, match=msg): |
|
pipe.score(X, sample_weight=np.array([2, 3])) |
|
|
|
|
|
def test_pipeline_raise_set_params_error(): |
|
|
|
pipe = Pipeline([("cls", LinearRegression())]) |
|
|
|
|
|
error_msg = re.escape( |
|
"Invalid parameter 'fake' for estimator Pipeline(steps=[('cls'," |
|
" LinearRegression())]). Valid parameters are: ['memory', 'steps'," |
|
" 'transform_input', 'verbose']." |
|
) |
|
with pytest.raises(ValueError, match=error_msg): |
|
pipe.set_params(fake="nope") |
|
|
|
|
|
|
|
with pytest.raises(ValueError, match=error_msg): |
|
pipe.set_params(fake__estimator="nope") |
|
|
|
|
|
error_msg = re.escape( |
|
"Invalid parameter 'invalid_param' for estimator LinearRegression(). Valid" |
|
" parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive']." |
|
) |
|
with pytest.raises(ValueError, match=error_msg): |
|
pipe.set_params(cls__invalid_param="nope") |
|
|
|
|
|
def test_pipeline_methods_pca_svm(): |
|
|
|
X = iris.data |
|
y = iris.target |
|
|
|
clf = SVC(probability=True, random_state=0) |
|
pca = PCA(svd_solver="full", n_components="mle", whiten=True) |
|
pipe = Pipeline([("pca", pca), ("svc", clf)]) |
|
pipe.fit(X, y) |
|
pipe.predict(X) |
|
pipe.predict_proba(X) |
|
pipe.predict_log_proba(X) |
|
pipe.score(X, y) |
|
|
|
|
|
def test_pipeline_score_samples_pca_lof(): |
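    """`score_samples` on a pipeline matches transforming and scoring manually."""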
|
X = iris.data |
|
|
|
|
|
|
|
pca = PCA(svd_solver="full", n_components="mle", whiten=True) |
|
lof = LocalOutlierFactor(novelty=True) |
|
pipe = Pipeline([("pca", pca), ("lof", lof)]) |
|
pipe.fit(X) |
|
|
|
assert pipe.score_samples(X).shape == (X.shape[0],) |
|
|
|
lof.fit(pca.fit_transform(X)) |
|
assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X))) |
|
|
|
|
|
def test_score_samples_on_pipeline_without_score_samples(): |
|
X = np.array([[1], [2]]) |
|
y = np.array([1, 2]) |
|
|
|
|
|
pipe = make_pipeline(LogisticRegression()) |
|
pipe.fit(X, y) |
|
|
|
inner_msg = "'LogisticRegression' object has no attribute 'score_samples'" |
|
outer_msg = "'Pipeline' has no attribute 'score_samples'" |
|
with pytest.raises(AttributeError, match=outer_msg) as exec_info: |
|
pipe.score_samples(X) |
|
|
|
assert isinstance(exec_info.value.__cause__, AttributeError) |
|
assert inner_msg in str(exec_info.value.__cause__) |
|
|
|
|
|
def test_pipeline_methods_preprocessing_svm(): |
|
|
|
X = iris.data |
|
y = iris.target |
|
n_samples = X.shape[0] |
|
n_classes = len(np.unique(y)) |
|
scaler = StandardScaler() |
|
pca = PCA(n_components=2, svd_solver="randomized", whiten=True) |
|
clf = SVC(probability=True, random_state=0, decision_function_shape="ovr") |
|
|
|
for preprocessing in [scaler, pca]: |
|
pipe = Pipeline([("preprocess", preprocessing), ("svc", clf)]) |
|
pipe.fit(X, y) |
|
|
|
|
|
predict = pipe.predict(X) |
|
assert predict.shape == (n_samples,) |
|
|
|
proba = pipe.predict_proba(X) |
|
assert proba.shape == (n_samples, n_classes) |
|
|
|
log_proba = pipe.predict_log_proba(X) |
|
assert log_proba.shape == (n_samples, n_classes) |
|
|
|
decision_function = pipe.decision_function(X) |
|
assert decision_function.shape == (n_samples, n_classes) |
|
|
|
pipe.score(X, y) |
|
|
|
|
|
def test_fit_predict_on_pipeline(): |
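    """`fit_predict` on a pipeline matches applying the steps separately."""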
|
|
|
|
|
|
|
scaler = StandardScaler() |
|
km = KMeans(random_state=0, n_init="auto") |
|
|
|
|
|
scaler_for_pipeline = StandardScaler() |
|
km_for_pipeline = KMeans(random_state=0, n_init="auto") |
|
|
|
|
|
scaled = scaler.fit_transform(iris.data) |
|
separate_pred = km.fit_predict(scaled) |
|
|
|
|
|
pipe = Pipeline([("scaler", scaler_for_pipeline), ("Kmeans", km_for_pipeline)]) |
|
pipeline_pred = pipe.fit_predict(iris.data) |
|
|
|
assert_array_almost_equal(pipeline_pred, separate_pred) |
|
|
|
|
|
def test_fit_predict_on_pipeline_without_fit_predict(): |
|
|
|
|
|
scaler = StandardScaler() |
|
pca = PCA(svd_solver="full") |
|
pipe = Pipeline([("scaler", scaler), ("pca", pca)]) |
|
|
|
outer_msg = "'Pipeline' has no attribute 'fit_predict'" |
|
inner_msg = "'PCA' object has no attribute 'fit_predict'" |
|
with pytest.raises(AttributeError, match=outer_msg) as exec_info: |
|
getattr(pipe, "fit_predict") |
|
assert isinstance(exec_info.value.__cause__, AttributeError) |
|
assert inner_msg in str(exec_info.value.__cause__) |
|
|
|
|
|
def test_fit_predict_with_intermediate_fit_params(): |
|
|
|
|
|
pipe = Pipeline([("transf", TransfFitParams()), ("clf", FitParamT())]) |
|
pipe.fit_predict( |
|
X=None, y=None, transf__should_get_this=True, clf__should_succeed=True |
|
) |
|
assert pipe.named_steps["transf"].fit_params["should_get_this"] |
|
assert pipe.named_steps["clf"].successful |
|
assert "should_succeed" not in pipe.named_steps["transf"].fit_params |
|
|
|
|
|
@pytest.mark.parametrize( |
|
"method_name", ["predict", "predict_proba", "predict_log_proba"] |
|
) |
|
def test_predict_methods_with_predict_params(method_name): |
|
|
|
|
|
pipe = Pipeline([("transf", Transf()), ("clf", DummyEstimatorParams())]) |
|
pipe.fit(None, None) |
|
method = getattr(pipe, method_name) |
|
method(X=None, got_attribute=True) |
|
|
|
assert pipe.named_steps["clf"].got_attribute |
|
|
|
|
|
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS) |
|
def test_feature_union(csr_container): |
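    """Check FeatureUnion fitting, transform shapes, sparse input, cloning, and
    step validation."""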
|
|
|
X = iris.data.copy() |
|
X -= X.mean(axis=0) |
|
y = iris.target |
|
svd = TruncatedSVD(n_components=2, random_state=0) |
|
select = SelectKBest(k=1) |
|
fs = FeatureUnion([("svd", svd), ("select", select)]) |
|
fs.fit(X, y) |
|
X_transformed = fs.transform(X) |
|
assert X_transformed.shape == (X.shape[0], 3) |
|
|
|
|
|
assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X)) |
|
assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) |
|
|
|
|
|
|
|
fs = FeatureUnion([("svd", svd), ("select", select)]) |
|
X_sp = csr_container(X) |
|
X_sp_transformed = fs.fit_transform(X_sp, y) |
|
assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) |
|
|
|
|
|
fs2 = clone(fs) |
|
assert fs.transformer_list[0][1] is not fs2.transformer_list[0][1] |
|
|
|
|
|
fs.set_params(select__k=2) |
|
assert fs.fit_transform(X, y).shape == (X.shape[0], 4) |
|
|
|
|
|
fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)]) |
|
X_transformed = fs.fit_transform(X, y) |
|
assert X_transformed.shape == (X.shape[0], 8) |
|
|
|
|
|
msg = "All estimators should implement fit and transform.*\\bNoTrans\\b" |
|
fs = FeatureUnion([("transform", Transf()), ("no_transform", NoTrans())]) |
|
with pytest.raises(TypeError, match=msg): |
|
fs.fit(X) |
|
|
|
|
|
fs = FeatureUnion((("svd", svd), ("select", select))) |
|
fs.fit(X, y) |
|
|
|
|
|
def test_feature_union_named_transformers(): |
|
"""Check the behaviour of `named_transformers` attribute.""" |
|
transf = Transf() |
|
noinvtransf = NoInvTransf() |
|
fs = FeatureUnion([("transf", transf), ("noinvtransf", noinvtransf)]) |
|
assert fs.named_transformers["transf"] == transf |
|
assert fs.named_transformers["noinvtransf"] == noinvtransf |
|
|
|
|
|
assert fs.named_transformers.transf == transf |
|
assert fs.named_transformers.noinvtransf == noinvtransf |
|
|
|
|
|
def test_make_union(): |
|
pca = PCA(svd_solver="full") |
|
mock = Transf() |
|
fu = make_union(pca, mock) |
|
names, transformers = zip(*fu.transformer_list) |
|
assert names == ("pca", "transf") |
|
assert transformers == (pca, mock) |
|
|
|
|
|
def test_make_union_kwargs(): |
|
pca = PCA(svd_solver="full") |
|
mock = Transf() |
|
fu = make_union(pca, mock, n_jobs=3) |
|
assert fu.transformer_list == make_union(pca, mock).transformer_list |
|
assert 3 == fu.n_jobs |
|
|
|
|
|
msg = re.escape( |
|
"make_union() got an unexpected keyword argument 'transformer_weights'" |
|
) |
|
with pytest.raises(TypeError, match=msg): |
|
make_union(pca, mock, transformer_weights={"pca": 10, "Transf": 1}) |
|
|
|
|
|
def test_pipeline_transform(): |
|
|
|
|
|
X = iris.data |
|
pca = PCA(n_components=2, svd_solver="full") |
|
pipeline = Pipeline([("pca", pca)]) |
|
|
|
|
|
X_trans = pipeline.fit(X).transform(X) |
|
X_trans2 = pipeline.fit_transform(X) |
|
X_trans3 = pca.fit_transform(X) |
|
assert_array_almost_equal(X_trans, X_trans2) |
|
assert_array_almost_equal(X_trans, X_trans3) |
|
|
|
X_back = pipeline.inverse_transform(X_trans) |
|
X_back2 = pca.inverse_transform(X_trans) |
|
assert_array_almost_equal(X_back, X_back2) |
|
|
|
|
|
def test_pipeline_fit_transform(): |
|
|
|
X = iris.data |
|
y = iris.target |
|
transf = Transf() |
|
pipeline = Pipeline([("mock", transf)]) |
|
|
|
|
|
X_trans = pipeline.fit_transform(X, y) |
|
X_trans2 = transf.fit(X, y).transform(X) |
|
assert_array_almost_equal(X_trans, X_trans2) |
|
|
|
|
|
@pytest.mark.parametrize( |
|
"start, end", [(0, 1), (0, 2), (1, 2), (1, 3), (None, 1), (1, None), (None, None)] |
|
) |
|
def test_pipeline_slice(start, end): |
|
pipe = Pipeline( |
|
[("transf1", Transf()), ("transf2", Transf()), ("clf", FitParamT())], |
|
memory="123", |
|
verbose=True, |
|
) |
|
pipe_slice = pipe[start:end] |
|
|
|
assert isinstance(pipe_slice, Pipeline) |
|
|
|
assert pipe_slice.steps == pipe.steps[start:end] |
|
|
|
assert ( |
|
list(pipe_slice.named_steps.items()) |
|
== list(pipe.named_steps.items())[start:end] |
|
) |
|
|
|
pipe_params = pipe.get_params(deep=False) |
|
pipe_slice_params = pipe_slice.get_params(deep=False) |
|
del pipe_params["steps"] |
|
del pipe_slice_params["steps"] |
|
assert pipe_params == pipe_slice_params |
|
|
|
msg = "Pipeline slicing only supports a step of 1" |
|
with pytest.raises(ValueError, match=msg): |
|
pipe[start:end:-1] |
|
|
|
|
|
def test_pipeline_index(): |
|
transf = Transf() |
|
clf = FitParamT() |
|
pipe = Pipeline([("transf", transf), ("clf", clf)]) |
|
assert pipe[0] == transf |
|
assert pipe["transf"] == transf |
|
assert pipe[-1] == clf |
|
assert pipe["clf"] == clf |
|
|
|
|
|
with pytest.raises(IndexError): |
|
pipe[3] |
|
|
|
|
|
with pytest.raises(KeyError): |
|
pipe["foobar"] |
|
|
|
|
|
def test_set_pipeline_steps(): |
|
transf1 = Transf() |
|
transf2 = Transf() |
|
pipeline = Pipeline([("mock", transf1)]) |
|
assert pipeline.named_steps["mock"] is transf1 |
|
|
|
|
|
pipeline.steps = [("mock2", transf2)] |
|
assert "mock" not in pipeline.named_steps |
|
assert pipeline.named_steps["mock2"] is transf2 |
|
assert [("mock2", transf2)] == pipeline.steps |
|
|
|
|
|
pipeline.set_params(steps=[("mock", transf1)]) |
|
assert [("mock", transf1)] == pipeline.steps |
|
|
|
|
|
pipeline.set_params(mock=transf2) |
|
assert [("mock", transf2)] == pipeline.steps |
|
|
|
|
|
pipeline.set_params(steps=[("junk", ())]) |
|
msg = re.escape( |
|
"Last step of Pipeline should implement fit or be the string 'passthrough'." |
|
) |
|
with pytest.raises(TypeError, match=msg): |
|
pipeline.fit([[1]], [1]) |
|
|
|
msg = "This 'Pipeline' has no attribute 'fit_transform'" |
|
with pytest.raises(AttributeError, match=msg): |
|
pipeline.fit_transform([[1]], [1]) |
|
|
|
|
|
def test_pipeline_named_steps(): |
|
transf = Transf() |
|
mult2 = Mult(mult=2) |
|
pipeline = Pipeline([("mock", transf), ("mult", mult2)]) |
|
|
|
|
|
assert "mock" in pipeline.named_steps |
|
assert "mock2" not in pipeline.named_steps |
|
assert pipeline.named_steps.mock is transf |
|
assert pipeline.named_steps.mult is mult2 |
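    # A step named "values" collides with the dict method on the Bunch, so
    # attribute access returns the bound method rather than the step.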
|
|
|
|
|
pipeline = Pipeline([("values", transf), ("mult", mult2)]) |
|
assert pipeline.named_steps.values is not transf |
|
assert pipeline.named_steps.mult is mult2 |
|
|
|
|
|
@pytest.mark.parametrize("passthrough", [None, "passthrough"]) |
|
def test_pipeline_correctly_adjusts_steps(passthrough): |
|
X = np.array([[1]]) |
|
y = np.array([1]) |
|
mult2 = Mult(mult=2) |
|
mult3 = Mult(mult=3) |
|
mult5 = Mult(mult=5) |
|
|
|
pipeline = Pipeline( |
|
[("m2", mult2), ("bad", passthrough), ("m3", mult3), ("m5", mult5)] |
|
) |
|
|
|
pipeline.fit(X, y) |
|
expected_names = ["m2", "bad", "m3", "m5"] |
|
actual_names = [name for name, _ in pipeline.steps] |
|
assert expected_names == actual_names |
|
|
|
|
|
@pytest.mark.parametrize("passthrough", [None, "passthrough"]) |
|
def test_set_pipeline_step_passthrough(passthrough): |
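    """Steps set to None or 'passthrough' are skipped by all pipeline methods."""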
|
X = np.array([[1]]) |
|
y = np.array([1]) |
|
mult2 = Mult(mult=2) |
|
mult3 = Mult(mult=3) |
|
mult5 = Mult(mult=5) |
|
|
|
def make(): |
|
return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)]) |
|
|
|
pipeline = make() |
|
|
|
exp = 2 * 3 * 5 |
|
assert_array_equal([[exp]], pipeline.fit_transform(X, y)) |
|
assert_array_equal([exp], pipeline.fit(X).predict(X)) |
|
assert_array_equal(X, pipeline.inverse_transform([[exp]])) |
|
|
|
pipeline.set_params(m3=passthrough) |
|
exp = 2 * 5 |
|
assert_array_equal([[exp]], pipeline.fit_transform(X, y)) |
|
assert_array_equal([exp], pipeline.fit(X).predict(X)) |
|
assert_array_equal(X, pipeline.inverse_transform([[exp]])) |
|
assert pipeline.get_params(deep=True) == { |
|
"steps": pipeline.steps, |
|
"m2": mult2, |
|
"m3": passthrough, |
|
"last": mult5, |
|
"memory": None, |
|
"m2__mult": 2, |
|
"last__mult": 5, |
|
"transform_input": None, |
|
"verbose": False, |
|
} |
|
|
|
pipeline.set_params(m2=passthrough) |
|
exp = 5 |
|
assert_array_equal([[exp]], pipeline.fit_transform(X, y)) |
|
assert_array_equal([exp], pipeline.fit(X).predict(X)) |
|
assert_array_equal(X, pipeline.inverse_transform([[exp]])) |
|
|
|
|
|
other_methods = [ |
|
"predict_proba", |
|
"predict_log_proba", |
|
"decision_function", |
|
"transform", |
|
"score", |
|
] |
|
for method in other_methods: |
|
getattr(pipeline, method)(X) |
|
|
|
pipeline.set_params(m2=mult2) |
|
exp = 2 * 5 |
|
assert_array_equal([[exp]], pipeline.fit_transform(X, y)) |
|
assert_array_equal([exp], pipeline.fit(X).predict(X)) |
|
assert_array_equal(X, pipeline.inverse_transform([[exp]])) |
|
|
|
pipeline = make() |
|
pipeline.set_params(last=passthrough) |
|
|
|
exp = 6 |
|
assert_array_equal([[exp]], pipeline.fit(X, y).transform(X)) |
|
assert_array_equal([[exp]], pipeline.fit_transform(X, y)) |
|
assert_array_equal(X, pipeline.inverse_transform([[exp]])) |
|
|
|
inner_msg = "'str' object has no attribute 'predict'" |
|
outer_msg = "This 'Pipeline' has no attribute 'predict'" |
|
with pytest.raises(AttributeError, match=outer_msg) as exec_info: |
|
getattr(pipeline, "predict") |
|
assert isinstance(exec_info.value.__cause__, AttributeError) |
|
assert inner_msg in str(exec_info.value.__cause__) |
|
|
|
|
|
exp = 2 * 5 |
|
pipeline = Pipeline([("m2", mult2), ("m3", passthrough), ("last", mult5)]) |
|
assert_array_equal([[exp]], pipeline.fit_transform(X, y)) |
|
assert_array_equal([exp], pipeline.fit(X).predict(X)) |
|
assert_array_equal(X, pipeline.inverse_transform([[exp]])) |
|
|
|
|
|
def test_pipeline_ducktyping(): |
|
pipeline = make_pipeline(Mult(5)) |
|
pipeline.predict |
|
pipeline.transform |
|
pipeline.inverse_transform |
|
|
|
pipeline = make_pipeline(Transf()) |
|
assert not hasattr(pipeline, "predict") |
|
pipeline.transform |
|
pipeline.inverse_transform |
|
|
|
pipeline = make_pipeline("passthrough") |
|
assert pipeline.steps[0] == ("passthrough", "passthrough") |
|
assert not hasattr(pipeline, "predict") |
|
pipeline.transform |
|
pipeline.inverse_transform |
|
|
|
pipeline = make_pipeline(Transf(), NoInvTransf()) |
|
assert not hasattr(pipeline, "predict") |
|
pipeline.transform |
|
assert not hasattr(pipeline, "inverse_transform") |
|
|
|
pipeline = make_pipeline(NoInvTransf(), Transf()) |
|
assert not hasattr(pipeline, "predict") |
|
pipeline.transform |
|
assert not hasattr(pipeline, "inverse_transform") |
|
|
|
|
|
def test_make_pipeline(): |
|
t1 = Transf() |
|
t2 = Transf() |
|
pipe = make_pipeline(t1, t2) |
|
assert isinstance(pipe, Pipeline) |
|
assert pipe.steps[0][0] == "transf-1" |
|
assert pipe.steps[1][0] == "transf-2" |
|
|
|
pipe = make_pipeline(t1, t2, FitParamT()) |
|
assert isinstance(pipe, Pipeline) |
|
assert pipe.steps[0][0] == "transf-1" |
|
assert pipe.steps[1][0] == "transf-2" |
|
assert pipe.steps[2][0] == "fitparamt" |
|
|
|
|
|
@pytest.mark.parametrize( |
|
"pipeline, check_estimator_type", |
|
[ |
|
(make_pipeline(StandardScaler(), LogisticRegression()), is_classifier), |
|
(make_pipeline(StandardScaler(), LinearRegression()), is_regressor), |
|
( |
|
make_pipeline(StandardScaler()), |
|
lambda est: get_tags(est).estimator_type is None, |
|
), |
|
(Pipeline([]), lambda est: est._estimator_type is None), |
|
], |
|
) |
|
def test_pipeline_estimator_type(pipeline, check_estimator_type): |
|
"""Check that the estimator type returned by the pipeline is correct. |
|
|
|
Non-regression test as part of: |
|
https://github.com/scikit-learn/scikit-learn/issues/30197 |
|
""" |
|
|
|
repr(pipeline) |
|
assert check_estimator_type(pipeline) |
|
|
|
|
|
def test_sklearn_tags_with_empty_pipeline(): |
|
"""Check that we propagate properly the tags in a Pipeline. |
|
|
|
Non-regression test as part of: |
|
https://github.com/scikit-learn/scikit-learn/issues/30197 |
|
""" |
|
empty_pipeline = Pipeline(steps=[]) |
|
be = BaseEstimator() |
|
|
|
expected_tags = be.__sklearn_tags__() |
|
assert empty_pipeline.__sklearn_tags__() == expected_tags |
|
|
|
|
|
def test_feature_union_weights(): |
|
|
|
X = iris.data |
|
y = iris.target |
|
pca = PCA(n_components=2, svd_solver="randomized", random_state=0) |
|
select = SelectKBest(k=1) |
|
|
|
fs = FeatureUnion( |
|
[("pca", pca), ("select", select)], transformer_weights={"pca": 10} |
|
) |
|
fs.fit(X, y) |
|
X_transformed = fs.transform(X) |
|
|
|
fs = FeatureUnion( |
|
[("pca", pca), ("select", select)], transformer_weights={"pca": 10} |
|
) |
|
X_fit_transformed = fs.fit_transform(X, y) |
|
|
|
fs = FeatureUnion( |
|
[("mock", Transf()), ("pca", pca), ("select", select)], |
|
transformer_weights={"mock": 10}, |
|
) |
|
X_fit_transformed_wo_method = fs.fit_transform(X, y) |
|
|
|
|
|
|
|
assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X)) |
|
assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) |
|
assert_array_almost_equal(X_fit_transformed[:, :-1], 10 * pca.fit_transform(X)) |
|
assert_array_equal(X_fit_transformed[:, -1], select.fit_transform(X, y).ravel()) |
|
assert X_fit_transformed_wo_method.shape == (X.shape[0], 7) |
|
|
|
|
|
def test_feature_union_parallel(): |
|
|
|
X = JUNK_FOOD_DOCS |
|
|
|
fs = FeatureUnion( |
|
[ |
|
("words", CountVectorizer(analyzer="word")), |
|
("chars", CountVectorizer(analyzer="char")), |
|
] |
|
) |
|
|
|
fs_parallel = FeatureUnion( |
|
[ |
|
("words", CountVectorizer(analyzer="word")), |
|
("chars", CountVectorizer(analyzer="char")), |
|
], |
|
n_jobs=2, |
|
) |
|
|
|
fs_parallel2 = FeatureUnion( |
|
[ |
|
("words", CountVectorizer(analyzer="word")), |
|
("chars", CountVectorizer(analyzer="char")), |
|
], |
|
n_jobs=2, |
|
) |
|
|
|
fs.fit(X) |
|
X_transformed = fs.transform(X) |
|
assert X_transformed.shape[0] == len(X) |
|
|
|
fs_parallel.fit(X) |
|
X_transformed_parallel = fs_parallel.transform(X) |
|
assert X_transformed.shape == X_transformed_parallel.shape |
|
assert_array_equal(X_transformed.toarray(), X_transformed_parallel.toarray()) |
|
|
|
|
|
X_transformed_parallel2 = fs_parallel2.fit_transform(X) |
|
assert_array_equal(X_transformed.toarray(), X_transformed_parallel2.toarray()) |
|
|
|
|
|
X_transformed_parallel2 = fs_parallel2.transform(X) |
|
assert_array_equal(X_transformed.toarray(), X_transformed_parallel2.toarray()) |
|
|
|
|
|
def test_feature_union_feature_names(): |
|
word_vect = CountVectorizer(analyzer="word") |
|
char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3)) |
|
ft = FeatureUnion([("chars", char_vect), ("words", word_vect)]) |
|
ft.fit(JUNK_FOOD_DOCS) |
|
feature_names = ft.get_feature_names_out() |
|
for feat in feature_names: |
|
assert "chars__" in feat or "words__" in feat |
|
assert len(feature_names) == 35 |
|
|
|
ft = FeatureUnion([("tr1", Transf())]).fit([[1]]) |
|
|
|
msg = re.escape( |
|
"Transformer tr1 (type Transf) does not provide get_feature_names_out" |
|
) |
|
with pytest.raises(AttributeError, match=msg): |
|
ft.get_feature_names_out() |
|
|
|
|
|
def test_classes_property(): |
|
X = iris.data |
|
y = iris.target |
|
|
|
reg = make_pipeline(SelectKBest(k=1), LinearRegression()) |
|
reg.fit(X, y) |
|
with pytest.raises(AttributeError): |
|
getattr(reg, "classes_") |
|
|
|
clf = make_pipeline(SelectKBest(k=1), LogisticRegression(random_state=0)) |
|
with pytest.raises(AttributeError): |
|
getattr(clf, "classes_") |
|
clf.fit(X, y) |
|
assert_array_equal(clf.classes_, np.unique(y)) |
|
|
|
|
|
def test_set_feature_union_steps(): |
|
mult2 = Mult(2) |
|
mult3 = Mult(3) |
|
mult5 = Mult(5) |
|
|
|
mult3.get_feature_names_out = lambda input_features: ["x3"] |
|
mult2.get_feature_names_out = lambda input_features: ["x2"] |
|
mult5.get_feature_names_out = lambda input_features: ["x5"] |
|
|
|
ft = FeatureUnion([("m2", mult2), ("m3", mult3)]) |
|
assert_array_equal([[2, 3]], ft.transform(np.asarray([[1]]))) |
|
assert_array_equal(["m2__x2", "m3__x3"], ft.get_feature_names_out()) |
|
|
|
|
|
ft.transformer_list = [("m5", mult5)] |
|
assert_array_equal([[5]], ft.transform(np.asarray([[1]]))) |
|
assert_array_equal(["m5__x5"], ft.get_feature_names_out()) |
|
|
|
|
|
ft.set_params(transformer_list=[("mock", mult3)]) |
|
assert_array_equal([[3]], ft.transform(np.asarray([[1]]))) |
|
assert_array_equal(["mock__x3"], ft.get_feature_names_out()) |
|
|
|
|
|
ft.set_params(mock=mult5) |
|
assert_array_equal([[5]], ft.transform(np.asarray([[1]]))) |
|
assert_array_equal(["mock__x5"], ft.get_feature_names_out()) |
|
|
|
|
|
def test_set_feature_union_step_drop(): |
|
mult2 = Mult(2) |
|
mult3 = Mult(3) |
|
|
|
mult2.get_feature_names_out = lambda input_features: ["x2"] |
|
mult3.get_feature_names_out = lambda input_features: ["x3"] |
|
|
|
X = np.asarray([[1]]) |
|
|
|
ft = FeatureUnion([("m2", mult2), ("m3", mult3)]) |
|
assert_array_equal([[2, 3]], ft.fit(X).transform(X)) |
|
assert_array_equal([[2, 3]], ft.fit_transform(X)) |
|
assert_array_equal(["m2__x2", "m3__x3"], ft.get_feature_names_out()) |
|
|
|
ft.set_params(m2="drop") |
|
assert_array_equal([[3]], ft.fit(X).transform(X)) |
|
assert_array_equal([[3]], ft.fit_transform(X)) |
|
assert_array_equal(["m3__x3"], ft.get_feature_names_out()) |
|
|
|
ft.set_params(m3="drop") |
|
assert_array_equal([[]], ft.fit(X).transform(X)) |
|
assert_array_equal([[]], ft.fit_transform(X)) |
|
assert_array_equal([], ft.get_feature_names_out()) |
|
|
|
|
|
ft.set_params(m3=mult3) |
|
assert_array_equal([[3]], ft.fit(X).transform(X)) |
|
|
|
|
|
ft = FeatureUnion([("m2", "drop"), ("m3", mult3)]) |
|
assert_array_equal([[3]], ft.fit(X).transform(X)) |
|
assert_array_equal([[3]], ft.fit_transform(X)) |
|
assert_array_equal(["m3__x3"], ft.get_feature_names_out()) |
|
|
|
|
|
def test_set_feature_union_passthrough(): |
|
"""Check the behaviour of setting a transformer to `"passthrough"`.""" |
|
mult2 = Mult(2) |
|
mult3 = Mult(3) |
|
|
|
|
|
|
|
mult2.get_feature_names_out = lambda input_features: ["x2"] |
|
mult3.get_feature_names_out = lambda input_features: ["x3"] |
|
|
|
X = np.asarray([[1]]) |
|
|
|
ft = FeatureUnion([("m2", mult2), ("m3", mult3)]) |
|
assert_array_equal([[2, 3]], ft.fit(X).transform(X)) |
|
assert_array_equal([[2, 3]], ft.fit_transform(X)) |
|
assert_array_equal(["m2__x2", "m3__x3"], ft.get_feature_names_out()) |
|
|
|
ft.set_params(m2="passthrough") |
|
assert_array_equal([[1, 3]], ft.fit(X).transform(X)) |
|
assert_array_equal([[1, 3]], ft.fit_transform(X)) |
|
assert_array_equal(["m2__myfeat", "m3__x3"], ft.get_feature_names_out(["myfeat"])) |
|
|
|
ft.set_params(m3="passthrough") |
|
assert_array_equal([[1, 1]], ft.fit(X).transform(X)) |
|
assert_array_equal([[1, 1]], ft.fit_transform(X)) |
|
assert_array_equal( |
|
["m2__myfeat", "m3__myfeat"], ft.get_feature_names_out(["myfeat"]) |
|
) |
|
|
|
|
|
ft.set_params(m3=mult3) |
|
assert_array_equal([[1, 3]], ft.fit(X).transform(X)) |
|
assert_array_equal([[1, 3]], ft.fit_transform(X)) |
|
assert_array_equal(["m2__myfeat", "m3__x3"], ft.get_feature_names_out(["myfeat"])) |
|
|
|
|
|
ft = FeatureUnion([("m2", "passthrough"), ("m3", mult3)]) |
|
assert_array_equal([[1, 3]], ft.fit(X).transform(X)) |
|
assert_array_equal([[1, 3]], ft.fit_transform(X)) |
|
assert_array_equal(["m2__myfeat", "m3__x3"], ft.get_feature_names_out(["myfeat"])) |
|
|
|
X = iris.data |
|
columns = X.shape[1] |
|
pca = PCA(n_components=2, svd_solver="randomized", random_state=0) |
|
|
|
ft = FeatureUnion([("passthrough", "passthrough"), ("pca", pca)]) |
|
assert_array_equal(X, ft.fit(X).transform(X)[:, :columns]) |
|
assert_array_equal(X, ft.fit_transform(X)[:, :columns]) |
|
assert_array_equal( |
|
[ |
|
"passthrough__f0", |
|
"passthrough__f1", |
|
"passthrough__f2", |
|
"passthrough__f3", |
|
"pca__pca0", |
|
"pca__pca1", |
|
], |
|
ft.get_feature_names_out(["f0", "f1", "f2", "f3"]), |
|
) |
|
|
|
ft.set_params(pca="passthrough") |
|
X_ft = ft.fit(X).transform(X) |
|
assert_array_equal(X_ft, np.hstack([X, X])) |
|
X_ft = ft.fit_transform(X) |
|
assert_array_equal(X_ft, np.hstack([X, X])) |
|
assert_array_equal( |
|
[ |
|
"passthrough__f0", |
|
"passthrough__f1", |
|
"passthrough__f2", |
|
"passthrough__f3", |
|
"pca__f0", |
|
"pca__f1", |
|
"pca__f2", |
|
"pca__f3", |
|
], |
|
ft.get_feature_names_out(["f0", "f1", "f2", "f3"]), |
|
) |
|
|
|
ft.set_params(passthrough=pca) |
|
assert_array_equal(X, ft.fit(X).transform(X)[:, -columns:]) |
|
assert_array_equal(X, ft.fit_transform(X)[:, -columns:]) |
|
assert_array_equal( |
|
[ |
|
"passthrough__pca0", |
|
"passthrough__pca1", |
|
"pca__f0", |
|
"pca__f1", |
|
"pca__f2", |
|
"pca__f3", |
|
], |
|
ft.get_feature_names_out(["f0", "f1", "f2", "f3"]), |
|
) |
|
|
|
ft = FeatureUnion( |
|
[("passthrough", "passthrough"), ("pca", pca)], |
|
transformer_weights={"passthrough": 2}, |
|
) |
|
assert_array_equal(X * 2, ft.fit(X).transform(X)[:, :columns]) |
|
assert_array_equal(X * 2, ft.fit_transform(X)[:, :columns]) |
|
assert_array_equal( |
|
[ |
|
"passthrough__f0", |
|
"passthrough__f1", |
|
"passthrough__f2", |
|
"passthrough__f3", |
|
"pca__pca0", |
|
"pca__pca1", |
|
], |
|
ft.get_feature_names_out(["f0", "f1", "f2", "f3"]), |
|
) |
|
|
|
|
|
def test_feature_union_passthrough_get_feature_names_out_true(): |
|
"""Check feature_names_out for verbose_feature_names_out=True (default)""" |
|
X = iris.data |
|
pca = PCA(n_components=2, svd_solver="randomized", random_state=0) |
|
|
|
ft = FeatureUnion([("pca", pca), ("passthrough", "passthrough")]) |
|
ft.fit(X) |
|
assert_array_equal( |
|
[ |
|
"pca__pca0", |
|
"pca__pca1", |
|
"passthrough__x0", |
|
"passthrough__x1", |
|
"passthrough__x2", |
|
"passthrough__x3", |
|
], |
|
ft.get_feature_names_out(), |
|
) |
|
|
|
|
|
def test_feature_union_passthrough_get_feature_names_out_false(): |
|
"""Check feature_names_out for verbose_feature_names_out=False""" |
|
X = iris.data |
|
pca = PCA(n_components=2, svd_solver="randomized", random_state=0) |
|
|
|
ft = FeatureUnion( |
|
[("pca", pca), ("passthrough", "passthrough")], verbose_feature_names_out=False |
|
) |
|
ft.fit(X) |
|
assert_array_equal( |
|
[ |
|
"pca0", |
|
"pca1", |
|
"x0", |
|
"x1", |
|
"x2", |
|
"x3", |
|
], |
|
ft.get_feature_names_out(), |
|
) |
|
|
|
|
|
def test_feature_union_passthrough_get_feature_names_out_false_errors(): |
|
"""Check get_feature_names_out and non-verbose names and colliding names.""" |
|
pd = pytest.importorskip("pandas") |
|
X = pd.DataFrame([[1, 2], [2, 3]], columns=["a", "b"]) |
|
|
|
select_a = FunctionTransformer( |
|
lambda X: X[["a"]], feature_names_out=lambda self, _: np.asarray(["a"]) |
|
) |
|
union = FeatureUnion( |
|
[("t1", StandardScaler()), ("t2", select_a)], |
|
verbose_feature_names_out=False, |
|
) |
|
union.fit(X) |
|
|
|
msg = re.escape( |
|
"Output feature names: ['a'] are not unique. " |
|
"Please set verbose_feature_names_out=True to add prefixes to feature names" |
|
) |
|
|
|
with pytest.raises(ValueError, match=msg): |
|
union.get_feature_names_out() |
|
|
|
|
|
def test_feature_union_passthrough_get_feature_names_out_false_errors_overlap_over_5(): |
|
"""Check get_feature_names_out with non-verbose names and >= 5 colliding names.""" |
|
pd = pytest.importorskip("pandas") |
|
X = pd.DataFrame([list(range(10))], columns=[f"f{i}" for i in range(10)]) |
|
|
|
union = FeatureUnion( |
|
[("t1", "passthrough"), ("t2", "passthrough")], |
|
verbose_feature_names_out=False, |
|
) |
|
|
|
union.fit(X) |
|
|
|
msg = re.escape( |
|
"Output feature names: ['f0', 'f1', 'f2', 'f3', 'f4', ...] " |
|
"are not unique. Please set verbose_feature_names_out=True to add prefixes to" |
|
" feature names" |
|
) |
|
|
|
with pytest.raises(ValueError, match=msg): |
|
union.get_feature_names_out() |
|
|
|
|
|
def test_step_name_validation(): |
|
error_message_1 = r"Estimator names must not contain __: got \['a__q'\]" |
|
error_message_2 = r"Names provided are not unique: \['a', 'a'\]" |
|
error_message_3 = r"Estimator names conflict with constructor arguments: \['%s'\]" |
|
bad_steps1 = [("a__q", Mult(2)), ("b", Mult(3))] |
|
bad_steps2 = [("a", Mult(2)), ("a", Mult(3))] |
|
for cls, param in [(Pipeline, "steps"), (FeatureUnion, "transformer_list")]: |
|
|
|
bad_steps3 = [("a", Mult(2)), (param, Mult(3))] |
|
for bad_steps, message in [ |
|
(bad_steps1, error_message_1), |
|
(bad_steps2, error_message_2), |
|
(bad_steps3, error_message_3 % param), |
|
]: |
|
|
|
|
|
with pytest.raises(ValueError, match=message): |
|
cls(**{param: bad_steps}).fit([[1]], [1]) |
|
|
|
|
|
est = cls(**{param: [("a", Mult(1))]}) |
|
setattr(est, param, bad_steps) |
|
with pytest.raises(ValueError, match=message): |
|
est.fit([[1]], [1]) |
|
|
|
with pytest.raises(ValueError, match=message): |
|
est.fit_transform([[1]], [1]) |
|
|
|
|
|
est = cls(**{param: [("a", Mult(1))]}) |
|
est.set_params(**{param: bad_steps}) |
|
with pytest.raises(ValueError, match=message): |
|
est.fit([[1]], [1]) |
|
|
|
with pytest.raises(ValueError, match=message): |
|
est.fit_transform([[1]], [1]) |
|
|
|
|
|
def test_set_params_nested_pipeline(): |
|
estimator = Pipeline([("a", Pipeline([("b", DummyRegressor())]))]) |
|
estimator.set_params(a__b__alpha=0.001, a__b=Lasso()) |
|
estimator.set_params(a__steps=[("b", LogisticRegression())], a__b__C=5) |
|
|
|
|
|
def test_pipeline_memory(): |
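    """Fitted transformers are cached via `memory` and reused instead of refit."""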
|
X = iris.data |
|
y = iris.target |
|
cachedir = mkdtemp() |
|
try: |
|
memory = joblib.Memory(location=cachedir, verbose=10) |
|
|
|
clf = SVC(probability=True, random_state=0) |
|
transf = DummyTransf() |
|
pipe = Pipeline([("transf", clone(transf)), ("svc", clf)]) |
|
cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory) |
|
|
|
|
|
cached_pipe.fit(X, y) |
|
pipe.fit(X, y) |
|
|
|
ts = cached_pipe.named_steps["transf"].timestamp_ |
|
|
|
assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) |
|
assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) |
|
assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) |
|
assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) |
|
assert_array_equal( |
|
pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_ |
|
) |
|
assert not hasattr(transf, "means_") |
|
|
|
|
|
cached_pipe.fit(X, y) |
|
|
|
assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) |
|
assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) |
|
assert_array_equal(pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) |
|
assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) |
|
assert_array_equal( |
|
pipe.named_steps["transf"].means_, cached_pipe.named_steps["transf"].means_ |
|
) |
|
assert ts == cached_pipe.named_steps["transf"].timestamp_ |
|
|
|
|
|
clf_2 = SVC(probability=True, random_state=0) |
|
transf_2 = DummyTransf() |
|
cached_pipe_2 = Pipeline( |
|
[("transf_2", transf_2), ("svc", clf_2)], memory=memory |
|
) |
|
cached_pipe_2.fit(X, y) |
|
|
|
|
|
assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) |
|
assert_array_equal(pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) |
|
assert_array_equal( |
|
pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X) |
|
) |
|
assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) |
|
assert_array_equal( |
|
pipe.named_steps["transf"].means_, |
|
cached_pipe_2.named_steps["transf_2"].means_, |
|
) |
|
assert ts == cached_pipe_2.named_steps["transf_2"].timestamp_ |
|
finally: |
|
shutil.rmtree(cachedir) |
|
|
|
|
|
def test_make_pipeline_memory(): |
|
cachedir = mkdtemp() |
|
memory = joblib.Memory(location=cachedir, verbose=10) |
|
pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory) |
|
assert pipeline.memory is memory |
|
pipeline = make_pipeline(DummyTransf(), SVC()) |
|
assert pipeline.memory is None |
|
assert len(pipeline) == 2 |
|
|
|
shutil.rmtree(cachedir) |
|
|
|
|
|
class FeatureNameSaver(BaseEstimator): |
|
def fit(self, X, y=None): |
|
_check_feature_names(self, X, reset=True) |
|
return self |
|
|
|
def transform(self, X, y=None): |
|
return X |
|
|
|
def get_feature_names_out(self, input_features=None): |
|
return input_features |
|
|
|
|
|
def test_features_names_passthrough(): |
|
"""Check pipeline.get_feature_names_out with passthrough""" |
|
pipe = Pipeline( |
|
steps=[ |
|
("names", FeatureNameSaver()), |
|
("pass", "passthrough"), |
|
("clf", LogisticRegression()), |
|
] |
|
) |
|
iris = load_iris() |
|
pipe.fit(iris.data, iris.target) |
|
assert_array_equal( |
|
pipe[:-1].get_feature_names_out(iris.feature_names), iris.feature_names |
|
) |
|
|
|
|
|
def test_feature_names_count_vectorizer(): |
|
"""Check pipeline.get_feature_names_out with vectorizers""" |
|
pipe = Pipeline(steps=[("vect", CountVectorizer()), ("clf", LogisticRegression())]) |
|
y = ["pizza" in x for x in JUNK_FOOD_DOCS] |
|
pipe.fit(JUNK_FOOD_DOCS, y) |
|
assert_array_equal( |
|
pipe[:-1].get_feature_names_out(), |
|
["beer", "burger", "coke", "copyright", "pizza", "the"], |
|
) |
|
assert_array_equal( |
|
pipe[:-1].get_feature_names_out("nonsense_is_ignored"), |
|
["beer", "burger", "coke", "copyright", "pizza", "the"], |
|
) |
|
|
|
|
|
def test_pipeline_feature_names_out_error_without_definition(): |
|
"""Check that error is raised when a transformer does not define |
|
`get_feature_names_out`.""" |
|
pipe = Pipeline(steps=[("notrans", NoTrans())]) |
|
iris = load_iris() |
|
pipe.fit(iris.data, iris.target) |
|
|
|
msg = "does not provide get_feature_names_out" |
|
with pytest.raises(AttributeError, match=msg): |
|
pipe.get_feature_names_out() |
|
|
|
|
|
def test_pipeline_param_error(): |
|
clf = make_pipeline(LogisticRegression()) |
|
with pytest.raises( |
|
ValueError, match="Pipeline.fit does not accept the sample_weight parameter" |
|
): |
|
clf.fit([[0], [0]], [0, 1], sample_weight=[1, 1]) |
|
|
|
|
|
parameter_grid_test_verbose = ( |
|
(est, pattern, method) |
|
for (est, pattern), method in itertools.product( |
|
[ |
|
( |
|
Pipeline([("transf", Transf()), ("clf", FitParamT())]), |
|
r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n" |
|
r"\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$", |
|
), |
|
( |
|
Pipeline([("transf", Transf()), ("noop", None), ("clf", FitParamT())]), |
|
r"\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n" |
|
r"\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n" |
|
r"\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$", |
|
), |
|
( |
|
Pipeline( |
|
[ |
|
("transf", Transf()), |
|
("noop", "passthrough"), |
|
("clf", FitParamT()), |
|
] |
|
), |
|
r"\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n" |
|
r"\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n" |
|
r"\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$", |
|
), |
|
( |
|
Pipeline([("transf", Transf()), ("clf", None)]), |
|
r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n" |
|
r"\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$", |
|
), |
|
( |
|
Pipeline([("transf", None), ("mult", Mult())]), |
|
r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n" |
|
r"\[Pipeline\].*\(step 2 of 2\) Processing mult.* total=.*\n$", |
|
), |
|
( |
|
Pipeline([("transf", "passthrough"), ("mult", Mult())]), |
|
r"\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n" |
|
r"\[Pipeline\].*\(step 2 of 2\) Processing mult.* total=.*\n$", |
|
), |
|
( |
|
FeatureUnion([("mult1", Mult()), ("mult2", Mult())]), |
|
r"\[FeatureUnion\].*\(step 1 of 2\) Processing mult1.* total=.*\n" |
|
r"\[FeatureUnion\].*\(step 2 of 2\) Processing mult2.* total=.*\n$", |
|
), |
|
( |
|
FeatureUnion([("mult1", "drop"), ("mult2", Mult()), ("mult3", "drop")]), |
|
r"\[FeatureUnion\].*\(step 1 of 1\) Processing mult2.* total=.*\n$", |
|
), |
|
], |
|
["fit", "fit_transform", "fit_predict"], |
|
) |
|
if hasattr(est, method) |
|
and not ( |
|
method == "fit_transform" |
|
and hasattr(est, "steps") |
|
and isinstance(est.steps[-1][1], FitParamT) |
|
) |
|
) |
|
|
|
|
|
@pytest.mark.parametrize("est, pattern, method", parameter_grid_test_verbose) |
|
def test_verbose(est, method, pattern, capsys): |
|
func = getattr(est, method) |
|
|
|
X = [[1, 2, 3], [4, 5, 6]] |
|
y = [[7], [8]] |
|
|
|
est.set_params(verbose=False) |
|
func(X, y) |
|
assert not capsys.readouterr().out, "Got output for verbose=False" |
|
|
|
est.set_params(verbose=True) |
|
func(X, y) |
|
assert re.match(pattern, capsys.readouterr().out) |
|
|
|
|
|
def test_n_features_in_pipeline(): |
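    """`n_features_in_` is delegated to the first step of the pipeline."""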
|
|
|
|
|
X = [[1, 2], [3, 4], [5, 6]] |
|
y = [0, 1, 2] |
|
|
|
ss = StandardScaler() |
|
gbdt = HistGradientBoostingClassifier() |
|
pipe = make_pipeline(ss, gbdt) |
|
assert not hasattr(pipe, "n_features_in_") |
|
pipe.fit(X, y) |
|
assert pipe.n_features_in_ == ss.n_features_in_ == 2 |
|
|
|
|
|
|
|
ss = StandardScaler() |
|
gbdt = HistGradientBoostingClassifier() |
|
pipe = make_pipeline(ss, gbdt) |
|
ss.fit(X, y) |
|
assert pipe.n_features_in_ == ss.n_features_in_ == 2 |
|
assert not hasattr(gbdt, "n_features_in_") |
|
|
|
|
|
def test_n_features_in_feature_union(): |
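    """`n_features_in_` is delegated to the first transformer of the union."""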
|
|
|
|
|
X = [[1, 2], [3, 4], [5, 6]] |
|
y = [0, 1, 2] |
|
|
|
ss = StandardScaler() |
|
fu = make_union(ss) |
|
assert not hasattr(fu, "n_features_in_") |
|
fu.fit(X, y) |
|
assert fu.n_features_in_ == ss.n_features_in_ == 2 |
|
|
|
|
|
|
|
ss = StandardScaler() |
|
fu = make_union(ss) |
|
ss.fit(X, y) |
|
assert fu.n_features_in_ == ss.n_features_in_ == 2 |
|
|
|
|
|
def test_feature_union_fit_params(): |
|
|
|
class DummyTransformer(TransformerMixin, BaseEstimator): |
|
def fit(self, X, y=None, **fit_params): |
|
if fit_params != {"a": 0}: |
|
raise ValueError |
|
return self |
|
|
|
def transform(self, X, y=None): |
|
return X |
|
|
|
X, y = iris.data, iris.target |
|
t = FeatureUnion([("dummy0", DummyTransformer()), ("dummy1", DummyTransformer())]) |
|
with pytest.raises(ValueError): |
|
t.fit(X, y) |
|
|
|
with pytest.raises(ValueError): |
|
t.fit_transform(X, y) |
|
|
|
t.fit(X, y, a=0) |
|
t.fit_transform(X, y, a=0) |
|
|
|
|
|
def test_feature_union_fit_params_without_fit_transform(): |
|
|
|
|
|
|
|
class DummyTransformer(ConsumingNoFitTransformTransformer): |
|
def fit(self, X, y=None, **fit_params): |
|
if fit_params != {"metadata": 1}: |
|
raise ValueError |
|
return self |
|
|
|
X, y = iris.data, iris.target |
|
t = FeatureUnion( |
|
[ |
|
("nofittransform0", DummyTransformer()), |
|
("nofittransform1", DummyTransformer()), |
|
] |
|
) |
|
|
|
with pytest.raises(ValueError): |
|
t.fit_transform(X, y, metadata=0) |
|
|
|
t.fit_transform(X, y, metadata=1) |
|
|
|
|
|
def test_pipeline_missing_values_leniency(): |
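    """The pipeline does not reject NaN itself; steps such as SimpleImputer
    handle it."""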
|
|
|
|
|
X, y = iris.data.copy(), iris.target.copy() |
|
mask = np.random.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool) |
|
X[mask] = np.nan |
|
pipe = make_pipeline(SimpleImputer(), LogisticRegression()) |
|
assert pipe.fit(X, y).score(X, y) > 0.4 |
|
|
|
|
|
def test_feature_union_warns_unknown_transformer_weight(): |
|
|
|
|
|
X = [[1, 2], [3, 4], [5, 6]] |
|
y = [0, 1, 2] |
|
|
|
transformer_list = [("transf", Transf())] |
|
|
|
weights = {"transformer": 1} |
|
expected_msg = ( |
|
'Attempting to weight transformer "transformer", ' |
|
"but it is not present in transformer_list." |
|
) |
|
union = FeatureUnion(transformer_list, transformer_weights=weights) |
|
with pytest.raises(ValueError, match=expected_msg): |
|
union.fit(X, y) |
|
|
|
|
|
@pytest.mark.parametrize("passthrough", [None, "passthrough"]) |
|
def test_pipeline_get_tags_none(passthrough): |
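    """Tags are computed correctly when a step is None or 'passthrough'."""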
|
|
|
|
|
|
|
|
|
pipe = make_pipeline(passthrough, SVC()) |
|
assert not pipe.__sklearn_tags__().input_tags.pairwise |
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("Predictor", [MinimalRegressor, MinimalClassifier]) |
|
def test_search_cv_using_minimal_compatible_estimator(Predictor): |
|
|
|
|
|
rng = np.random.RandomState(0) |
|
X, y = rng.randn(25, 2), np.array([0] * 5 + [1] * 20) |
|
|
|
model = Pipeline( |
|
[("transformer", MinimalTransformer()), ("predictor", Predictor())] |
|
) |
|
model.fit(X, y) |
|
|
|
y_pred = model.predict(X) |
|
if is_classifier(model): |
|
assert_array_equal(y_pred, 1) |
|
assert model.score(X, y) == pytest.approx(accuracy_score(y, y_pred)) |
|
else: |
|
assert_allclose(y_pred, y.mean()) |
|
assert model.score(X, y) == pytest.approx(r2_score(y, y_pred)) |
|
|
|
|
|
def test_pipeline_check_if_fitted(): |
|
class Estimator(BaseEstimator): |
|
def fit(self, X, y): |
|
self.fitted_ = True |
|
return self |
|
|
|
pipeline = Pipeline([("clf", Estimator())]) |
|
with pytest.raises(NotFittedError): |
|
check_is_fitted(pipeline) |
|
pipeline.fit(iris.data, iris.target) |
|
check_is_fitted(pipeline) |
|
|
|
|
|
def test_feature_union_check_if_fitted(): |
|
"""Check __sklearn_is_fitted__ is defined correctly.""" |
|
|
|
X = [[1, 2], [3, 4], [5, 6]] |
|
y = [0, 1, 2] |
|
|
|
union = FeatureUnion([("clf", MinimalTransformer())]) |
|
with pytest.raises(NotFittedError): |
|
check_is_fitted(union) |
|
|
|
union.fit(X, y) |
|
check_is_fitted(union) |
|
|
|
|
|
union = FeatureUnion([("pass", "passthrough")]) |
|
check_is_fitted(union) |
|
|
|
union = FeatureUnion([("clf", MinimalTransformer()), ("pass", "passthrough")]) |
|
with pytest.raises(NotFittedError): |
|
check_is_fitted(union) |
|
|
|
union.fit(X, y) |
|
check_is_fitted(union) |
|
|
|
|
|
def test_pipeline_get_feature_names_out_passes_names_through(): |
|
"""Check that pipeline passes names through. |
|
|
|
    Non-regression test for #21349.
|
""" |
|
X, y = iris.data, iris.target |
|
|
|
    class AddPrefixStandardScaler(StandardScaler):
|
def get_feature_names_out(self, input_features=None): |
|
names = super().get_feature_names_out(input_features=input_features) |
|
return np.asarray([f"my_prefix_{name}" for name in names], dtype=object) |
|
|
|
    pipe = make_pipeline(AddPrefixStandardScaler(), StandardScaler())
|
pipe.fit(X, y) |
|
|
|
input_names = iris.feature_names |
|
feature_names_out = pipe.get_feature_names_out(input_names) |
|
|
|
assert_array_equal(feature_names_out, [f"my_prefix_{name}" for name in input_names]) |
|
|
|
|
|
def test_pipeline_set_output_integration(): |
|
"""Test pipeline's set_output with feature names.""" |
|
pytest.importorskip("pandas") |
|
|
|
X, y = load_iris(as_frame=True, return_X_y=True) |
|
|
|
pipe = make_pipeline(StandardScaler(), LogisticRegression()) |
|
pipe.set_output(transform="pandas") |
|
pipe.fit(X, y) |
|
|
|
feature_names_in_ = pipe[:-1].get_feature_names_out() |
|
log_reg_feature_names = pipe[-1].feature_names_in_ |
|
|
|
assert_array_equal(feature_names_in_, log_reg_feature_names) |
|
|
|
|
|
def test_feature_union_set_output(): |
|
"""Test feature union with set_output API.""" |
|
pd = pytest.importorskip("pandas") |
|
|
|
X, _ = load_iris(as_frame=True, return_X_y=True) |
|
X_train, X_test = train_test_split(X, random_state=0) |
|
union = FeatureUnion([("scalar", StandardScaler()), ("pca", PCA())]) |
|
union.set_output(transform="pandas") |
|
union.fit(X_train) |
|
|
|
X_trans = union.transform(X_test) |
|
assert isinstance(X_trans, pd.DataFrame) |
|
assert_array_equal(X_trans.columns, union.get_feature_names_out()) |
|
assert_array_equal(X_trans.index, X_test.index) |
|
|
|
|
|
def test_feature_union_getitem(): |
|
"""Check FeatureUnion.__getitem__ returns expected results.""" |
|
    scaler = StandardScaler()
|
pca = PCA() |
|
union = FeatureUnion( |
|
[ |
|
("scalar", scalar), |
|
("pca", pca), |
|
("pass", "passthrough"), |
|
("drop_me", "drop"), |
|
] |
|
) |
|
assert union["scalar"] is scalar |
|
assert union["pca"] is pca |
|
assert union["pass"] == "passthrough" |
|
assert union["drop_me"] == "drop" |
|
|
|
|
|
@pytest.mark.parametrize("key", [0, slice(0, 2)]) |
|
def test_feature_union_getitem_error(key): |
|
"""Raise error when __getitem__ gets a non-string input.""" |
|
|
|
union = FeatureUnion([("scalar", StandardScaler()), ("pca", PCA())]) |
|
|
|
msg = "Only string keys are supported" |
|
with pytest.raises(KeyError, match=msg): |
|
union[key] |
|
|
|
|
|
def test_feature_union_feature_names_in_(): |
|
"""Ensure feature union has `.feature_names_in_` attribute if `X` has a |
|
`columns` attribute. |
|
|
|
Test for #24754. |
|
""" |
|
pytest.importorskip("pandas") |
|
|
|
X, _ = load_iris(as_frame=True, return_X_y=True) |
|
|
|
|
|
|
|
scaler = StandardScaler() |
|
scaler.fit(X) |
|
union = FeatureUnion([("scale", scaler)]) |
|
assert hasattr(union, "feature_names_in_") |
|
assert_array_equal(X.columns, union.feature_names_in_) |
|
assert_array_equal(scaler.feature_names_in_, union.feature_names_in_) |
|
|
|
|
|
union = FeatureUnion([("pass", "passthrough")]) |
|
union.fit(X) |
|
assert hasattr(union, "feature_names_in_") |
|
assert_array_equal(X.columns, union.feature_names_in_) |
|
|
|
|
|
X_array = X.to_numpy() |
|
union = FeatureUnion([("pass", "passthrough")]) |
|
union.fit(X_array) |
|
assert not hasattr(union, "feature_names_in_") |
|
|
|
|
|
|
|
def test_pipeline_inverse_transform_Xt_deprecation(): |
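    """`inverse_transform` accepts `X`; the deprecated `Xt` alias warns."""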
|
X = np.random.RandomState(0).normal(size=(10, 5)) |
|
pipe = Pipeline([("pca", PCA(n_components=2))]) |
|
X = pipe.fit_transform(X) |
|
|
|
with pytest.raises(TypeError, match="Missing required positional argument"): |
|
pipe.inverse_transform() |
|
|
|
with pytest.raises(TypeError, match="Cannot use both X and Xt. Use X only"): |
|
pipe.inverse_transform(X=X, Xt=X) |
|
|
|
with warnings.catch_warnings(record=True): |
|
warnings.simplefilter("error") |
|
pipe.inverse_transform(X) |
|
|
|
with pytest.warns(FutureWarning, match="Xt was renamed X in version 1.5"): |
|
pipe.inverse_transform(Xt=X) |
|
|
|
|
|
|
|
|
|
|
|
|
|
@config_context(enable_metadata_routing=True) |
|
@pytest.mark.parametrize("method", ["fit", "fit_transform"]) |
|
def test_transform_input_pipeline(method): |
|
"""Test that with transform_input, data is correctly transformed for each step.""" |
|
|
|
def get_transformer(registry, sample_weight, metadata): |
|
"""Get a transformer with requests set.""" |
|
return ( |
|
ConsumingTransformer(registry=registry) |
|
.set_fit_request(sample_weight=sample_weight, metadata=metadata) |
|
.set_transform_request(sample_weight=sample_weight, metadata=metadata) |
|
) |
|
|
|
def get_pipeline(): |
|
"""Get a pipeline and corresponding registries. |
|
|
|
The pipeline has 4 steps, with different request values set to test different |
|
cases. One is aliased. |
|
""" |
|
registry_1, registry_2, registry_3, registry_4 = ( |
|
_Registry(), |
|
_Registry(), |
|
_Registry(), |
|
_Registry(), |
|
) |
|
pipe = make_pipeline( |
|
get_transformer(registry_1, sample_weight=True, metadata=True), |
|
get_transformer(registry_2, sample_weight=False, metadata=False), |
|
get_transformer(registry_3, sample_weight=True, metadata=True), |
|
get_transformer(registry_4, sample_weight="other_weights", metadata=True), |
|
transform_input=["sample_weight"], |
|
) |
|
return pipe, registry_1, registry_2, registry_3, registry_4 |
|
|
|
def check_metadata(registry, methods, **metadata): |
|
"""Check that the right metadata was recorded for the given methods.""" |
|
assert registry |
|
for estimator in registry: |
|
for method in methods: |
|
check_recorded_metadata( |
|
estimator, |
|
method=method, |
|
parent=method, |
|
**metadata, |
|
) |
|
|
|
X = np.array([[1, 2], [3, 4]]) |
|
y = np.array([0, 1]) |
|
sample_weight = np.array([[1, 2]]) |
|
other_weights = np.array([[30, 40]]) |
|
metadata = np.array([[100, 200]]) |
|
|
|
pipe, registry_1, registry_2, registry_3, registry_4 = get_pipeline() |
|
pipe.fit( |
|
X, |
|
y, |
|
sample_weight=sample_weight, |
|
other_weights=other_weights, |
|
metadata=metadata, |
|
) |
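    # Each `ConsumingTransformer` adds 1 to its input in `transform`. With
    # `transform_input=["sample_weight"]`, the weights reaching step i have
    # been transformed by the i - 1 preceding steps: hence +2 by step 3 and
    # +3 for the aliased `other_weights` consumed by step 4.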
|
|
|
check_metadata( |
|
registry_1, ["fit", "transform"], sample_weight=sample_weight, metadata=metadata |
|
) |
|
check_metadata(registry_2, ["fit", "transform"]) |
|
check_metadata( |
|
registry_3, |
|
["fit", "transform"], |
|
sample_weight=sample_weight + 2, |
|
metadata=metadata, |
|
) |
|
check_metadata( |
|
registry_4, |
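        # ["fit", "transform"] when method == "fit_transform", ["fit"] otherwise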
|
method.split("_"), |
|
sample_weight=other_weights + 3, |
|
metadata=metadata, |
|
) |
|
|
|
|
|
@config_context(enable_metadata_routing=True) |
|
def test_transform_input_explicit_value_check(): |
|
"""Test that the right transformed values are passed to `fit`.""" |
|
|
|
class Transformer(TransformerMixin, BaseEstimator): |
|
def fit(self, X, y): |
|
self.fitted_ = True |
|
return self |
|
|
|
def transform(self, X): |
|
return X + 1 |
|
|
|
class Estimator(ClassifierMixin, BaseEstimator): |
|
def fit(self, X, y, X_val=None, y_val=None): |
|
assert_array_equal(X, np.array([[1, 2]])) |
|
assert_array_equal(y, np.array([0, 1])) |
|
assert_array_equal(X_val, np.array([[2, 3]])) |
|
assert_array_equal(y_val, np.array([0, 1])) |
|
return self |
|
|
|
X = np.array([[0, 1]]) |
|
y = np.array([0, 1]) |
|
X_val = np.array([[1, 2]]) |
|
y_val = np.array([0, 1]) |
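    # The transformer adds 1, so the estimator must see X + 1 and, because
    # `transform_input=["X_val"]`, a transformed X_val; y and y_val pass
    # through untouched.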
|
pipe = Pipeline( |
|
[ |
|
("transformer", Transformer()), |
|
("estimator", Estimator().set_fit_request(X_val=True, y_val=True)), |
|
], |
|
transform_input=["X_val"], |
|
) |
|
pipe.fit(X, y, X_val=X_val, y_val=y_val) |
|
|
|
|
|
def test_transform_input_no_slep6(): |
|
"""Make sure the right error is raised if slep6 is not enabled.""" |
|
X = np.array([[1, 2], [3, 4]]) |
|
y = np.array([0, 1]) |
|
msg = "The `transform_input` parameter can only be set if metadata" |
|
with pytest.raises(ValueError, match=msg): |
|
make_pipeline(DummyTransf(), transform_input=["blah"]).fit(X, y) |
|
|
|
|
|
@config_context(enable_metadata_routing=True) |
|
def test_transform_tuple_input(): |
|
"""Test that if metadata is a tuple of arrays, both arrays are transformed.""" |
|
|
|
class Estimator(ClassifierMixin, BaseEstimator): |
|
def fit(self, X, y, X_val=None, y_val=None): |
|
assert isinstance(X_val, tuple) |
|
assert isinstance(y_val, tuple) |
|
|
|
assert_array_equal(X_val[0], np.array([[2, 3]])) |
|
assert_array_equal(y_val[0], np.array([0, 1])) |
|
assert_array_equal(X_val[1], np.array([[11, 12]])) |
|
assert_array_equal(y_val[1], np.array([1, 2])) |
|
self.fitted_ = True |
|
return self |
|
|
|
class Transformer(TransformerMixin, BaseEstimator): |
|
def fit(self, X, y): |
|
self.fitted_ = True |
|
return self |
|
|
|
def transform(self, X): |
|
return X + 1 |
|
|
|
X = np.array([[1, 2]]) |
|
y = np.array([0, 1]) |
|
X_val0 = np.array([[1, 2]]) |
|
y_val0 = np.array([0, 1]) |
|
X_val1 = np.array([[10, 11]]) |
|
y_val1 = np.array([1, 2]) |
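    # Both arrays in the X_val tuple should be transformed (+1); y_val is not
    # listed in `transform_input` and is passed through as-is.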
|
pipe = Pipeline( |
|
[ |
|
("transformer", Transformer()), |
|
("estimator", Estimator().set_fit_request(X_val=True, y_val=True)), |
|
], |
|
transform_input=["X_val"], |
|
) |
|
pipe.fit(X, y, X_val=(X_val0, X_val1), y_val=(y_val0, y_val1)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize( |
|
"method", |
|
[ |
|
"predict", |
|
"predict_proba", |
|
"predict_log_proba", |
|
"decision_function", |
|
"score", |
|
"score_samples", |
|
"transform", |
|
"inverse_transform", |
|
], |
|
) |
|
def test_pipeline_warns_not_fitted(method): |
|
class StatelessEstimator(BaseEstimator): |
|
"""Stateless estimator that doesn't check if it's fitted. |
|
|
|
        Stateless estimators that don't require fit should properly set the
        `requires_fit` tag and implement a `__sklearn_is_fitted__` method
        returning `True`.
|
""" |
|
|
|
def fit(self, X, y): |
|
return self |
|
|
|
def transform(self, X): |
|
return X |
|
|
|
def predict(self, X): |
|
return np.ones(len(X)) |
|
|
|
def predict_proba(self, X): |
|
return np.ones(len(X)) |
|
|
|
def predict_log_proba(self, X): |
|
return np.zeros(len(X)) |
|
|
|
def decision_function(self, X): |
|
return np.ones(len(X)) |
|
|
|
def score(self, X, y): |
|
return 1 |
|
|
|
def score_samples(self, X): |
|
return np.ones(len(X)) |
|
|
|
def inverse_transform(self, X): |
|
return X |
|
|
|
pipe = Pipeline([("estimator", StatelessEstimator())]) |
|
with pytest.warns(FutureWarning, match="This Pipeline instance is not fitted yet."): |
|
getattr(pipe, method)([[1]]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
class SimpleEstimator(BaseEstimator): |
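    # Used by the routing tests below; every method asserts that it actually
    # received the `sample_weight` and `prop` metadata.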
|
|
|
|
|
def __sklearn_is_fitted__(self): |
|
return True |
|
|
|
def fit(self, X, y, sample_weight=None, prop=None): |
|
assert sample_weight is not None, sample_weight |
|
assert prop is not None, prop |
|
return self |
|
|
|
def fit_transform(self, X, y, sample_weight=None, prop=None): |
|
assert sample_weight is not None |
|
assert prop is not None |
|
return X + 1 |
|
|
|
def fit_predict(self, X, y, sample_weight=None, prop=None): |
|
assert sample_weight is not None |
|
assert prop is not None |
|
return np.ones(len(X)) |
|
|
|
def predict(self, X, sample_weight=None, prop=None): |
|
assert sample_weight is not None |
|
assert prop is not None |
|
return np.ones(len(X)) |
|
|
|
def predict_proba(self, X, sample_weight=None, prop=None): |
|
assert sample_weight is not None |
|
assert prop is not None |
|
return np.ones(len(X)) |
|
|
|
def predict_log_proba(self, X, sample_weight=None, prop=None): |
|
assert sample_weight is not None |
|
assert prop is not None |
|
return np.zeros(len(X)) |
|
|
|
def decision_function(self, X, sample_weight=None, prop=None): |
|
assert sample_weight is not None |
|
assert prop is not None |
|
return np.ones(len(X)) |
|
|
|
def score(self, X, y, sample_weight=None, prop=None): |
|
assert sample_weight is not None |
|
assert prop is not None |
|
return 1 |
|
|
|
def transform(self, X, sample_weight=None, prop=None): |
|
assert sample_weight is not None |
|
assert prop is not None |
|
return X + 1 |
|
|
|
def inverse_transform(self, X, sample_weight=None, prop=None): |
|
assert sample_weight is not None |
|
assert prop is not None |
|
return X - 1 |
|
|
|
|
|
|
|
@pytest.mark.parametrize("method", sorted(set(METHODS) - {"split", "partial_fit"})) |
|
@config_context(enable_metadata_routing=True) |
|
def test_metadata_routing_for_pipeline(method): |
|
"""Test that metadata is routed correctly for pipelines.""" |
|
|
|
def set_request(est, method, **kwarg): |
|
"""Set requests for a given method. |
|
|
|
If the given method is a composite method, set the same requests for |
|
all the methods that compose it. |
|
""" |
|
if method in COMPOSITE_METHODS: |
|
methods = COMPOSITE_METHODS[method] |
|
else: |
|
methods = [method] |
|
|
|
for method in methods: |
|
getattr(est, f"set_{method}_request")(**kwarg) |
|
return est |
|
|
|
X, y = np.array([[1]]), np.array([1]) |
|
sample_weight, prop, metadata = [1], "a", "b" |
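    # Set requests both for the method under test and for `fit`, since `fit`
    # always runs when the pipeline is fitted below.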
|
|
|
|
|
est = SimpleEstimator() |
|
est = set_request(est, method, sample_weight=True, prop=True) |
|
est = set_request(est, "fit", sample_weight=True, prop=True) |
|
trs = ( |
|
ConsumingTransformer() |
|
.set_fit_request(sample_weight=True, metadata=True) |
|
.set_transform_request(sample_weight=True, metadata=True) |
|
.set_inverse_transform_request(sample_weight=True, metadata=True) |
|
) |
|
pipeline = Pipeline([("trs", trs), ("estimator", est)]) |
|
|
|
if "fit" not in method: |
|
pipeline = pipeline.fit(X, y, sample_weight=sample_weight, prop=prop) |
|
|
|
try: |
|
getattr(pipeline, method)( |
|
X, y, sample_weight=sample_weight, prop=prop, metadata=metadata |
|
) |
|
except TypeError: |
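        # Some methods (e.g. predict) don't accept y; retry with X only.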
|
|
|
getattr(pipeline, method)( |
|
X, sample_weight=sample_weight, prop=prop, metadata=metadata |
|
) |
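    # Whatever the tested method, the transformer itself is only ever called
    # through `fit` and `transform`.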
|
|
|
|
|
|
|
check_recorded_metadata( |
|
obj=trs, |
|
method="fit", |
|
parent="fit", |
|
sample_weight=sample_weight, |
|
metadata=metadata, |
|
) |
|
check_recorded_metadata( |
|
obj=trs, |
|
method="transform", |
|
parent="transform", |
|
sample_weight=sample_weight, |
|
metadata=metadata, |
|
) |
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("method", sorted(set(METHODS) - {"split", "partial_fit"})) |
|
@config_context(enable_metadata_routing=True) |
|
def test_metadata_routing_error_for_pipeline(method): |
|
"""Test that metadata is not routed for pipelines when not requested.""" |
|
X, y = [[1]], [1] |
|
sample_weight, prop = [1], "a" |
|
est = SimpleEstimator() |
|
|
|
pipeline = Pipeline([("estimator", est)]) |
|
error_message = ( |
|
"[sample_weight, prop] are passed but are not explicitly set as requested" |
|
f" or not requested for SimpleEstimator.{method}" |
|
) |
|
with pytest.raises(ValueError, match=re.escape(error_message)): |
|
try: |
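            # metadata is passed but was never requested: this must raise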
|
|
|
getattr(pipeline, method)(X, y, sample_weight=sample_weight, prop=prop) |
|
except TypeError: |
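            # some methods don't accept y; pass only X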
|
|
|
|
|
getattr(pipeline, method)(X, sample_weight=sample_weight, prop=prop) |
|
|
|
|
|
@pytest.mark.parametrize( |
|
"method", ["decision_function", "transform", "inverse_transform"] |
|
) |
|
def test_routing_passed_metadata_not_supported(method): |
|
"""Test that the right error message is raised when metadata is passed while |
|
not supported when `enable_metadata_routing=False`.""" |
|
|
|
pipe = Pipeline([("estimator", SimpleEstimator())]) |
|
|
|
with pytest.raises( |
|
ValueError, match="is only supported if enable_metadata_routing=True" |
|
): |
|
getattr(pipe, method)([[1]], sample_weight=[1], prop="a") |
|
|
|
|
|
@config_context(enable_metadata_routing=True) |
|
def test_pipeline_with_estimator_with_len(): |
|
"""Test that pipeline works with estimators that have a `__len__` method.""" |
|
pipe = Pipeline( |
|
[("trs", RandomTreesEmbedding()), ("estimator", RandomForestClassifier())] |
|
) |
|
pipe.fit([[1]], [1]) |
|
pipe.predict([[1]]) |
|
|
|
|
|
@pytest.mark.parametrize("last_step", [None, "passthrough"]) |
|
@config_context(enable_metadata_routing=True) |
|
def test_pipeline_with_no_last_step(last_step): |
|
"""Test that the pipeline works when there is not last step. |
|
|
|
It should just ignore and pass through the data on transform. |
|
""" |
|
pipe = Pipeline([("trs", FunctionTransformer()), ("estimator", last_step)]) |
|
assert pipe.fit([[1]], [1]).transform([[1], [2], [3]]) == [[1], [2], [3]] |
|
|
|
|
|
@config_context(enable_metadata_routing=True) |
|
def test_feature_union_metadata_routing_error(): |
|
"""Test that the right error is raised when metadata is not requested.""" |
|
X = np.array([[0, 1], [2, 2], [4, 6]]) |
|
y = [1, 2, 3] |
|
sample_weight, metadata = [1, 1, 1], "a" |
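    # Case 1: the fit request is not set on the sub-transformer.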
|
|
|
|
|
feature_union = FeatureUnion([("sub_transformer", ConsumingTransformer())]) |
|
|
|
error_message = ( |
|
"[sample_weight, metadata] are passed but are not explicitly set as requested" |
|
f" or not requested for {ConsumingTransformer.__name__}.fit" |
|
) |
|
|
|
with pytest.raises(UnsetMetadataPassedError, match=re.escape(error_message)): |
|
feature_union.fit(X, y, sample_weight=sample_weight, metadata=metadata) |
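    # Case 2: the fit request is set, but the transform request is not.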
|
|
|
|
|
feature_union = FeatureUnion( |
|
[ |
|
( |
|
"sub_transformer", |
|
ConsumingTransformer().set_fit_request( |
|
sample_weight=True, metadata=True |
|
), |
|
) |
|
] |
|
) |
|
|
|
error_message = ( |
|
"[sample_weight, metadata] are passed but are not explicitly set as requested " |
|
f"or not requested for {ConsumingTransformer.__name__}.transform" |
|
) |
|
|
|
with pytest.raises(UnsetMetadataPassedError, match=re.escape(error_message)): |
|
feature_union.fit( |
|
X, y, sample_weight=sample_weight, metadata=metadata |
|
).transform(X, sample_weight=sample_weight, metadata=metadata) |
|
|
|
|
|
@config_context(enable_metadata_routing=True) |
|
def test_feature_union_get_metadata_routing_without_fit(): |
|
"""Test that get_metadata_routing() works regardless of the Child's |
|
consumption of any metadata.""" |
|
feature_union = FeatureUnion([("sub_transformer", ConsumingTransformer())]) |
|
feature_union.get_metadata_routing() |
|
|
|
|
|
@config_context(enable_metadata_routing=True) |
|
@pytest.mark.parametrize( |
|
"transformer", [ConsumingTransformer, ConsumingNoFitTransformTransformer] |
|
) |
|
def test_feature_union_metadata_routing(transformer): |
|
"""Test that metadata is routed correctly for FeatureUnion.""" |
|
X = np.array([[0, 1], [2, 2], [4, 6]]) |
|
y = [1, 2, 3] |
|
sample_weight, metadata = [1, 1, 1], "a" |
|
|
|
feature_union = FeatureUnion( |
|
[ |
|
( |
|
"sub_trans1", |
|
transformer(registry=_Registry()) |
|
.set_fit_request(sample_weight=True, metadata=True) |
|
.set_transform_request(sample_weight=True, metadata=True), |
|
), |
|
( |
|
"sub_trans2", |
|
transformer(registry=_Registry()) |
|
.set_fit_request(sample_weight=True, metadata=True) |
|
.set_transform_request(sample_weight=True, metadata=True), |
|
), |
|
] |
|
) |
|
|
|
kwargs = {"sample_weight": sample_weight, "metadata": metadata} |
|
feature_union.fit(X, y, **kwargs) |
|
feature_union.fit_transform(X, y, **kwargs) |
|
feature_union.fit(X, y, **kwargs).transform(X, **kwargs) |
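    # Every sub-transformer should have recorded the routed metadata; here we
    # check the `fit` records.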
|
|
|
    for _, sub_transformer in feature_union.transformer_list:
        registry = sub_transformer.registry
|
assert len(registry) |
|
for sub_trans in registry: |
|
check_recorded_metadata( |
|
obj=sub_trans, |
|
method="fit", |
|
parent="fit", |
|
**kwargs, |
|
) |
|
|
|
|
|
|
|
|
|
|