|
"""Random projection transformers. |
|
|
|
Random projections are a simple and computationally efficient way to |
|
reduce the dimensionality of the data by trading a controlled amount |
|
of accuracy (as additional variance) for faster processing times and |
|
smaller model sizes. |
|
|
|
The dimensions and distribution of random projection matrices are
|
controlled so as to preserve the pairwise distances between any two |
|
samples of the dataset. |
|
|
|
The main theoretical result behind the efficiency of random projection is the |
|
`Johnson-Lindenstrauss lemma (quoting Wikipedia) |
|
<https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma>`_: |
|
|
|
In mathematics, the Johnson-Lindenstrauss lemma is a result |
|
concerning low-distortion embeddings of points from high-dimensional |
|
into low-dimensional Euclidean space. The lemma states that a small set |
|
of points in a high-dimensional space can be embedded into a space of |
|
much lower dimension in such a way that distances between the points are |
|
nearly preserved. The map used for the embedding is at least Lipschitz, |
|
and can even be taken to be an orthogonal projection. |
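
As a minimal usage sketch (the default ``eps=0.1`` lets the transformer
pick the number of components from the lemma):

>>> import numpy as np
>>> from sklearn.random_projection import SparseRandomProjection
>>> X = np.random.RandomState(0).rand(100, 10000)
>>> X_new = SparseRandomProjection(random_state=0).fit_transform(X)
>>> X_new.shape
(100, 3947)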
|
""" |
|
|
|
|
|
|
|
|
|
import warnings |
|
from abc import ABCMeta, abstractmethod |
|
from numbers import Integral, Real |
|
|
|
import numpy as np |
|
import scipy.sparse as sp |
|
from scipy import linalg |
|
|
|
from .base import ( |
|
BaseEstimator, |
|
ClassNamePrefixFeaturesOutMixin, |
|
TransformerMixin, |
|
_fit_context, |
|
) |
|
from .exceptions import DataDimensionalityWarning |
|
from .utils import check_random_state |
|
from .utils._param_validation import Interval, StrOptions, validate_params |
|
from .utils.extmath import safe_sparse_dot |
|
from .utils.random import sample_without_replacement |
|
from .utils.validation import check_array, check_is_fitted, validate_data |
|
|
|
__all__ = [ |
|
"SparseRandomProjection", |
|
"GaussianRandomProjection", |
|
"johnson_lindenstrauss_min_dim", |
|
] |
|
|
|
|
|
@validate_params( |
|
{ |
|
"n_samples": ["array-like", Interval(Real, 1, None, closed="left")], |
|
"eps": ["array-like", Interval(Real, 0, 1, closed="neither")], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def johnson_lindenstrauss_min_dim(n_samples, *, eps=0.1): |
|
"""Find a 'safe' number of components to randomly project to. |
|
|
|
The distortion introduced by a random projection `p` only changes the |
|
    distance between two points by a factor (1 +- eps) in a Euclidean space
|
with good probability. The projection `p` is an eps-embedding as defined |
|
by: |
|
|
|
.. code-block:: text |
|
|
|
(1 - eps) ||u - v||^2 < ||p(u) - p(v)||^2 < (1 + eps) ||u - v||^2 |
|
|
|
Where u and v are any rows taken from a dataset of shape (n_samples, |
|
n_features), eps is in ]0, 1[ and p is a projection by a random Gaussian |
|
N(0, 1) matrix of shape (n_components, n_features) (or a sparse |
|
Achlioptas matrix). |
|
|
|
The minimum number of components to guarantee the eps-embedding is |
|
given by: |
|
|
|
.. code-block:: text |
|
|
|
n_components >= 4 log(n_samples) / (eps^2 / 2 - eps^3 / 3) |
|
|
|
Note that the number of dimensions is independent of the original |
|
number of features but instead depends on the size of the dataset: |
|
    the larger the dataset, the higher the minimal dimensionality of
|
an eps-embedding. |
|
|
|
Read more in the :ref:`User Guide <johnson_lindenstrauss>`. |
|
|
|
Parameters |
|
---------- |
|
n_samples : int or array-like of int |
|
        Number of samples. It should be an integer greater than 0. If an array
|
is given, it will compute a safe number of components array-wise. |
|
|
|
eps : float or array-like of shape (n_components,), dtype=float, \ |
|
default=0.1 |
|
Maximum distortion rate in the range (0, 1) as defined by the |
|
Johnson-Lindenstrauss lemma. If an array is given, it will compute a |
|
safe number of components array-wise. |
|
|
|
Returns |
|
------- |
|
n_components : int or ndarray of int |
|
        The minimal number of components that guarantees, with good
        probability, an eps-embedding of n_samples points.
|
|
|
References |
|
---------- |
|
|
|
.. [1] https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma |
|
|
|
.. [2] `Sanjoy Dasgupta and Anupam Gupta, 1999, |
|
"An elementary proof of the Johnson-Lindenstrauss Lemma." |
|
<https://citeseerx.ist.psu.edu/doc_view/pid/95cd464d27c25c9c8690b378b894d337cdf021f9>`_ |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.random_projection import johnson_lindenstrauss_min_dim |
|
>>> johnson_lindenstrauss_min_dim(1e6, eps=0.5) |
|
np.int64(663) |
|
|
|
>>> johnson_lindenstrauss_min_dim(1e6, eps=[0.5, 0.1, 0.01]) |
|
array([ 663, 11841, 1112658]) |
|
|
|
>>> johnson_lindenstrauss_min_dim([1e4, 1e5, 1e6], eps=0.1) |
|
array([ 7894, 9868, 11841]) |
|
""" |
|
eps = np.asarray(eps) |
|
n_samples = np.asarray(n_samples) |
|
|
|
if np.any(eps <= 0.0) or np.any(eps >= 1): |
|
raise ValueError("The JL bound is defined for eps in ]0, 1[, got %r" % eps) |
|
|
|
if np.any(n_samples <= 0): |
|
raise ValueError( |
|
"The JL bound is defined for n_samples greater than zero, got %r" |
|
% n_samples |
|
) |
|
|
|
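    # Minimum dimension from the lemma (see Dasgupta & Gupta, 1999):
    # n_components >= 4 log(n_samples) / (eps^2 / 2 - eps^3 / 3)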
denominator = (eps**2 / 2) - (eps**3 / 3) |
|
return (4 * np.log(n_samples) / denominator).astype(np.int64) |
|
|
|
|
|
def _check_density(density, n_features): |
|
"""Factorize density check according to Li et al.""" |
|
if density == "auto": |
|
density = 1 / np.sqrt(n_features) |
|
|
|
elif density <= 0 or density > 1: |
|
raise ValueError("Expected density in range ]0, 1], got: %r" % density) |
|
return density |
|
|
|
|
|
def _check_input_size(n_components, n_features): |
|
"""Factorize argument checking for random matrix generation.""" |
|
if n_components <= 0: |
|
raise ValueError( |
|
"n_components must be strictly positive, got %d" % n_components |
|
) |
|
if n_features <= 0: |
|
raise ValueError("n_features must be strictly positive, got %d" % n_features) |
|
|
|
|
|
def _gaussian_random_matrix(n_components, n_features, random_state=None): |
|
"""Generate a dense Gaussian random matrix. |
|
|
|
The components of the random matrix are drawn from |
|
|
|
N(0, 1.0 / n_components). |
|
|
|
Read more in the :ref:`User Guide <gaussian_random_matrix>`. |
|
|
|
Parameters |
|
---------- |
|
    n_components : int
|
Dimensionality of the target projection space. |
|
|
|
    n_features : int
|
Dimensionality of the original source space. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Controls the pseudo random number generator used to generate the matrix |
|
at fit time. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
Returns |
|
------- |
|
components : ndarray of shape (n_components, n_features) |
|
The generated Gaussian random matrix. |
|
|
|
See Also |
|
-------- |
|
GaussianRandomProjection |
|
""" |
|
_check_input_size(n_components, n_features) |
|
rng = check_random_state(random_state) |
|
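    # Entries are i.i.d. N(0, 1 / n_components); the 1 / sqrt(n_components)
    # scale preserves squared Euclidean norms in expectation.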
components = rng.normal( |
|
loc=0.0, scale=1.0 / np.sqrt(n_components), size=(n_components, n_features) |
|
) |
|
return components |
|
|
|
|
|
def _sparse_random_matrix(n_components, n_features, density="auto", random_state=None): |
|
"""Generalized Achlioptas random sparse matrix for random projection. |
|
|
|
Setting density to 1 / 3 will yield the original matrix by Dimitris |
|
Achlioptas while setting a lower value will yield the generalization |
|
by Ping Li et al. |
|
|
|
If we note :math:`s = 1 / density`, the components of the random matrix are |
|
drawn from: |
|
|
|
- -sqrt(s) / sqrt(n_components) with probability 1 / 2s |
|
- 0 with probability 1 - 1 / s |
|
- +sqrt(s) / sqrt(n_components) with probability 1 / 2s |
|
|
|
Read more in the :ref:`User Guide <sparse_random_matrix>`. |
|
|
|
Parameters |
|
---------- |
|
    n_components : int
|
Dimensionality of the target projection space. |
|
|
|
    n_features : int
|
Dimensionality of the original source space. |
|
|
|
density : float or 'auto', default='auto' |
|
Ratio of non-zero component in the random projection matrix in the |
|
range `(0, 1]` |
|
|
|
If density = 'auto', the value is set to the minimum density |
|
as recommended by Ping Li et al.: 1 / sqrt(n_features). |
|
|
|
Use density = 1 / 3.0 if you want to reproduce the results from |
|
Achlioptas, 2001. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Controls the pseudo random number generator used to generate the matrix |
|
at fit time. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
Returns |
|
------- |
|
components : {ndarray, sparse matrix} of shape (n_components, n_features) |
|
        The generated sparse random matrix. The sparse matrix will be in
        CSR format.
|
|
|
See Also |
|
-------- |
|
SparseRandomProjection |
|
|
|
References |
|
---------- |
|
|
|
.. [1] Ping Li, T. Hastie and K. W. Church, 2006, |
|
"Very Sparse Random Projections". |
|
https://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf |
|
|
|
.. [2] D. Achlioptas, 2001, "Database-friendly random projections", |
|
https://cgi.di.uoa.gr/~optas/papers/jl.pdf |
|
|
|
""" |
|
_check_input_size(n_components, n_features) |
|
density = _check_density(density, n_features) |
|
rng = check_random_state(random_state) |
|
|
|
if density == 1: |
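        # density = 1 degenerates to a dense Rademacher matrix with
        # entries +-1 / sqrt(n_components); no sparse structure is needed.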
|
|
|
components = rng.binomial(1, 0.5, (n_components, n_features)) * 2 - 1 |
|
return 1 / np.sqrt(n_components) * components |
|
|
|
else: |
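        # Build the CSR structure row by row: draw the number of non-zeros
        # of each row from Binomial(n_features, density), then pick their
        # column indices without replacement.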
|
|
|
indices = [] |
|
offset = 0 |
|
indptr = [offset] |
|
for _ in range(n_components): |
|
|
|
n_nonzero_i = rng.binomial(n_features, density) |
|
indices_i = sample_without_replacement( |
|
n_features, n_nonzero_i, random_state=rng |
|
) |
|
indices.append(indices_i) |
|
offset += n_nonzero_i |
|
indptr.append(offset) |
|
|
|
indices = np.concatenate(indices) |
|
|
|
|
|
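        # Among the non-zero entries, signs are +1 or -1 with equal
        # probability, as in the Achlioptas / Li et al. construction.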
data = rng.binomial(1, 0.5, size=np.size(indices)) * 2 - 1 |
|
|
|
|
|
components = sp.csr_matrix( |
|
(data, indices, indptr), shape=(n_components, n_features) |
|
) |
|
|
|
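        # Scale by sqrt(s) / sqrt(n_components) with s = 1 / density, so
        # that each entry has variance 1 / n_components, as in the dense
        # Gaussian case.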
return np.sqrt(1 / density) / np.sqrt(n_components) * components |
|
|
|
|
|
class BaseRandomProjection( |
|
ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator, metaclass=ABCMeta |
|
): |
|
"""Base class for random projections. |
|
|
|
Warning: This class should not be used directly. |
|
Use derived classes instead. |
|
""" |
|
|
|
_parameter_constraints: dict = { |
|
"n_components": [ |
|
Interval(Integral, 1, None, closed="left"), |
|
StrOptions({"auto"}), |
|
], |
|
"eps": [Interval(Real, 0, None, closed="neither")], |
|
"compute_inverse_components": ["boolean"], |
|
"random_state": ["random_state"], |
|
} |
|
|
|
@abstractmethod |
|
def __init__( |
|
self, |
|
n_components="auto", |
|
*, |
|
eps=0.1, |
|
compute_inverse_components=False, |
|
random_state=None, |
|
): |
|
self.n_components = n_components |
|
self.eps = eps |
|
self.compute_inverse_components = compute_inverse_components |
|
self.random_state = random_state |
|
|
|
@abstractmethod |
|
def _make_random_matrix(self, n_components, n_features): |
|
"""Generate the random projection matrix. |
|
|
|
Parameters |
|
---------- |
|
        n_components : int
|
Dimensionality of the target projection space. |
|
|
|
        n_features : int
|
Dimensionality of the original source space. |
|
|
|
Returns |
|
------- |
|
components : {ndarray, sparse matrix} of shape (n_components, n_features) |
|
The generated random matrix. Sparse matrix will be of CSR format. |
|
|
|
""" |
|
|
|
def _compute_inverse_components(self): |
|
"""Compute the pseudo-inverse of the (densified) components.""" |
|
components = self.components_ |
|
if sp.issparse(components): |
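            # scipy.linalg.pinv requires a dense array.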
|
components = components.toarray() |
|
return linalg.pinv(components, check_finite=False) |
|
|
|
@_fit_context(prefer_skip_nested_validation=True) |
|
def fit(self, X, y=None): |
|
"""Generate a sparse random projection matrix. |
|
|
|
Parameters |
|
---------- |
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
Training set: only the shape is used to find optimal random |
|
matrix dimensions based on the theory referenced in the |
|
            aforementioned papers.
|
|
|
y : Ignored |
|
Not used, present here for API consistency by convention. |
|
|
|
Returns |
|
------- |
|
self : object |
|
BaseRandomProjection class instance. |
|
""" |
|
X = validate_data( |
|
self, X, accept_sparse=["csr", "csc"], dtype=[np.float64, np.float32] |
|
) |
|
|
|
n_samples, n_features = X.shape |
|
|
|
if self.n_components == "auto": |
|
self.n_components_ = johnson_lindenstrauss_min_dim( |
|
n_samples=n_samples, eps=self.eps |
|
) |
|
|
|
if self.n_components_ <= 0: |
|
raise ValueError( |
|
"eps=%f and n_samples=%d lead to a target dimension of " |
|
"%d which is invalid" % (self.eps, n_samples, self.n_components_) |
|
) |
|
|
|
elif self.n_components_ > n_features: |
|
raise ValueError( |
|
"eps=%f and n_samples=%d lead to a target dimension of " |
|
"%d which is larger than the original space with " |
|
"n_features=%d" |
|
% (self.eps, n_samples, self.n_components_, n_features) |
|
) |
|
else: |
|
if self.n_components > n_features: |
|
warnings.warn( |
|
"The number of components is higher than the number of" |
|
" features: n_features < n_components (%s < %s)." |
|
"The dimensionality of the problem will not be reduced." |
|
% (n_features, self.n_components), |
|
DataDimensionalityWarning, |
|
) |
|
|
|
self.n_components_ = self.n_components |
|
|
|
|
|
self.components_ = self._make_random_matrix( |
|
self.n_components_, n_features |
|
).astype(X.dtype, copy=False) |
|
|
|
if self.compute_inverse_components: |
|
self.inverse_components_ = self._compute_inverse_components() |
|
|
|
|
|
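        # Required by ClassNamePrefixFeaturesOutMixin.get_feature_names_out.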
        self._n_features_out = self.n_components_
|
|
|
return self |
|
|
|
def inverse_transform(self, X): |
|
"""Project data back to its original space. |
|
|
|
Returns an array X_original whose transform would be X. Note that even |
|
if X is sparse, X_original is dense: this may use a lot of RAM. |
|
|
|
If `compute_inverse_components` is False, the inverse of the components is |
|
computed during each call to `inverse_transform` which can be costly. |
|
|
|
Parameters |
|
---------- |
|
X : {array-like, sparse matrix} of shape (n_samples, n_components) |
|
Data to be transformed back. |
|
|
|
Returns |
|
------- |
|
X_original : ndarray of shape (n_samples, n_features) |
|
Reconstructed data. |
|
""" |
|
check_is_fitted(self) |
|
|
|
X = check_array(X, dtype=[np.float64, np.float32], accept_sparse=("csr", "csc")) |
|
|
|
if self.compute_inverse_components: |
|
return X @ self.inverse_components_.T |
|
|
|
inverse_components = self._compute_inverse_components() |
|
return X @ inverse_components.T |
|
|
|
def __sklearn_tags__(self): |
|
tags = super().__sklearn_tags__() |
|
tags.transformer_tags.preserves_dtype = ["float64", "float32"] |
|
tags.input_tags.sparse = True |
|
return tags |
|
|
|
|
|
class GaussianRandomProjection(BaseRandomProjection): |
|
"""Reduce dimensionality through Gaussian random projection. |
|
|
|
The components of the random matrix are drawn from N(0, 1 / n_components). |
|
|
|
Read more in the :ref:`User Guide <gaussian_random_matrix>`. |
|
|
|
.. versionadded:: 0.13 |
|
|
|
Parameters |
|
---------- |
|
n_components : int or 'auto', default='auto' |
|
Dimensionality of the target projection space. |
|
|
|
n_components can be automatically adjusted according to the |
|
number of samples in the dataset and the bound given by the |
|
Johnson-Lindenstrauss lemma. In that case the quality of the |
|
embedding is controlled by the ``eps`` parameter. |
|
|
|
        It should be noted that the Johnson-Lindenstrauss lemma can yield
        very conservative estimates of the required number of components
        as it makes no assumption on the structure of the dataset.
|
|
|
eps : float, default=0.1 |
|
Parameter to control the quality of the embedding according to |
|
the Johnson-Lindenstrauss lemma when `n_components` is set to |
|
'auto'. The value should be strictly positive. |
|
|
|
Smaller values lead to better embedding and higher number of |
|
dimensions (n_components) in the target projection space. |
|
|
|
compute_inverse_components : bool, default=False |
|
Learn the inverse transform by computing the pseudo-inverse of the |
|
components during fit. Note that computing the pseudo-inverse does not |
|
scale well to large matrices. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Controls the pseudo random number generator used to generate the |
|
projection matrix at fit time. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
Attributes |
|
---------- |
|
n_components_ : int |
|
Concrete number of components computed when n_components="auto". |
|
|
|
components_ : ndarray of shape (n_components, n_features) |
|
Random matrix used for the projection. |
|
|
|
inverse_components_ : ndarray of shape (n_features, n_components) |
|
Pseudo-inverse of the components, only computed if |
|
`compute_inverse_components` is True. |
|
|
|
.. versionadded:: 1.1 |
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
See Also |
|
-------- |
|
SparseRandomProjection : Reduce dimensionality through sparse |
|
random projection. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.random_projection import GaussianRandomProjection |
|
>>> rng = np.random.RandomState(42) |
|
>>> X = rng.rand(25, 3000) |
|
>>> transformer = GaussianRandomProjection(random_state=rng) |
|
>>> X_new = transformer.fit_transform(X) |
|
>>> X_new.shape |
|
(25, 2759) |
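    >>> # a round-trip sketch using the pseudo-inverse (illustrative only):
    >>> transformer = GaussianRandomProjection(
    ...     n_components=50, compute_inverse_components=True, random_state=0
    ... )
    >>> X_50 = transformer.fit_transform(X)
    >>> transformer.inverse_transform(X_50).shape
    (25, 3000)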
|
""" |
|
|
|
def __init__( |
|
self, |
|
n_components="auto", |
|
*, |
|
eps=0.1, |
|
compute_inverse_components=False, |
|
random_state=None, |
|
): |
|
super().__init__( |
|
n_components=n_components, |
|
eps=eps, |
|
compute_inverse_components=compute_inverse_components, |
|
random_state=random_state, |
|
) |
|
|
|
def _make_random_matrix(self, n_components, n_features): |
|
"""Generate the random projection matrix. |
|
|
|
Parameters |
|
---------- |
|
        n_components : int
|
Dimensionality of the target projection space. |
|
|
|
        n_features : int
|
Dimensionality of the original source space. |
|
|
|
Returns |
|
------- |
|
components : ndarray of shape (n_components, n_features) |
|
The generated random matrix. |
|
""" |
|
random_state = check_random_state(self.random_state) |
|
return _gaussian_random_matrix( |
|
n_components, n_features, random_state=random_state |
|
) |
|
|
|
def transform(self, X): |
|
"""Project the data by using matrix product with the random matrix. |
|
|
|
Parameters |
|
---------- |
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
The input data to project into a smaller dimensional space. |
|
|
|
Returns |
|
------- |
|
X_new : ndarray of shape (n_samples, n_components) |
|
Projected array. |
|
""" |
|
check_is_fitted(self) |
|
X = validate_data( |
|
self, |
|
X, |
|
accept_sparse=["csr", "csc"], |
|
reset=False, |
|
dtype=[np.float64, np.float32], |
|
) |
|
|
|
return X @ self.components_.T |
|
|
|
|
|
class SparseRandomProjection(BaseRandomProjection): |
|
"""Reduce dimensionality through sparse random projection. |
|
|
|
    A sparse random matrix is an alternative to a dense random projection
    matrix, guaranteeing similar embedding quality while being much more
    memory efficient and allowing faster computation of the projected data.
|
|
|
If we note `s = 1 / density` the components of the random matrix are |
|
drawn from: |
|
|
|
.. code-block:: text |
|
|
|
-sqrt(s) / sqrt(n_components) with probability 1 / 2s |
|
0 with probability 1 - 1 / s |
|
+sqrt(s) / sqrt(n_components) with probability 1 / 2s |
|
|
|
Read more in the :ref:`User Guide <sparse_random_matrix>`. |
|
|
|
.. versionadded:: 0.13 |
|
|
|
Parameters |
|
---------- |
|
n_components : int or 'auto', default='auto' |
|
Dimensionality of the target projection space. |
|
|
|
n_components can be automatically adjusted according to the |
|
number of samples in the dataset and the bound given by the |
|
Johnson-Lindenstrauss lemma. In that case the quality of the |
|
embedding is controlled by the ``eps`` parameter. |
|
|
|
        It should be noted that the Johnson-Lindenstrauss lemma can yield
        very conservative estimates of the required number of components
        as it makes no assumption on the structure of the dataset.
|
|
|
density : float or 'auto', default='auto' |
|
Ratio in the range (0, 1] of non-zero component in the random |
|
projection matrix. |
|
|
|
If density = 'auto', the value is set to the minimum density |
|
as recommended by Ping Li et al.: 1 / sqrt(n_features). |
|
|
|
Use density = 1 / 3.0 if you want to reproduce the results from |
|
Achlioptas, 2001. |
|
|
|
eps : float, default=0.1 |
|
Parameter to control the quality of the embedding according to |
|
the Johnson-Lindenstrauss lemma when n_components is set to |
|
'auto'. This value should be strictly positive. |
|
|
|
Smaller values lead to better embedding and higher number of |
|
dimensions (n_components) in the target projection space. |
|
|
|
dense_output : bool, default=False |
|
If True, ensure that the output of the random projection is a |
|
dense numpy array even if the input and random projection matrix |
|
are both sparse. In practice, if the number of components is |
|
        small, the number of zero components in the projected data will
|
be very small and it will be more CPU and memory efficient to |
|
use a dense representation. |
|
|
|
If False, the projected data uses a sparse representation if |
|
the input is sparse. |
|
|
|
compute_inverse_components : bool, default=False |
|
Learn the inverse transform by computing the pseudo-inverse of the |
|
components during fit. Note that the pseudo-inverse is always a dense |
|
array, even if the training data was sparse. This means that it might be |
|
necessary to call `inverse_transform` on a small batch of samples at a |
|
time to avoid exhausting the available memory on the host. Moreover, |
|
computing the pseudo-inverse does not scale well to large matrices. |
|
|
|
random_state : int, RandomState instance or None, default=None |
|
Controls the pseudo random number generator used to generate the |
|
projection matrix at fit time. |
|
Pass an int for reproducible output across multiple function calls. |
|
See :term:`Glossary <random_state>`. |
|
|
|
Attributes |
|
---------- |
|
n_components_ : int |
|
Concrete number of components computed when n_components="auto". |
|
|
|
components_ : sparse matrix of shape (n_components, n_features) |
|
Random matrix used for the projection. Sparse matrix will be of CSR |
|
format. |
|
|
|
inverse_components_ : ndarray of shape (n_features, n_components) |
|
Pseudo-inverse of the components, only computed if |
|
`compute_inverse_components` is True. |
|
|
|
.. versionadded:: 1.1 |
|
|
|
    density_ : float in range ]0, 1]
        Concrete density computed when density = "auto".
|
|
|
n_features_in_ : int |
|
Number of features seen during :term:`fit`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
feature_names_in_ : ndarray of shape (`n_features_in_`,) |
|
Names of features seen during :term:`fit`. Defined only when `X` |
|
has feature names that are all strings. |
|
|
|
.. versionadded:: 1.0 |
|
|
|
See Also |
|
-------- |
|
GaussianRandomProjection : Reduce dimensionality through Gaussian |
|
random projection. |
|
|
|
References |
|
---------- |
|
|
|
.. [1] Ping Li, T. Hastie and K. W. Church, 2006, |
|
"Very Sparse Random Projections". |
|
https://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf |
|
|
|
.. [2] D. Achlioptas, 2001, "Database-friendly random projections", |
|
https://cgi.di.uoa.gr/~optas/papers/jl.pdf |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.random_projection import SparseRandomProjection |
|
>>> rng = np.random.RandomState(42) |
|
>>> X = rng.rand(25, 3000) |
|
>>> transformer = SparseRandomProjection(random_state=rng) |
|
>>> X_new = transformer.fit_transform(X) |
|
>>> X_new.shape |
|
(25, 2759) |
|
>>> # very few components are non-zero |
|
>>> np.mean(transformer.components_ != 0) |
|
np.float64(0.0182...) |
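    >>> # sparse input stays sparse unless dense_output=True (a sketch):
    >>> import scipy.sparse as sp
    >>> X_sparse = sp.random(25, 3000, density=0.01, random_state=0)
    >>> sp.issparse(transformer.fit_transform(X_sparse))
    True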
|
""" |
|
|
|
_parameter_constraints: dict = { |
|
**BaseRandomProjection._parameter_constraints, |
|
"density": [Interval(Real, 0.0, 1.0, closed="right"), StrOptions({"auto"})], |
|
"dense_output": ["boolean"], |
|
} |
|
|
|
def __init__( |
|
self, |
|
n_components="auto", |
|
*, |
|
density="auto", |
|
eps=0.1, |
|
dense_output=False, |
|
compute_inverse_components=False, |
|
random_state=None, |
|
): |
|
super().__init__( |
|
n_components=n_components, |
|
eps=eps, |
|
compute_inverse_components=compute_inverse_components, |
|
random_state=random_state, |
|
) |
|
|
|
self.dense_output = dense_output |
|
self.density = density |
|
|
|
def _make_random_matrix(self, n_components, n_features): |
|
"""Generate the random projection matrix |
|
|
|
Parameters |
|
---------- |
|
n_components : int |
|
Dimensionality of the target projection space. |
|
|
|
n_features : int |
|
Dimensionality of the original source space. |
|
|
|
Returns |
|
------- |
|
components : sparse matrix of shape (n_components, n_features) |
|
The generated random matrix in CSR format. |
|
|
|
""" |
|
random_state = check_random_state(self.random_state) |
|
self.density_ = _check_density(self.density, n_features) |
|
return _sparse_random_matrix( |
|
n_components, n_features, density=self.density_, random_state=random_state |
|
) |
|
|
|
def transform(self, X): |
|
"""Project the data by using matrix product with the random matrix. |
|
|
|
Parameters |
|
---------- |
|
X : {ndarray, sparse matrix} of shape (n_samples, n_features) |
|
The input data to project into a smaller dimensional space. |
|
|
|
Returns |
|
------- |
|
X_new : {ndarray, sparse matrix} of shape (n_samples, n_components) |
|
Projected array. It is a sparse matrix only when the input is sparse and |
|
`dense_output = False`. |
|
""" |
|
check_is_fitted(self) |
|
X = validate_data( |
|
self, |
|
X, |
|
accept_sparse=["csr", "csc"], |
|
reset=False, |
|
dtype=[np.float64, np.float32], |
|
) |
|
|
|
return safe_sparse_dot(X, self.components_.T, dense_output=self.dense_output) |
|
|