|
"""Utilities to work with sparse matrices and arrays written in Cython.""" |
|
|
|
|
|
|
|
|
|
from libc.math cimport fabs, sqrt, isnan |
|
from libc.stdint cimport intptr_t |
|
|
|
import numpy as np |
|
from cython cimport floating |
|
from ..utils._typedefs cimport float64_t, int32_t, int64_t, intp_t, uint64_t |
|
|
|
|
|
# Fused integer type: lets the routines below be specialised for both
# 32-bit and 64-bit sparse index arrays (e.g. the `X_indptr` arguments).
ctypedef fused integral:
    int32_t
    int64_t
|
|
|
|
|
def csr_row_norms(X):
    """Return the squared Euclidean (L2) norm of every row of CSR matrix X.

    Data whose dtype is neither float32 nor float64 is converted to float64
    before the norms are computed.
    """
    is_supported_float = X.dtype in [np.float32, np.float64]
    if not is_supported_float:
        X = X.astype(np.float64)

    data, indptr = X.data, X.indptr
    return _sqeuclidean_row_norms_sparse(data, indptr)
|
|
|
|
|
def _sqeuclidean_row_norms_sparse(
    const floating[::1] X_data,
    const integral[::1] X_indptr,
):
    """Sum the squared stored values of each row of a CSR matrix.

    Parameters
    ----------
    X_data : contiguous float32 or float64 buffer
        Stored values of the matrix.

    X_indptr : contiguous int32 or int64 buffer
        Row pointers; row ``r`` owns ``X_data[X_indptr[r]:X_indptr[r + 1]]``.

    Returns
    -------
    squared_row_norms : ndarray of shape (len(X_indptr) - 1,)
        Same floating dtype as ``X_data``.
    """
    cdef:
        integral n_samples = X_indptr.shape[0] - 1
        integral row, pos
        floating[::1] squared_row_norms

    # The output buffer uses the same precision as the input data.
    if floating is float:
        out_dtype = np.float32
    else:
        out_dtype = np.float64
    squared_row_norms = np.zeros(n_samples, dtype=out_dtype)

    with nogil:
        for row in range(n_samples):
            for pos in range(X_indptr[row], X_indptr[row + 1]):
                squared_row_norms[row] += X_data[pos] * X_data[pos]

    return np.asarray(squared_row_norms)
|
|
|
|
|
def csr_mean_variance_axis0(X, weights=None, return_sum_weights=False):
    """Weighted feature-wise mean and variance (axis 0) of a CSR matrix.

    Accumulation is carried out in np.float64.

    Parameters
    ----------
    X : CSR sparse matrix, shape (n_samples, n_features)
        Input data. A dtype other than float32/float64 is converted to
        float64 first.

    weights : ndarray of shape (n_samples,), dtype=floating, default=None
        Per-sample weights. When None, every sample gets weight one.

        .. versionadded:: 0.24

    return_sum_weights : bool, default=False
        Whether to also return, for each feature, the sum of weights seen.

        .. versionadded:: 0.24

    Returns
    -------
    means : float array with shape (n_features,)
        Feature-wise means.

    variances : float array with shape (n_features,)
        Feature-wise variances.

    sum_weights : ndarray of shape (n_features,), dtype=floating
        Only returned when return_sum_weights is True.
    """
    if X.dtype not in (np.float32, np.float64):
        X = X.astype(np.float64)

    sample_weights = weights
    if sample_weights is None:
        sample_weights = np.ones(X.shape[0], dtype=X.dtype)

    stats = _csr_mean_variance_axis0(
        X.data, X.shape[0], X.shape[1], X.indices, X.indptr, sample_weights
    )
    means, variances, sum_weights = stats

    if not return_sum_weights:
        return means, variances
    return means, variances, sum_weights
|
|
|
|
|
def _csr_mean_variance_axis0(
    const floating[::1] X_data,
    uint64_t n_samples,
    uint64_t n_features,
    const integral[:] X_indices,
    const integral[:] X_indptr,
    const floating[:] weights,
):
    """Weighted column means and variances from raw CSR buffers, skipping NaN.

    Parameters
    ----------
    X_data, X_indices, X_indptr : CSR buffers of the matrix
    n_samples : not read by this function
    n_features : number of columns; length of every returned array
    weights : one weight per row

    Returns
    -------
    (means, variances, sum_weights) : tuple of ndarrays, shape (n_features,)
        float32 arrays when the data is float32, float64 otherwise.
        ``sum_weights`` is the total weight minus the weight of the rows
        whose stored entry for that column is NaN.
    """
    cdef:
        intp_t row
        intp_t n_rows = X_indptr.shape[0] - 1
        uint64_t feat
        integral pos, col
        float64_t delta
        float64_t implicit_weight

        # All accumulators are float64 regardless of the input precision.
        float64_t[::1] means = np.zeros(n_features)
        float64_t[::1] variances = np.zeros(n_features)
        float64_t[::1] sum_weights = np.full(
            fill_value=np.sum(weights, dtype=np.float64), shape=n_features
        )
        float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
        float64_t[::1] correction = np.zeros(shape=n_features)

        # counts: rows that are not NaN in the column (stored or implicit);
        # counts_nz: stored, non-NaN entries of the column.
        uint64_t[::1] counts = np.full(
            fill_value=weights.shape[0], shape=n_features, dtype=np.uint64
        )
        uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)

    # Pass 1: weighted sums per column; NaN entries are removed from the
    # column's weight total and count instead of being accumulated.
    for row in range(n_rows):
        for pos in range(X_indptr[row], X_indptr[row + 1]):
            col = X_indices[pos]
            if isnan(X_data[pos]):
                sum_weights[col] -= weights[row]
                counts[col] -= 1
            else:
                means[col] += <float64_t>(X_data[pos]) * weights[row]
                sum_weights_nz[col] += weights[row]
                counts_nz[col] += 1

    for feat in range(n_features):
        means[feat] /= sum_weights[feat]

    # Pass 2: weighted deviations of the stored, non-NaN entries.
    for row in range(n_rows):
        for pos in range(X_indptr[row], X_indptr[row + 1]):
            col = X_indices[pos]
            if isnan(X_data[pos]):
                continue
            delta = X_data[pos] - means[col]
            correction[col] += delta * weights[row]
            variances[col] += delta * delta * weights[row]

    for feat in range(n_features):
        if counts[feat] != counts_nz[feat]:
            # Entries that are not stored are zeros: each deviates from the
            # mean by -mean, carrying the weight not covered by stored data.
            implicit_weight = sum_weights[feat] - sum_weights_nz[feat]
            correction[feat] -= implicit_weight * means[feat]
            variances[feat] += implicit_weight * means[feat]**2
        correction[feat] = correction[feat]**2 / sum_weights[feat]
        variances[feat] = (
            (variances[feat] - correction[feat]) / sum_weights[feat]
        )

    if floating is float:
        return (
            np.array(means, dtype=np.float32),
            np.array(variances, dtype=np.float32),
            np.array(sum_weights, dtype=np.float32),
        )
    else:
        return (
            np.asarray(means), np.asarray(variances), np.asarray(sum_weights)
        )
|
|
|
|
|
def csc_mean_variance_axis0(X, weights=None, return_sum_weights=False):
    """Weighted feature-wise mean and variance (axis 0) of a CSC matrix.

    Accumulation is carried out in np.float64.

    Parameters
    ----------
    X : CSC sparse matrix, shape (n_samples, n_features)
        Input data. A dtype other than float32/float64 is converted to
        float64 first.

    weights : ndarray of shape (n_samples,), dtype=floating, default=None
        Per-sample weights. When None, every sample gets weight one.

        .. versionadded:: 0.24

    return_sum_weights : bool, default=False
        Whether to also return, for each feature, the sum of weights seen.

        .. versionadded:: 0.24

    Returns
    -------
    means : float array with shape (n_features,)
        Feature-wise means.

    variances : float array with shape (n_features,)
        Feature-wise variances.

    sum_weights : ndarray of shape (n_features,), dtype=floating
        Only returned when return_sum_weights is True.
    """
    if X.dtype not in (np.float32, np.float64):
        X = X.astype(np.float64)

    sample_weights = weights
    if sample_weights is None:
        sample_weights = np.ones(X.shape[0], dtype=X.dtype)

    stats = _csc_mean_variance_axis0(
        X.data, X.shape[0], X.shape[1], X.indices, X.indptr, sample_weights
    )
    means, variances, sum_weights = stats

    if not return_sum_weights:
        return means, variances
    return means, variances, sum_weights
|
|
|
|
|
def _csc_mean_variance_axis0(
    const floating[::1] X_data,
    uint64_t n_samples,
    uint64_t n_features,
    const integral[:] X_indices,
    const integral[:] X_indptr,
    const floating[:] weights,
):
    """Weighted column means and variances from raw CSC buffers, skipping NaN.

    Parameters
    ----------
    X_data, X_indices, X_indptr : CSC buffers of the matrix
    n_samples : not read by this function
    n_features : number of columns; length of every returned array
    weights : one weight per row

    Returns
    -------
    (means, variances, sum_weights) : tuple of ndarrays, shape (n_features,)
        float32 arrays when the data is float32, float64 otherwise.
        ``sum_weights`` is the total weight minus the weight of the rows
        whose stored entry for that column is NaN.
    """
    cdef:
        integral pos, row, start, stop
        uint64_t col
        float64_t delta
        float64_t implicit_weight

        # All accumulators are float64 regardless of the input precision.
        float64_t[::1] means = np.zeros(n_features)
        float64_t[::1] variances = np.zeros(n_features)
        float64_t[::1] sum_weights = np.full(
            fill_value=np.sum(weights, dtype=np.float64), shape=n_features
        )
        float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
        float64_t[::1] correction = np.zeros(shape=n_features)

        # counts: rows that are not NaN in the column (stored or implicit);
        # counts_nz: stored, non-NaN entries of the column.
        uint64_t[::1] counts = np.full(
            fill_value=weights.shape[0], shape=n_features, dtype=np.uint64
        )
        uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)

    # In CSC layout every statistic of a column depends only on that
    # column's own slice, so each column is processed start to finish.
    for col in range(n_features):
        start = X_indptr[col]
        stop = X_indptr[col + 1]

        # Weighted sum; NaN entries are removed from the weight total/count.
        for pos in range(start, stop):
            row = X_indices[pos]
            if isnan(X_data[pos]):
                sum_weights[col] -= weights[row]
                counts[col] -= 1
            else:
                means[col] += <float64_t>(X_data[pos]) * weights[row]
                sum_weights_nz[col] += weights[row]
                counts_nz[col] += 1
        means[col] /= sum_weights[col]

        # Weighted deviations of the stored, non-NaN entries.
        for pos in range(start, stop):
            row = X_indices[pos]
            if not isnan(X_data[pos]):
                delta = X_data[pos] - means[col]
                correction[col] += delta * weights[row]
                variances[col] += delta * delta * weights[row]

        if counts[col] != counts_nz[col]:
            # Entries that are not stored are zeros: each deviates from the
            # mean by -mean, carrying the weight not covered by stored data.
            implicit_weight = sum_weights[col] - sum_weights_nz[col]
            correction[col] -= implicit_weight * means[col]
            variances[col] += implicit_weight * means[col]**2
        correction[col] = correction[col]**2 / sum_weights[col]
        variances[col] = (
            (variances[col] - correction[col])
        ) / sum_weights[col]

    if floating is float:
        return (np.array(means, dtype=np.float32),
                np.array(variances, dtype=np.float32),
                np.array(sum_weights, dtype=np.float32))
    else:
        return (
            np.asarray(means), np.asarray(variances), np.asarray(sum_weights)
        )
|
|
|
|
|
def incr_mean_variance_axis0(X, last_mean, last_var, last_n, weights=None):
    """Incrementally update axis-0 mean and variance with a CSR or CSC matrix.

    last_mean and last_var are the statistics returned by the previous call
    of this function; both must start out as 0.0. last_n is the sum of
    weights accumulated so far and starts out as 0.

    Parameters
    ----------
    X : CSR or CSC sparse matrix, shape (n_samples, n_features)
        Input data. A dtype other than float32/float64 is converted to
        float64 first.

    last_mean : float array with shape (n_features,)
        Feature-wise means to update with the new data X.

    last_var : float array with shape (n_features,)
        Feature-wise variances to update with the new data X.

    last_n : float array with shape (n_features,)
        Sum of the weights seen so far, before X (with unit weights this is
        the number of samples seen so far).

    weights : float array with shape (n_samples,) or None
        Per-sample weights. When None, every sample gets weight one.

    Returns
    -------
    updated_mean : float array with shape (n_features,)
        Feature-wise means.

    updated_variance : float array with shape (n_features,)
        Feature-wise variances.

    updated_n : float array with shape (n_features,)
        Updated sum of weights seen per feature.

    Notes
    -----
    NaNs are ignored during the computation.

    References
    ----------
    T. Chan, G. Golub, R. LeVeque. Algorithms for computing the sample
    variance: recommendations, The American Statistician, Vol. 37, No. 3,
    pp. 242-247

    Also, see the non-sparse implementation of this in
    `utils.extmath._batch_mean_variance_update`.
    """
    float_dtypes = [np.float32, np.float64]

    if X.dtype not in float_dtypes:
        X = X.astype(np.float64)
    X_dtype = X.dtype

    if weights is None:
        weights = np.ones(X.shape[0], dtype=X_dtype)
    elif weights.dtype not in float_dtypes:
        weights = weights.astype(np.float64, copy=False)

    if last_n.dtype not in float_dtypes:
        last_n = last_n.astype(np.float64, copy=False)

    # Every array handed to the kernel is brought to the dtype of X; the
    # total weight is summed before `weights` is cast.
    return _incr_mean_variance_axis0(
        X.data,
        np.sum(weights),
        X.shape[1],
        X.indices,
        X.indptr,
        X.format,
        last_mean.astype(X_dtype, copy=False),
        last_var.astype(X_dtype, copy=False),
        last_n.astype(X_dtype, copy=False),
        weights.astype(X_dtype, copy=False),
    )
|
|
|
|
|
def _incr_mean_variance_axis0(
    const floating[:] X_data,
    floating n_samples,
    uint64_t n_features,
    const int[:] X_indices,
    const integral[:] X_indptr,
    str X_format,
    floating[:] last_mean,
    floating[:] last_var,
    floating[:] last_n,
    const floating[:] weights,
):
    """Merge the statistics of a new sparse batch into running statistics.

    Parameters
    ----------
    X_data, X_indices, X_indptr : CSR or CSC buffers of the new batch
    n_samples : sum of the batch weights; only forwarded to the per-format
        kernel
    n_features : number of columns
    X_format : 'csr' selects the CSR kernel, anything else the CSC kernel
    last_mean, last_var : running feature-wise mean and variance
    last_n : running feature-wise sum of weights
    weights : one weight per row of the batch

    Returns
    -------
    (updated_mean, updated_var, updated_n) : tuple of ndarrays,
        shape (n_features,), same floating dtype as the data.

    Notes
    -----
    The input arrays are only read, never written. A feature with no
    accumulated weight (``last_n[i]`` not > 0) takes the batch statistics
    as they are, and a feature with no weight in the batch keeps its
    previous statistics.
    """
    cdef:
        uint64_t i
        bint is_first_pass = True

        # Unnormalised (weight-scaled) sums for the feature being merged.
        floating last_sum, last_unnorm_var
        floating new_sum, new_unnorm_var
        floating last_over_new_n

        floating[::1] new_mean
        floating[::1] new_var
        floating[::1] new_n
        floating[::1] updated_mean
        floating[::1] updated_var
        floating[::1] updated_n

    if floating is float:
        dtype = np.float32
    else:
        dtype = np.float64

    # Statistics of the new batch alone.
    if X_format == 'csr':
        new_mean, new_var, new_n = _csr_mean_variance_axis0(
            X_data, n_samples, n_features, X_indices, X_indptr, weights)
    else:
        new_mean, new_var, new_n = _csc_mean_variance_axis0(
            X_data, n_samples, n_features, X_indices, X_indptr, weights)

    # With no weight accumulated for any feature there is nothing to merge.
    for i in range(n_features):
        if last_n[i] > 0:
            is_first_pass = False
            break

    if is_first_pass:
        return np.asarray(new_mean), np.asarray(new_var), np.asarray(new_n)

    updated_mean = np.zeros(n_features, dtype=dtype)
    updated_var = np.zeros(n_features, dtype=dtype)
    updated_n = np.zeros(n_features, dtype=dtype)

    for i in range(n_features):
        if not new_n[i] > 0:
            # The batch carries no weight for this feature.
            updated_mean[i] = last_mean[i]
            updated_var[i] = last_var[i]
            updated_n[i] = last_n[i]
            continue

        updated_n[i] = last_n[i] + new_n[i]

        if not last_n[i] > 0:
            # Nothing accumulated yet for this feature: the merge formula
            # would divide by last_n / new_n == 0, so use the batch
            # statistics directly (same rule as the first pass).
            updated_mean[i] = new_mean[i]
            updated_var[i] = new_var[i]
            continue

        last_over_new_n = last_n[i] / new_n[i]

        # Work on local unnormalised sums so that the caller's arrays are
        # left untouched.
        last_sum = last_mean[i] * last_n[i]
        last_unnorm_var = last_var[i] * last_n[i]
        new_sum = new_mean[i] * new_n[i]
        new_unnorm_var = new_var[i] * new_n[i]

        updated_var[i] = (
            last_unnorm_var + new_unnorm_var +
            last_over_new_n / updated_n[i] *
            (last_sum / last_over_new_n - new_sum)**2
        )
        updated_mean[i] = (last_sum + new_sum) / updated_n[i]
        updated_var[i] /= updated_n[i]

    return (
        np.asarray(updated_mean),
        np.asarray(updated_var),
        np.asarray(updated_n),
    )
|
|
|
|
|
def inplace_csr_row_normalize_l1(X):
    """Scale each row of a CSR matrix or array, in place, to unit L1 norm.

    Rows whose L1 norm is zero are left unchanged.

    Parameters
    ----------
    X : scipy.sparse.csr_matrix and scipy.sparse.csr_array, \
            shape=(n_samples, n_features)
        The input matrix or array to be modified inplace.

    Examples
    --------
    >>> from scipy.sparse import csr_matrix
    >>> from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l1
    >>> import numpy as np
    >>> indptr = np.array([0, 2, 3, 4])
    >>> indices = np.array([0, 1, 2, 3])
    >>> data = np.array([1.0, 2.0, 3.0, 4.0])
    >>> X = csr_matrix((data, indices, indptr), shape=(3, 4))
    >>> X.toarray()
    array([[1., 2., 0., 0.],
           [0., 0., 3., 0.],
           [0., 0., 0., 4.]])
    >>> inplace_csr_row_normalize_l1(X)
    >>> X.toarray()
    array([[0.33...   , 0.66...   , 0.        , 0.        ],
           [0.        , 0.        , 1.        , 0.        ],
           [0.        , 0.        , 0.        , 1.        ]])
    """
    data, shape = X.data, X.shape
    indices, indptr = X.indices, X.indptr
    _inplace_csr_row_normalize_l1(data, shape, indices, indptr)
|
|
|
|
|
def _inplace_csr_row_normalize_l1(
    floating[:] X_data,
    shape,
    const integral[:] X_indices,
    const integral[:] X_indptr,
):
    """Divide every stored value by the L1 norm of its row, in place.

    Parameters
    ----------
    X_data : float32 or float64 buffer, modified in place
    shape : sequence whose first item is the number of rows
    X_indices : column indices (not read by this function)
    X_indptr : row pointers into X_data
    """
    cdef:
        uint64_t n_rows = shape[0]
        uint64_t row
        integral pos
        double abs_total

    for row in range(n_rows):
        # L1 norm of the row, accumulated in double precision.
        abs_total = 0.0
        for pos in range(X_indptr[row], X_indptr[row + 1]):
            abs_total += fabs(X_data[pos])

        # A zero norm (empty or all-zero row) leaves the row as it is,
        # which also avoids dividing by zero.
        if abs_total != 0.0:
            for pos in range(X_indptr[row], X_indptr[row + 1]):
                X_data[pos] /= abs_total
|
|
|
|
|
def inplace_csr_row_normalize_l2(X):
    """Scale each row of a CSR matrix or array, in place, to unit L2 norm.

    Rows whose L2 norm is zero are left unchanged.

    Parameters
    ----------
    X : scipy.sparse.csr_matrix, shape=(n_samples, n_features)
        The input matrix or array to be modified inplace.

    Examples
    --------
    >>> from scipy.sparse import csr_matrix
    >>> from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2
    >>> import numpy as np
    >>> indptr = np.array([0, 2, 3, 4])
    >>> indices = np.array([0, 1, 2, 3])
    >>> data = np.array([1.0, 2.0, 3.0, 4.0])
    >>> X = csr_matrix((data, indices, indptr), shape=(3, 4))
    >>> X.toarray()
    array([[1., 2., 0., 0.],
           [0., 0., 3., 0.],
           [0., 0., 0., 4.]])
    >>> inplace_csr_row_normalize_l2(X)
    >>> X.toarray()
    array([[0.44...   , 0.89...   , 0.        , 0.        ],
           [0.        , 0.        , 1.        , 0.        ],
           [0.        , 0.        , 0.        , 1.        ]])
    """
    data, shape = X.data, X.shape
    indices, indptr = X.indices, X.indptr
    _inplace_csr_row_normalize_l2(data, shape, indices, indptr)
|
|
|
|
|
def _inplace_csr_row_normalize_l2(
    floating[:] X_data,
    shape,
    const integral[:] X_indices,
    const integral[:] X_indptr,
):
    """Divide every stored value by the L2 norm of its row, in place.

    Parameters
    ----------
    X_data : float32 or float64 buffer, modified in place
    shape : sequence whose first item is the number of rows
    X_indices : column indices (not read by this function)
    X_indptr : row pointers into X_data
    """
    cdef:
        uint64_t n_rows = shape[0]
        uint64_t row
        integral pos
        double norm

    for row in range(n_rows):
        # Sum of squares of the row, accumulated in double precision.
        norm = 0.0
        for pos in range(X_indptr[row], X_indptr[row + 1]):
            norm += (X_data[pos] * X_data[pos])

        # A zero norm (empty or all-zero row) leaves the row as it is,
        # which also avoids dividing by zero.
        if norm != 0.0:
            norm = sqrt(norm)
            for pos in range(X_indptr[row], X_indptr[row + 1]):
                X_data[pos] /= norm
|
|
|
|
|
def assign_rows_csr(
    X,
    const intptr_t[:] X_rows,
    const intptr_t[:] out_rows,
    floating[:, ::1] out,
):
    """Write selected rows of a CSR matrix, densified, into rows of `out`.

    Equivalent to out[out_rows] = X[X_rows].toarray() but without copying.
    No-copy supported for both dtype=np.float32 and dtype=np.float64.

    Parameters
    ----------
    X : scipy.sparse.csr_matrix, shape=(n_samples, n_features)
    X_rows : array, dtype=np.intp, shape=n_rows
    out_rows : array, dtype=np.intp, shape=n_rows
    out : array, shape=(arbitrary, n_features)

    Raises
    ------
    ValueError
        If X_rows and out_rows do not have the same length.
    """
    cdef:
        intp_t pair, pos, col, target
        intptr_t src_row
        const floating[:] data = X.data
        # The index buffers are typed as 32-bit integers.
        const int32_t[:] indices = X.indices
        const int32_t[:] indptr = X.indptr

    if X_rows.shape[0] != out_rows.shape[0]:
        raise ValueError("cannot assign %d rows to %d"
                         % (X_rows.shape[0], out_rows.shape[0]))

    with nogil:
        # Clear every destination row first so that positions without a
        # stored value end up as zero.
        for target in range(out_rows.shape[0]):
            out[out_rows[target]] = 0.0

        # Then scatter the stored values of each source row.
        for pair in range(X_rows.shape[0]):
            src_row = X_rows[pair]
            for pos in range(indptr[src_row], indptr[src_row + 1]):
                col = indices[pos]
                out[out_rows[pair], col] = data[pos]
|
|