spam-classifier / venv /lib /python3.11 /site-packages /sklearn /utils /sparsefuncs_fast.pyx

Sam Chaudry

Upload folder using huggingface_hub

7885a28 verified 3 months ago

21.8 kB

	"""Utilities to work with sparse matrices and arrays written in Cython."""

	# Authors: The scikit-learn developers
	# SPDX-License-Identifier: BSD-3-Clause

	from libc.math cimport fabs, sqrt, isnan
	from libc.stdint cimport intptr_t

	import numpy as np
	from cython cimport floating
	from ..utils._typedefs cimport float64_t, int32_t, int64_t, intp_t, uint64_t


	ctypedef fused integral:
	int32_t
	int64_t


	def csr_row_norms(X):
	"""Squared L2 norm of each row in CSR matrix X."""
	if X.dtype not in [np.float32, np.float64]:
	X = X.astype(np.float64)
	return _sqeuclidean_row_norms_sparse(X.data, X.indptr)


	def _sqeuclidean_row_norms_sparse(
	const floating[::1] X_data,
	const integral[::1] X_indptr,
	):
	cdef:
	integral n_samples = X_indptr.shape[0] - 1
	integral i, j

	dtype = np.float32 if floating is float else np.float64

	cdef floating[::1] squared_row_norms = np.zeros(n_samples, dtype=dtype)

	with nogil:
	for i in range(n_samples):
	for j in range(X_indptr[i], X_indptr[i + 1]):
	squared_row_norms[i] += X_data[j] * X_data[j]

	return np.asarray(squared_row_norms)


	def csr_mean_variance_axis0(X, weights=None, return_sum_weights=False):
	"""Compute mean and variance along axis 0 on a CSR matrix

	Uses a np.float64 accumulator.

	Parameters
	----------
	X : CSR sparse matrix, shape (n_samples, n_features)
	Input data.

	weights : ndarray of shape (n_samples,), dtype=floating, default=None
	If it is set to None samples will be equally weighted.

	.. versionadded:: 0.24

	return_sum_weights : bool, default=False
	If True, returns the sum of weights seen for each feature.

	.. versionadded:: 0.24

	Returns
	-------
	means : float array with shape (n_features,)
	Feature-wise means

	variances : float array with shape (n_features,)
	Feature-wise variances

	sum_weights : ndarray of shape (n_features,), dtype=floating
	Returned if return_sum_weights is True.
	"""
	if X.dtype not in [np.float32, np.float64]:
	X = X.astype(np.float64)

	if weights is None:
	weights = np.ones(X.shape[0], dtype=X.dtype)

	means, variances, sum_weights = _csr_mean_variance_axis0(
	X.data, X.shape[0], X.shape[1], X.indices, X.indptr, weights)

	if return_sum_weights:
	return means, variances, sum_weights
	return means, variances


	def _csr_mean_variance_axis0(
	const floating[::1] X_data,
	uint64_t n_samples,
	uint64_t n_features,
	const integral[:] X_indices,
	const integral[:] X_indptr,
	const floating[:] weights,
	):
	# Implement the function here since variables using fused types
	# cannot be declared directly and can only be passed as function arguments
	cdef:
	intp_t row_ind
	uint64_t feature_idx
	integral i, col_ind
	float64_t diff
	# means[j] contains the mean of feature j
	float64_t[::1] means = np.zeros(n_features)
	# variances[j] contains the variance of feature j
	float64_t[::1] variances = np.zeros(n_features)

	float64_t[::1] sum_weights = np.full(
	fill_value=np.sum(weights, dtype=np.float64), shape=n_features
	)
	float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
	float64_t[::1] correction = np.zeros(shape=n_features)

	uint64_t[::1] counts = np.full(
	fill_value=weights.shape[0], shape=n_features, dtype=np.uint64
	)
	uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)

	for row_ind in range(len(X_indptr) - 1):
	for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]):
	col_ind = X_indices[i]
	if not isnan(X_data[i]):
	means[col_ind] += <float64_t>(X_data[i]) * weights[row_ind]
	# sum of weights where X[:, col_ind] is non-zero
	sum_weights_nz[col_ind] += weights[row_ind]
	# number of non-zero elements of X[:, col_ind]
	counts_nz[col_ind] += 1
	else:
	# sum of weights where X[:, col_ind] is not nan
	sum_weights[col_ind] -= weights[row_ind]
	# number of non nan elements of X[:, col_ind]
	counts[col_ind] -= 1

	for feature_idx in range(n_features):
	means[feature_idx] /= sum_weights[feature_idx]

	for row_ind in range(len(X_indptr) - 1):
	for i in range(X_indptr[row_ind], X_indptr[row_ind + 1]):
	col_ind = X_indices[i]
	if not isnan(X_data[i]):
	diff = X_data[i] - means[col_ind]
	# correction term of the corrected 2 pass algorithm.
	# See "Algorithms for computing the sample variance: analysis
	# and recommendations", by Chan, Golub, and LeVeque.
	correction[col_ind] += diff * weights[row_ind]
	variances[col_ind] += diff * diff * weights[row_ind]

	for feature_idx in range(n_features):
	if counts[feature_idx] != counts_nz[feature_idx]:
	correction[feature_idx] -= (
	sum_weights[feature_idx] - sum_weights_nz[feature_idx]
	) * means[feature_idx]
	correction[feature_idx] = correction[feature_idx]**2 / sum_weights[feature_idx]
	if counts[feature_idx] != counts_nz[feature_idx]:
	# only compute it when it's guaranteed to be non-zero to avoid
	# catastrophic cancellation.
	variances[feature_idx] += (
	sum_weights[feature_idx] - sum_weights_nz[feature_idx]
	) * means[feature_idx]**2
	variances[feature_idx] = (
	(variances[feature_idx] - correction[feature_idx]) /
	sum_weights[feature_idx]
	)

	if floating is float:
	return (
	np.array(means, dtype=np.float32),
	np.array(variances, dtype=np.float32),
	np.array(sum_weights, dtype=np.float32),
	)
	else:
	return (
	np.asarray(means), np.asarray(variances), np.asarray(sum_weights)
	)


	def csc_mean_variance_axis0(X, weights=None, return_sum_weights=False):
	"""Compute mean and variance along axis 0 on a CSC matrix

	Uses a np.float64 accumulator.

	Parameters
	----------
	X : CSC sparse matrix, shape (n_samples, n_features)
	Input data.

	weights : ndarray of shape (n_samples,), dtype=floating, default=None
	If it is set to None samples will be equally weighted.

	.. versionadded:: 0.24

	return_sum_weights : bool, default=False
	If True, returns the sum of weights seen for each feature.

	.. versionadded:: 0.24

	Returns
	-------
	means : float array with shape (n_features,)
	Feature-wise means

	variances : float array with shape (n_features,)
	Feature-wise variances

	sum_weights : ndarray of shape (n_features,), dtype=floating
	Returned if return_sum_weights is True.
	"""
	if X.dtype not in [np.float32, np.float64]:
	X = X.astype(np.float64)

	if weights is None:
	weights = np.ones(X.shape[0], dtype=X.dtype)

	means, variances, sum_weights = _csc_mean_variance_axis0(
	X.data, X.shape[0], X.shape[1], X.indices, X.indptr, weights)

	if return_sum_weights:
	return means, variances, sum_weights
	return means, variances


	def _csc_mean_variance_axis0(
	const floating[::1] X_data,
	uint64_t n_samples,
	uint64_t n_features,
	const integral[:] X_indices,
	const integral[:] X_indptr,
	const floating[:] weights,
	):
	# Implement the function here since variables using fused types
	# cannot be declared directly and can only be passed as function arguments
	cdef:
	integral i, row_ind
	uint64_t feature_idx, col_ind
	float64_t diff
	# means[j] contains the mean of feature j
	float64_t[::1] means = np.zeros(n_features)
	# variances[j] contains the variance of feature j
	float64_t[::1] variances = np.zeros(n_features)

	float64_t[::1] sum_weights = np.full(
	fill_value=np.sum(weights, dtype=np.float64), shape=n_features
	)
	float64_t[::1] sum_weights_nz = np.zeros(shape=n_features)
	float64_t[::1] correction = np.zeros(shape=n_features)

	uint64_t[::1] counts = np.full(
	fill_value=weights.shape[0], shape=n_features, dtype=np.uint64
	)
	uint64_t[::1] counts_nz = np.zeros(shape=n_features, dtype=np.uint64)

	for col_ind in range(n_features):
	for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]):
	row_ind = X_indices[i]
	if not isnan(X_data[i]):
	means[col_ind] += <float64_t>(X_data[i]) * weights[row_ind]
	# sum of weights where X[:, col_ind] is non-zero
	sum_weights_nz[col_ind] += weights[row_ind]
	# number of non-zero elements of X[:, col_ind]
	counts_nz[col_ind] += 1
	else:
	# sum of weights where X[:, col_ind] is not nan
	sum_weights[col_ind] -= weights[row_ind]
	# number of non nan elements of X[:, col_ind]
	counts[col_ind] -= 1

	for feature_idx in range(n_features):
	means[feature_idx] /= sum_weights[feature_idx]

	for col_ind in range(n_features):
	for i in range(X_indptr[col_ind], X_indptr[col_ind + 1]):
	row_ind = X_indices[i]
	if not isnan(X_data[i]):
	diff = X_data[i] - means[col_ind]
	# correction term of the corrected 2 pass algorithm.
	# See "Algorithms for computing the sample variance: analysis
	# and recommendations", by Chan, Golub, and LeVeque.
	correction[col_ind] += diff * weights[row_ind]
	variances[col_ind] += diff * diff * weights[row_ind]

	for feature_idx in range(n_features):
	if counts[feature_idx] != counts_nz[feature_idx]:
	correction[feature_idx] -= (
	sum_weights[feature_idx] - sum_weights_nz[feature_idx]
	) * means[feature_idx]
	correction[feature_idx] = correction[feature_idx]**2 / sum_weights[feature_idx]
	if counts[feature_idx] != counts_nz[feature_idx]:
	# only compute it when it's guaranteed to be non-zero to avoid
	# catastrophic cancellation.
	variances[feature_idx] += (
	sum_weights[feature_idx] - sum_weights_nz[feature_idx]
	) * means[feature_idx]**2
	variances[feature_idx] = (
	(variances[feature_idx] - correction[feature_idx])
	) / sum_weights[feature_idx]

	if floating is float:
	return (np.array(means, dtype=np.float32),
	np.array(variances, dtype=np.float32),
	np.array(sum_weights, dtype=np.float32))
	else:
	return (
	np.asarray(means), np.asarray(variances), np.asarray(sum_weights)
	)


	def incr_mean_variance_axis0(X, last_mean, last_var, last_n, weights=None):
	"""Compute mean and variance along axis 0 on a CSR or CSC matrix.

	last_mean, last_var are the statistics computed at the last step by this
	function. Both must be initialized to 0.0. last_n is the
	number of samples encountered until now and is initialized at 0.

	Parameters
	----------
	X : CSR or CSC sparse matrix, shape (n_samples, n_features)
	Input data.

	last_mean : float array with shape (n_features,)
	Array of feature-wise means to update with the new data X.

	last_var : float array with shape (n_features,)
	Array of feature-wise var to update with the new data X.

	last_n : float array with shape (n_features,)
	Sum of the weights seen so far (if weights are all set to 1
	this will be the same as number of samples seen so far, before X).

	weights : float array with shape (n_samples,) or None. If it is set
	to None samples will be equally weighted.

	Returns
	-------
	updated_mean : float array with shape (n_features,)
	Feature-wise means

	updated_variance : float array with shape (n_features,)
	Feature-wise variances

	updated_n : int array with shape (n_features,)
	Updated number of samples seen

	Notes
	-----
	NaNs are ignored during the computation.

	References
	----------
	T. Chan, G. Golub, R. LeVeque. Algorithms for computing the sample
	variance: recommendations, The American Statistician, Vol. 37, No. 3,
	pp. 242-247

	Also, see the non-sparse implementation of this in
	`utils.extmath._batch_mean_variance_update`.

	"""
	if X.dtype not in [np.float32, np.float64]:
	X = X.astype(np.float64)
	X_dtype = X.dtype
	if weights is None:
	weights = np.ones(X.shape[0], dtype=X_dtype)
	elif weights.dtype not in [np.float32, np.float64]:
	weights = weights.astype(np.float64, copy=False)
	if last_n.dtype not in [np.float32, np.float64]:
	last_n = last_n.astype(np.float64, copy=False)

	return _incr_mean_variance_axis0(X.data,
	np.sum(weights),
	X.shape[1],
	X.indices,
	X.indptr,
	X.format,
	last_mean.astype(X_dtype, copy=False),
	last_var.astype(X_dtype, copy=False),
	last_n.astype(X_dtype, copy=False),
	weights.astype(X_dtype, copy=False))


	def _incr_mean_variance_axis0(
	const floating[:] X_data,
	floating n_samples,
	uint64_t n_features,
	const int[:] X_indices,
	# X_indptr might be either int32 or int64
	const integral[:] X_indptr,
	str X_format,
	floating[:] last_mean,
	floating[:] last_var,
	floating[:] last_n,
	# previous sum of the weights (ie float)
	const floating[:] weights,
	):
	# Implement the function here since variables using fused types
	# cannot be declared directly and can only be passed as function arguments
	cdef:
	uint64_t i

	# last = stats until now
	# new = the current increment
	# updated = the aggregated stats
	# when arrays, they are indexed by i per-feature
	floating[::1] new_mean
	floating[::1] new_var
	floating[::1] updated_mean
	floating[::1] updated_var

	if floating is float:
	dtype = np.float32
	else:
	dtype = np.float64

	new_mean = np.zeros(n_features, dtype=dtype)
	new_var = np.zeros_like(new_mean, dtype=dtype)
	updated_mean = np.zeros_like(new_mean, dtype=dtype)
	updated_var = np.zeros_like(new_mean, dtype=dtype)

	cdef:
	floating[::1] new_n
	floating[::1] updated_n
	floating[::1] last_over_new_n

	# Obtain new stats first
	updated_n = np.zeros(shape=n_features, dtype=dtype)
	last_over_new_n = np.zeros_like(updated_n, dtype=dtype)

	# X can be a CSR or CSC matrix
	if X_format == 'csr':
	new_mean, new_var, new_n = _csr_mean_variance_axis0(
	X_data, n_samples, n_features, X_indices, X_indptr, weights)
	else: # X_format == 'csc'
	new_mean, new_var, new_n = _csc_mean_variance_axis0(
	X_data, n_samples, n_features, X_indices, X_indptr, weights)

	# First pass
	cdef bint is_first_pass = True
	for i in range(n_features):
	if last_n[i] > 0:
	is_first_pass = False
	break

	if is_first_pass:
	return np.asarray(new_mean), np.asarray(new_var), np.asarray(new_n)

	for i in range(n_features):
	updated_n[i] = last_n[i] + new_n[i]

	# Next passes
	for i in range(n_features):
	if new_n[i] > 0:
	last_over_new_n[i] = dtype(last_n[i]) / dtype(new_n[i])
	# Unnormalized stats
	last_mean[i] *= last_n[i]
	last_var[i] *= last_n[i]
	new_mean[i] *= new_n[i]
	new_var[i] *= new_n[i]
	# Update stats
	updated_var[i] = (
	last_var[i] + new_var[i] +
	last_over_new_n[i] / updated_n[i] *
	(last_mean[i] / last_over_new_n[i] - new_mean[i])**2
	)
	updated_mean[i] = (last_mean[i] + new_mean[i]) / updated_n[i]
	updated_var[i] /= updated_n[i]
	else:
	updated_var[i] = last_var[i]
	updated_mean[i] = last_mean[i]
	updated_n[i] = last_n[i]

	return (
	np.asarray(updated_mean),
	np.asarray(updated_var),
	np.asarray(updated_n),
	)


	def inplace_csr_row_normalize_l1(X):
	"""Normalize inplace the rows of a CSR matrix or array by their L1 norm.

	Parameters
	----------
	X : scipy.sparse.csr_matrix and scipy.sparse.csr_array, \
	shape=(n_samples, n_features)
	The input matrix or array to be modified inplace.

	Examples
	--------
	>>> from scipy.sparse import csr_matrix
	>>> from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l1
	>>> import numpy as np
	>>> indptr = np.array([0, 2, 3, 4])
	>>> indices = np.array([0, 1, 2, 3])
	>>> data = np.array([1.0, 2.0, 3.0, 4.0])
	>>> X = csr_matrix((data, indices, indptr), shape=(3, 4))
	>>> X.toarray()
	array([[1., 2., 0., 0.],
	[0., 0., 3., 0.],
	[0., 0., 0., 4.]])
	>>> inplace_csr_row_normalize_l1(X)
	>>> X.toarray()
	array([[0.33... , 0.66... , 0. , 0. ],
	[0. , 0. , 1. , 0. ],
	[0. , 0. , 0. , 1. ]])
	"""
	_inplace_csr_row_normalize_l1(X.data, X.shape, X.indices, X.indptr)


	def _inplace_csr_row_normalize_l1(
	floating[:] X_data,
	shape,
	const integral[:] X_indices,
	const integral[:] X_indptr,
	):
	cdef:
	uint64_t n_samples = shape[0]

	# the column indices for row i are stored in:
	# indices[indptr[i]:indices[i+1]]
	# and their corresponding values are stored in:
	# data[indptr[i]:indptr[i+1]]
	uint64_t i
	integral j
	double sum_

	for i in range(n_samples):
	sum_ = 0.0

	for j in range(X_indptr[i], X_indptr[i + 1]):
	sum_ += fabs(X_data[j])

	if sum_ == 0.0:
	# do not normalize empty rows (can happen if CSR is not pruned
	# correctly)
	continue

	for j in range(X_indptr[i], X_indptr[i + 1]):
	X_data[j] /= sum_


	def inplace_csr_row_normalize_l2(X):
	"""Normalize inplace the rows of a CSR matrix or array by their L2 norm.

	Parameters
	----------
	X : scipy.sparse.csr_matrix, shape=(n_samples, n_features)
	The input matrix or array to be modified inplace.

	Examples
	--------
	>>> from scipy.sparse import csr_matrix
	>>> from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2
	>>> import numpy as np
	>>> indptr = np.array([0, 2, 3, 4])
	>>> indices = np.array([0, 1, 2, 3])
	>>> data = np.array([1.0, 2.0, 3.0, 4.0])
	>>> X = csr_matrix((data, indices, indptr), shape=(3, 4))
	>>> X.toarray()
	array([[1., 2., 0., 0.],
	[0., 0., 3., 0.],
	[0., 0., 0., 4.]])
	>>> inplace_csr_row_normalize_l2(X)
	>>> X.toarray()
	array([[0.44... , 0.89... , 0. , 0. ],
	[0. , 0. , 1. , 0. ],
	[0. , 0. , 0. , 1. ]])
	"""
	_inplace_csr_row_normalize_l2(X.data, X.shape, X.indices, X.indptr)


	def _inplace_csr_row_normalize_l2(
	floating[:] X_data,
	shape,
	const integral[:] X_indices,
	const integral[:] X_indptr,
	):
	cdef:
	uint64_t n_samples = shape[0]
	uint64_t i
	integral j
	double sum_

	for i in range(n_samples):
	sum_ = 0.0

	for j in range(X_indptr[i], X_indptr[i + 1]):
	sum_ += (X_data[j] * X_data[j])

	if sum_ == 0.0:
	# do not normalize empty rows (can happen if CSR is not pruned
	# correctly)
	continue

	sum_ = sqrt(sum_)

	for j in range(X_indptr[i], X_indptr[i + 1]):
	X_data[j] /= sum_


	def assign_rows_csr(
	X,
	const intptr_t[:] X_rows,
	const intptr_t[:] out_rows,
	floating[:, ::1] out,
	):
	"""Densify selected rows of a CSR matrix into a preallocated array.

	Like out[out_rows] = X[X_rows].toarray() but without copying.
	No-copy supported for both dtype=np.float32 and dtype=np.float64.

	Parameters
	----------
	X : scipy.sparse.csr_matrix, shape=(n_samples, n_features)
	X_rows : array, dtype=np.intp, shape=n_rows
	out_rows : array, dtype=np.intp, shape=n_rows
	out : array, shape=(arbitrary, n_features)
	"""
	cdef:
	# intptr_t (npy_intp, np.intp in Python) is what np.where returns,
	# but int is what scipy.sparse uses.
	intp_t i, ind, j, k
	intptr_t rX
	const floating[:] data = X.data
	const int32_t[:] indices = X.indices
	const int32_t[:] indptr = X.indptr

	if X_rows.shape[0] != out_rows.shape[0]:
	raise ValueError("cannot assign %d rows to %d"
	% (X_rows.shape[0], out_rows.shape[0]))

	with nogil:
	for k in range(out_rows.shape[0]):
	out[out_rows[k]] = 0.0

	for i in range(X_rows.shape[0]):
	rX = X_rows[i]
	for ind in range(indptr[rX], indptr[rX + 1]):
	j = indices[ind]
	out[out_rows[i], j] = data[ind]