ModelForge
/

spam-classifier

Text Classification

binary-classification

Model card Files Files and versions Community

spam-classifier / venv /lib /python3.11 /site-packages /sklearn /metrics /cluster /_bicluster.py

Sam Chaudry

Upload folder using huggingface_hub

7885a28 verified about 1 month ago

3.64 kB

	# Authors: The scikit-learn developers
	# SPDX-License-Identifier: BSD-3-Clause

	import numpy as np
	from scipy.optimize import linear_sum_assignment

	from ...utils._param_validation import StrOptions, validate_params
	from ...utils.validation import check_array, check_consistent_length

	# Names exported by ``from <this module> import *``; only the public scorer.
	__all__ = ["consensus_score"]


	def _check_rows_and_columns(a, b):
	    """Validate two bicluster sets and unpack their indicator arrays.

	    Each of ``a`` and ``b`` must unpack into exactly two items
	    ``(rows, columns)``. The items of each set are first checked for
	    consistent length, then each item is passed through ``check_array``
	    with ``ensure_2d=False``.

	    Returns
	    -------
	    a_rows, a_cols, b_rows, b_cols
	        The four validated arrays, in that order.
	    """
	    check_consistent_length(*a)
	    check_consistent_length(*b)

	    def _validated(indicators):
	        return check_array(indicators, ensure_2d=False)

	    # Generator expressions keep validation lazy during unpacking, the same
	    # way ``map`` does.
	    a_rows, a_cols = (_validated(item) for item in a)
	    b_rows, b_cols = (_validated(item) for item in b)
	    return a_rows, a_cols, b_rows, b_cols


	def _jaccard(a_rows, a_cols, b_rows, b_cols):
	"""Jaccard coefficient on the elements of the two biclusters."""
	intersection = (a_rows * b_rows).sum() * (a_cols * b_cols).sum()

	a_size = a_rows.sum() * a_cols.sum()
	b_size = b_rows.sum() * b_cols.sum()

	return intersection / (a_size + b_size - intersection)


	def _pairwise_similarity(a, b, similarity):
	    """Build the matrix of similarities between every pair of biclusters.

	    ``result[i, j]`` is ``similarity`` evaluated on bicluster ``i`` of ``a``
	    and bicluster ``j`` of ``b``, called as
	    ``similarity(a_rows[i], a_cols[i], b_rows[j], b_cols[j])``.
	    """
	    rows_a, cols_a, rows_b, cols_b = _check_rows_and_columns(a, b)

	    scores = []
	    for row_a, col_a in zip(rows_a, cols_a):
	        # One output row per bicluster of ``a``, one entry per bicluster of ``b``.
	        scores.append(
	            [
	                similarity(row_a, col_a, row_b, col_b)
	                for row_b, col_b in zip(rows_b, cols_b)
	            ]
	        )
	    return np.array(scores)


	@validate_params(
	    {
	        "a": [tuple],
	        "b": [tuple],
	        "similarity": [callable, StrOptions({"jaccard"})],
	    },
	    prefer_skip_nested_validation=True,
	)
	def consensus_score(a, b, *, similarity="jaccard"):
	    """Score how well two sets of biclusters agree.

	    A similarity is computed for every pair made of one bicluster from
	    ``a`` and one from ``b``. The best one-to-one matching between the two
	    sets is then found by solving a linear sum assignment problem (using a
	    modified Jonker-Volgenant algorithm). The score is the sum of the
	    matched similarities divided by the size of the larger set.

	    Read more in the :ref:`User Guide <biclustering>`.

	    Parameters
	    ----------
	    a : tuple (rows, columns)
	        Tuple of row and column indicators for a set of biclusters.

	    b : tuple (rows, columns)
	        Another set of biclusters like ``a``.

	    similarity : 'jaccard' or callable, default='jaccard'
	        Either the string "jaccard", selecting the Jaccard coefficient, or
	        a function taking four arguments, each a 1d indicator vector:
	        (a_rows, a_columns, b_rows, b_columns).

	    Returns
	    -------
	    consensus_score : float
	        Consensus score, a non-negative value, sum of similarities
	        divided by size of larger set.

	    See Also
	    --------
	    scipy.optimize.linear_sum_assignment : Solve the linear sum assignment problem.

	    References
	    ----------
	    * Hochreiter, Bodenhofer, et. al., 2010. `FABIA: factor analysis
	      for bicluster acquisition
	      <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2881408/>`__.

	    Examples
	    --------
	    >>> from sklearn.metrics import consensus_score
	    >>> a = ([[True, False], [False, True]], [[False, True], [True, False]])
	    >>> b = ([[False, True], [True, False]], [[True, False], [False, True]])
	    >>> consensus_score(a, b, similarity='jaccard')
	    np.float64(1.0)
	    """
	    similarity_fn = _jaccard if similarity == "jaccard" else similarity
	    scores = _pairwise_similarity(a, b, similarity_fn)

	    # linear_sum_assignment minimises total cost, so similarities are turned
	    # into costs before solving.
	    matched_a, matched_b = linear_sum_assignment(1.0 - scores)

	    larger_set_size = max(len(a[0]), len(b[0]))
	    return scores[matched_a, matched_b].sum() / larger_set_size