File size: 3,642 Bytes
7885a28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numpy as np
from scipy.optimize import linear_sum_assignment
from ...utils._param_validation import StrOptions, validate_params
from ...utils.validation import check_array, check_consistent_length
__all__ = ["consensus_score"]
def _check_rows_and_columns(a, b):
"""Unpacks the row and column arrays and checks their shape."""
check_consistent_length(*a)
check_consistent_length(*b)
checks = lambda x: check_array(x, ensure_2d=False)
a_rows, a_cols = map(checks, a)
b_rows, b_cols = map(checks, b)
return a_rows, a_cols, b_rows, b_cols
def _jaccard(a_rows, a_cols, b_rows, b_cols):
"""Jaccard coefficient on the elements of the two biclusters."""
intersection = (a_rows * b_rows).sum() * (a_cols * b_cols).sum()
a_size = a_rows.sum() * a_cols.sum()
b_size = b_rows.sum() * b_cols.sum()
return intersection / (a_size + b_size - intersection)
def _pairwise_similarity(a, b, similarity):
"""Computes pairwise similarity matrix.
result[i, j] is the Jaccard coefficient of a's bicluster i and b's
bicluster j.
"""
a_rows, a_cols, b_rows, b_cols = _check_rows_and_columns(a, b)
n_a = a_rows.shape[0]
n_b = b_rows.shape[0]
result = np.array(
[
[similarity(a_rows[i], a_cols[i], b_rows[j], b_cols[j]) for j in range(n_b)]
for i in range(n_a)
]
)
return result
@validate_params(
{
"a": [tuple],
"b": [tuple],
"similarity": [callable, StrOptions({"jaccard"})],
},
prefer_skip_nested_validation=True,
)
def consensus_score(a, b, *, similarity="jaccard"):
"""The similarity of two sets of biclusters.
Similarity between individual biclusters is computed. Then the best
matching between sets is found by solving a linear sum assignment problem,
using a modified Jonker-Volgenant algorithm.
The final score is the sum of similarities divided by the size of
the larger set.
Read more in the :ref:`User Guide <biclustering>`.
Parameters
----------
a : tuple (rows, columns)
Tuple of row and column indicators for a set of biclusters.
b : tuple (rows, columns)
Another set of biclusters like ``a``.
similarity : 'jaccard' or callable, default='jaccard'
May be the string "jaccard" to use the Jaccard coefficient, or
any function that takes four arguments, each of which is a 1d
indicator vector: (a_rows, a_columns, b_rows, b_columns).
Returns
-------
consensus_score : float
Consensus score, a non-negative value, sum of similarities
divided by size of larger set.
See Also
--------
scipy.optimize.linear_sum_assignment : Solve the linear sum assignment problem.
References
----------
* Hochreiter, Bodenhofer, et. al., 2010. `FABIA: factor analysis
for bicluster acquisition
<https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2881408/>`__.
Examples
--------
>>> from sklearn.metrics import consensus_score
>>> a = ([[True, False], [False, True]], [[False, True], [True, False]])
>>> b = ([[False, True], [True, False]], [[True, False], [False, True]])
>>> consensus_score(a, b, similarity='jaccard')
np.float64(1.0)
"""
if similarity == "jaccard":
similarity = _jaccard
matrix = _pairwise_similarity(a, b, similarity)
row_indices, col_indices = linear_sum_assignment(1.0 - matrix)
n_a = len(a[0])
n_b = len(b[0])
return matrix[row_indices, col_indices].sum() / max(n_a, n_b)
|