File size: 3,642 Bytes

7885a28

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
from scipy.optimize import linear_sum_assignment

from ...utils._param_validation import StrOptions, validate_params
from ...utils.validation import check_array, check_consistent_length

__all__ = ["consensus_score"]


def _check_rows_and_columns(a, b):
    """Unpacks the row and column arrays and checks their shape."""
    check_consistent_length(*a)
    check_consistent_length(*b)
    checks = lambda x: check_array(x, ensure_2d=False)
    a_rows, a_cols = map(checks, a)
    b_rows, b_cols = map(checks, b)
    return a_rows, a_cols, b_rows, b_cols


def _jaccard(a_rows, a_cols, b_rows, b_cols):
    """Jaccard coefficient on the elements of the two biclusters."""
    intersection = (a_rows * b_rows).sum() * (a_cols * b_cols).sum()

    a_size = a_rows.sum() * a_cols.sum()
    b_size = b_rows.sum() * b_cols.sum()

    return intersection / (a_size + b_size - intersection)


def _pairwise_similarity(a, b, similarity):
    """Computes pairwise similarity matrix.

    result[i, j] is the Jaccard coefficient of a's bicluster i and b's
    bicluster j.

    """
    a_rows, a_cols, b_rows, b_cols = _check_rows_and_columns(a, b)
    n_a = a_rows.shape[0]
    n_b = b_rows.shape[0]
    result = np.array(
        [
            [similarity(a_rows[i], a_cols[i], b_rows[j], b_cols[j]) for j in range(n_b)]
            for i in range(n_a)
        ]
    )
    return result


@validate_params(
    {
        "a": [tuple],
        "b": [tuple],
        "similarity": [callable, StrOptions({"jaccard"})],
    },
    prefer_skip_nested_validation=True,
)
def consensus_score(a, b, *, similarity="jaccard"):
    """The similarity of two sets of biclusters.

    Similarity between individual biclusters is computed. Then the best
    matching between sets is found by solving a linear sum assignment problem,
    using a modified Jonker-Volgenant algorithm.
    The final score is the sum of similarities divided by the size of
    the larger set.

    Read more in the :ref:`User Guide <biclustering>`.

    Parameters
    ----------
    a : tuple (rows, columns)
        Tuple of row and column indicators for a set of biclusters.

    b : tuple (rows, columns)
        Another set of biclusters like ``a``.

    similarity : 'jaccard' or callable, default='jaccard'
        May be the string "jaccard" to use the Jaccard coefficient, or
        any function that takes four arguments, each of which is a 1d
        indicator vector: (a_rows, a_columns, b_rows, b_columns).

    Returns
    -------
    consensus_score : float
       Consensus score, a non-negative value, sum of similarities
       divided by size of larger set.

    See Also
    --------
    scipy.optimize.linear_sum_assignment : Solve the linear sum assignment problem.

    References
    ----------
    * Hochreiter, Bodenhofer, et. al., 2010. `FABIA: factor analysis
      for bicluster acquisition
      <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2881408/>`__.

    Examples
    --------
    >>> from sklearn.metrics import consensus_score
    >>> a = ([[True, False], [False, True]], [[False, True], [True, False]])
    >>> b = ([[False, True], [True, False]], [[True, False], [False, True]])
    >>> consensus_score(a, b, similarity='jaccard')
    np.float64(1.0)
    """
    if similarity == "jaccard":
        similarity = _jaccard
    matrix = _pairwise_similarity(a, b, similarity)
    row_indices, col_indices = linear_sum_assignment(1.0 - matrix)
    n_a = len(a[0])
    n_b = len(b[0])
    return matrix[row_indices, col_indices].sum() / max(n_a, n_b)