|
"""Metrics to assess performance on classification task given scores. |
|
|
|
Functions named as ``*_score`` return a scalar value to maximize: the higher |
|
the better. |
|
|
|
Functions named as ``*_error`` or ``*_loss`` return a scalar value to minimize:
|
the lower the better. |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
import warnings |
|
from functools import partial |
|
from numbers import Integral, Real |
|
|
|
import numpy as np |
|
from scipy.integrate import trapezoid |
|
from scipy.sparse import csr_matrix, issparse |
|
from scipy.stats import rankdata |
|
|
|
from ..exceptions import UndefinedMetricWarning |
|
from ..preprocessing import label_binarize |
|
from ..utils import ( |
|
assert_all_finite, |
|
check_array, |
|
check_consistent_length, |
|
column_or_1d, |
|
) |
|
from ..utils._encode import _encode, _unique |
|
from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params |
|
from ..utils.extmath import stable_cumsum |
|
from ..utils.multiclass import type_of_target |
|
from ..utils.sparsefuncs import count_nonzero |
|
from ..utils.validation import _check_pos_label_consistency, _check_sample_weight |
|
from ._base import _average_binary_score, _average_multiclass_ovo_score |
|
|
|
|
|
@validate_params( |
|
{"x": ["array-like"], "y": ["array-like"]}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def auc(x, y): |
|
"""Compute Area Under the Curve (AUC) using the trapezoidal rule. |
|
|
|
This is a general function, given points on a curve. For computing the |
|
area under the ROC-curve, see :func:`roc_auc_score`. For an alternative |
|
way to summarize a precision-recall curve, see |
|
:func:`average_precision_score`. |
|
|
|
Parameters |
|
---------- |
|
x : array-like of shape (n,) |
|
X coordinates. These must be either monotonic increasing or monotonic |
|
decreasing. |
|
y : array-like of shape (n,) |
|
Y coordinates. |
|
|
|
Returns |
|
------- |
|
auc : float |
|
Area Under the Curve. |
|
|
|
See Also |
|
-------- |
|
roc_auc_score : Compute the area under the ROC curve. |
|
average_precision_score : Compute average precision from prediction scores. |
|
precision_recall_curve : Compute precision-recall pairs for different |
|
probability thresholds. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn import metrics |
|
>>> y = np.array([1, 1, 2, 2]) |
|
>>> pred = np.array([0.1, 0.4, 0.35, 0.8]) |
|
>>> fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2) |
|
>>> metrics.auc(fpr, tpr) |
|
np.float64(0.75) |
|
""" |
|
check_consistent_length(x, y) |
|
x = column_or_1d(x) |
|
y = column_or_1d(y) |
|
|
|
if x.shape[0] < 2: |
|
raise ValueError( |
|
"At least 2 points are needed to compute area under curve, but x.shape = %s" |
|
% x.shape |
|
) |
|
|
|
direction = 1 |
|
dx = np.diff(x) |
|
if np.any(dx < 0): |
|
if np.all(dx <= 0): |
|
direction = -1 |
|
else: |
|
raise ValueError("x is neither increasing nor decreasing : {}.".format(x)) |
|
|
|
area = direction * trapezoid(y, x) |
|
if isinstance(area, np.memmap): |
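        # Reductions like .sum used inside trapezoid do not return a scalar by
        # default for numpy.memmap instances, so convert the result back to a
        # plain scalar of the same dtype.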
|
|
|
|
|
|
|
area = area.dtype.type(area) |
|
return area |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"y_score": ["array-like"], |
|
"average": [StrOptions({"micro", "samples", "weighted", "macro"}), None], |
|
"pos_label": [Real, str, "boolean"], |
|
"sample_weight": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def average_precision_score( |
|
y_true, y_score, *, average="macro", pos_label=1, sample_weight=None |
|
): |
|
"""Compute average precision (AP) from prediction scores. |
|
|
|
AP summarizes a precision-recall curve as the weighted mean of precisions |
|
achieved at each threshold, with the increase in recall from the previous |
|
threshold used as the weight: |
|
|
|
.. math:: |
|
\\text{AP} = \\sum_n (R_n - R_{n-1}) P_n |
|
|
|
where :math:`P_n` and :math:`R_n` are the precision and recall at the nth |
|
threshold [1]_. This implementation is not interpolated and is different |
|
from computing the area under the precision-recall curve with the |
|
trapezoidal rule, which uses linear interpolation and can be too |
|
optimistic. |
|
|
|
Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like of shape (n_samples,) or (n_samples, n_classes) |
|
True binary labels or binary label indicators. |
|
|
|
y_score : array-like of shape (n_samples,) or (n_samples, n_classes) |
|
Target scores, can either be probability estimates of the positive |
|
class, confidence values, or non-thresholded measure of decisions |
|
(as returned by :term:`decision_function` on some classifiers). |
|
For :term:`decision_function` scores, values greater than or equal to |
|
zero should indicate the positive class. |
|
|
|
average : {'micro', 'samples', 'weighted', 'macro'} or None, \ |
|
default='macro' |
|
If ``None``, the scores for each class are returned. Otherwise, |
|
this determines the type of averaging performed on the data: |
|
|
|
``'micro'``: |
|
Calculate metrics globally by considering each element of the label |
|
indicator matrix as a label. |
|
``'macro'``: |
|
Calculate metrics for each label, and find their unweighted |
|
mean. This does not take label imbalance into account. |
|
``'weighted'``: |
|
Calculate metrics for each label, and find their average, weighted |
|
by support (the number of true instances for each label). |
|
``'samples'``: |
|
Calculate metrics for each instance, and find their average. |
|
|
|
Will be ignored when ``y_true`` is binary. |
|
|
|
pos_label : int, float, bool or str, default=1 |
|
The label of the positive class. Only applied to binary ``y_true``. |
|
For multilabel-indicator ``y_true``, ``pos_label`` is fixed to 1. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
Returns |
|
------- |
|
average_precision : float |
|
Average precision score. |
|
|
|
See Also |
|
-------- |
|
roc_auc_score : Compute the area under the ROC curve. |
|
precision_recall_curve : Compute precision-recall pairs for different |
|
probability thresholds. |
|
|
|
Notes |
|
----- |
|
.. versionchanged:: 0.19 |
|
Instead of linearly interpolating between operating points, precisions |
|
are weighted by the change in recall since the last operating point. |
|
|
|
References |
|
---------- |
|
.. [1] `Wikipedia entry for the Average precision |
|
<https://en.wikipedia.org/w/index.php?title=Information_retrieval& |
|
oldid=793358396#Average_precision>`_ |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.metrics import average_precision_score |
|
>>> y_true = np.array([0, 0, 1, 1]) |
|
>>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) |
|
>>> average_precision_score(y_true, y_scores) |
|
np.float64(0.83...) |
|
>>> y_true = np.array([0, 0, 1, 1, 2, 2]) |
|
>>> y_scores = np.array([ |
|
... [0.7, 0.2, 0.1], |
|
... [0.4, 0.3, 0.3], |
|
... [0.1, 0.8, 0.1], |
|
... [0.2, 0.3, 0.5], |
|
... [0.4, 0.4, 0.2], |
|
... [0.1, 0.2, 0.7], |
|
... ]) |
|
>>> average_precision_score(y_true, y_scores) |
|
np.float64(0.77...) |
|
""" |
|
|
|
def _binary_uninterpolated_average_precision( |
|
y_true, y_score, pos_label=1, sample_weight=None |
|
): |
|
precision, recall, _ = precision_recall_curve( |
|
y_true, y_score, pos_label=pos_label, sample_weight=sample_weight |
|
) |
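        # Return the step function integral. This works because the last entry
        # of `precision` returned by precision_recall_curve is guaranteed to be
        # 1. Due to floating point error the sum can come out as -0.0, hence
        # the clipping at 0.0.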
|
|
|
|
|
|
|
|
|
return max(0.0, -np.sum(np.diff(recall) * np.array(precision)[:-1])) |
|
|
|
y_type = type_of_target(y_true, input_name="y_true") |
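    # Convert to Python primitive types to avoid NumPy type / Python str
    # comparison issues when checking `pos_label` against the present labels.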
|
|
|
|
|
|
|
present_labels = np.unique(y_true).tolist() |
|
|
|
if y_type == "binary": |
|
if len(present_labels) == 2 and pos_label not in present_labels: |
|
raise ValueError( |
|
f"pos_label={pos_label} is not a valid label. It should be " |
|
f"one of {present_labels}" |
|
) |
|
|
|
elif y_type == "multilabel-indicator" and pos_label != 1: |
|
raise ValueError( |
|
"Parameter pos_label is fixed to 1 for multilabel-indicator y_true. " |
|
"Do not set pos_label or set pos_label to 1." |
|
) |
|
|
|
elif y_type == "multiclass": |
|
if pos_label != 1: |
|
raise ValueError( |
|
"Parameter pos_label is fixed to 1 for multiclass y_true. " |
|
"Do not set pos_label or set pos_label to 1." |
|
) |
|
y_true = label_binarize(y_true, classes=present_labels) |
|
|
|
average_precision = partial( |
|
_binary_uninterpolated_average_precision, pos_label=pos_label |
|
) |
|
return _average_binary_score( |
|
average_precision, y_true, y_score, average, sample_weight=sample_weight |
|
) |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"y_score": ["array-like"], |
|
"pos_label": [Real, str, "boolean", None], |
|
"sample_weight": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def det_curve(y_true, y_score, pos_label=None, sample_weight=None): |
|
"""Compute error rates for different probability thresholds. |
|
|
|
.. note:: |
|
This metric is used for evaluation of ranking and error tradeoffs of |
|
a binary classification task. |
|
|
|
Read more in the :ref:`User Guide <det_curve>`. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
Parameters |
|
---------- |
|
y_true : ndarray of shape (n_samples,) |
|
True binary labels. If labels are not either {-1, 1} or {0, 1}, then |
|
pos_label should be explicitly given. |
|
|
|
    y_score : ndarray of shape (n_samples,)
|
Target scores, can either be probability estimates of the positive |
|
class, confidence values, or non-thresholded measure of decisions |
|
(as returned by "decision_function" on some classifiers). |
|
For :term:`decision_function` scores, values greater than or equal to |
|
zero should indicate the positive class. |
|
|
|
pos_label : int, float, bool or str, default=None |
|
The label of the positive class. |
|
When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1}, |
|
``pos_label`` is set to 1, otherwise an error will be raised. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
Returns |
|
------- |
|
fpr : ndarray of shape (n_thresholds,) |
|
False positive rate (FPR) such that element i is the false positive |
|
rate of predictions with score >= thresholds[i]. This is occasionally |
|
referred to as false acceptance probability or fall-out. |
|
|
|
fnr : ndarray of shape (n_thresholds,) |
|
False negative rate (FNR) such that element i is the false negative |
|
rate of predictions with score >= thresholds[i]. This is occasionally |
|
referred to as false rejection or miss rate. |
|
|
|
thresholds : ndarray of shape (n_thresholds,) |
|
Decreasing score values. |
|
|
|
See Also |
|
-------- |
|
DetCurveDisplay.from_estimator : Plot DET curve given an estimator and |
|
some data. |
|
DetCurveDisplay.from_predictions : Plot DET curve given the true and |
|
predicted labels. |
|
DetCurveDisplay : DET curve visualization. |
|
roc_curve : Compute Receiver operating characteristic (ROC) curve. |
|
precision_recall_curve : Compute precision-recall curve. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.metrics import det_curve |
|
>>> y_true = np.array([0, 0, 1, 1]) |
|
>>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) |
|
>>> fpr, fnr, thresholds = det_curve(y_true, y_scores) |
|
>>> fpr |
|
array([0.5, 0.5, 0. ]) |
|
>>> fnr |
|
array([0. , 0.5, 0.5]) |
|
>>> thresholds |
|
array([0.35, 0.4 , 0.8 ]) |
|
""" |
|
fps, tps, thresholds = _binary_clf_curve( |
|
y_true, y_score, pos_label=pos_label, sample_weight=sample_weight |
|
) |
|
|
|
if len(np.unique(y_true)) != 2: |
|
raise ValueError( |
|
"Only one class is present in y_true. Detection error " |
|
"tradeoff curve is not defined in that case." |
|
) |
|
|
|
fns = tps[-1] - tps |
|
p_count = tps[-1] |
|
n_count = fps[-1] |
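    # Clip the curve to its informative region: start at the last threshold
    # where the false positive count is still at its minimum and stop at the
    # first threshold where the true positive count reaches its maximum.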
|
|
|
|
|
first_ind = ( |
|
fps.searchsorted(fps[0], side="right") - 1 |
|
if fps.searchsorted(fps[0], side="right") > 0 |
|
else None |
|
) |
|
|
|
last_ind = tps.searchsorted(tps[-1]) + 1 |
|
sl = slice(first_ind, last_ind) |
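    # Reverse the outputs so that the false positive rates are decreasing.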
|
|
|
|
|
return (fps[sl][::-1] / n_count, fns[sl][::-1] / p_count, thresholds[sl][::-1]) |
|
|
|
|
|
def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None): |
|
"""Binary roc auc score.""" |
|
if len(np.unique(y_true)) != 2: |
|
warnings.warn( |
|
( |
|
"Only one class is present in y_true. ROC AUC score " |
|
"is not defined in that case." |
|
), |
|
UndefinedMetricWarning, |
|
) |
|
return np.nan |
|
|
|
fpr, tpr, _ = roc_curve(y_true, y_score, sample_weight=sample_weight) |
|
if max_fpr is None or max_fpr == 1: |
|
return auc(fpr, tpr) |
|
if max_fpr <= 0 or max_fpr > 1: |
|
raise ValueError("Expected max_fpr in range (0, 1], got: %r" % max_fpr) |
|
|
|
|
|
stop = np.searchsorted(fpr, max_fpr, "right") |
|
x_interp = [fpr[stop - 1], fpr[stop]] |
|
y_interp = [tpr[stop - 1], tpr[stop]] |
|
tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp)) |
|
fpr = np.append(fpr[:stop], max_fpr) |
|
partial_auc = auc(fpr, tpr) |
|
|
|
|
|
|
|
min_area = 0.5 * max_fpr**2 |
|
max_area = max_fpr |
|
return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"y_score": ["array-like"], |
|
"average": [StrOptions({"micro", "macro", "samples", "weighted"}), None], |
|
"sample_weight": ["array-like", None], |
|
"max_fpr": [Interval(Real, 0.0, 1, closed="right"), None], |
|
"multi_class": [StrOptions({"raise", "ovr", "ovo"})], |
|
"labels": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def roc_auc_score( |
|
y_true, |
|
y_score, |
|
*, |
|
average="macro", |
|
sample_weight=None, |
|
max_fpr=None, |
|
multi_class="raise", |
|
labels=None, |
|
): |
|
"""Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) \ |
|
from prediction scores. |
|
|
|
Note: this implementation can be used with binary, multiclass and |
|
multilabel classification, but some restrictions apply (see Parameters). |
|
|
|
Read more in the :ref:`User Guide <roc_metrics>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like of shape (n_samples,) or (n_samples, n_classes) |
|
True labels or binary label indicators. The binary and multiclass cases |
|
expect labels with shape (n_samples,) while the multilabel case expects |
|
binary label indicators with shape (n_samples, n_classes). |
|
|
|
y_score : array-like of shape (n_samples,) or (n_samples, n_classes) |
|
Target scores. |
|
|
|
* In the binary case, it corresponds to an array of shape |
|
`(n_samples,)`. Both probability estimates and non-thresholded |
|
decision values can be provided. The probability estimates correspond |
|
to the **probability of the class with the greater label**, |
|
i.e. `estimator.classes_[1]` and thus |
|
`estimator.predict_proba(X, y)[:, 1]`. The decision values |
|
corresponds to the output of `estimator.decision_function(X, y)`. |
|
See more information in the :ref:`User guide <roc_auc_binary>`; |
|
* In the multiclass case, it corresponds to an array of shape |
|
`(n_samples, n_classes)` of probability estimates provided by the |
|
`predict_proba` method. The probability estimates **must** |
|
sum to 1 across the possible classes. In addition, the order of the |
|
class scores must correspond to the order of ``labels``, |
|
if provided, or else to the numerical or lexicographical order of |
|
the labels in ``y_true``. See more information in the |
|
:ref:`User guide <roc_auc_multiclass>`; |
|
* In the multilabel case, it corresponds to an array of shape |
|
`(n_samples, n_classes)`. Probability estimates are provided by the |
|
`predict_proba` method and the non-thresholded decision values by |
|
the `decision_function` method. The probability estimates correspond |
|
to the **probability of the class with the greater label for each |
|
output** of the classifier. See more information in the |
|
:ref:`User guide <roc_auc_multilabel>`. |
|
|
|
average : {'micro', 'macro', 'samples', 'weighted'} or None, \ |
|
default='macro' |
|
If ``None``, the scores for each class are returned. |
|
Otherwise, this determines the type of averaging performed on the data. |
|
Note: multiclass ROC AUC currently only handles the 'macro' and |
|
'weighted' averages. For multiclass targets, `average=None` is only |
|
implemented for `multi_class='ovr'` and `average='micro'` is only |
|
implemented for `multi_class='ovr'`. |
|
|
|
``'micro'``: |
|
Calculate metrics globally by considering each element of the label |
|
indicator matrix as a label. |
|
``'macro'``: |
|
Calculate metrics for each label, and find their unweighted |
|
mean. This does not take label imbalance into account. |
|
``'weighted'``: |
|
Calculate metrics for each label, and find their average, weighted |
|
by support (the number of true instances for each label). |
|
``'samples'``: |
|
Calculate metrics for each instance, and find their average. |
|
|
|
Will be ignored when ``y_true`` is binary. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
max_fpr : float > 0 and <= 1, default=None |
|
If not ``None``, the standardized partial AUC [2]_ over the range |
|
        [0, max_fpr] is returned. For the multiclass case, ``max_fpr``
        should be either equal to ``None`` or ``1.0`` as partial AUC ROC
        computation is currently not supported for multiclass.
|
|
|
multi_class : {'raise', 'ovr', 'ovo'}, default='raise' |
|
Only used for multiclass targets. Determines the type of configuration |
|
to use. The default value raises an error, so either |
|
``'ovr'`` or ``'ovo'`` must be passed explicitly. |
|
|
|
``'ovr'``: |
|
Stands for One-vs-rest. Computes the AUC of each class |
|
against the rest [3]_ [4]_. This |
|
treats the multiclass case in the same way as the multilabel case. |
|
Sensitive to class imbalance even when ``average == 'macro'``, |
|
because class imbalance affects the composition of each of the |
|
'rest' groupings. |
|
``'ovo'``: |
|
Stands for One-vs-one. Computes the average AUC of all |
|
possible pairwise combinations of classes [5]_. |
|
Insensitive to class imbalance when |
|
``average == 'macro'``. |
|
|
|
labels : array-like of shape (n_classes,), default=None |
|
Only used for multiclass targets. List of labels that index the |
|
classes in ``y_score``. If ``None``, the numerical or lexicographical |
|
order of the labels in ``y_true`` is used. |
|
|
|
Returns |
|
------- |
|
auc : float |
|
Area Under the Curve score. |
|
|
|
See Also |
|
-------- |
|
average_precision_score : Area under the precision-recall curve. |
|
roc_curve : Compute Receiver operating characteristic (ROC) curve. |
|
RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic |
|
(ROC) curve given an estimator and some data. |
|
RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic |
|
(ROC) curve given the true and predicted values. |
|
|
|
Notes |
|
----- |
|
The Gini Coefficient is a summary measure of the ranking ability of binary |
|
    classifiers. It is expressed using the area under the ROC curve as follows:
|
|
|
G = 2 * AUC - 1 |
|
|
|
    Where G is the Gini coefficient and AUC is the ROC-AUC score. This
    normalization ensures that random guessing yields a score of 0 in
    expectation and that the score is upper bounded by 1.
|
|
|
References |
|
---------- |
|
.. [1] `Wikipedia entry for the Receiver operating characteristic |
|
<https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_ |
|
|
|
.. [2] `Analyzing a portion of the ROC curve. McClish, 1989 |
|
<https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_ |
|
|
|
.. [3] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving |
|
probability estimation trees (Section 6.2), CeDER Working Paper |
|
#IS-00-04, Stern School of Business, New York University. |
|
|
|
.. [4] `Fawcett, T. (2006). An introduction to ROC analysis. Pattern |
|
Recognition Letters, 27(8), 861-874. |
|
<https://www.sciencedirect.com/science/article/pii/S016786550500303X>`_ |
|
|
|
.. [5] `Hand, D.J., Till, R.J. (2001). A Simple Generalisation of the Area |
|
Under the ROC Curve for Multiple Class Classification Problems. |
|
Machine Learning, 45(2), 171-186. |
|
<http://link.springer.com/article/10.1023/A:1010920819831>`_ |
|
.. [6] `Wikipedia entry for the Gini coefficient |
|
<https://en.wikipedia.org/wiki/Gini_coefficient>`_ |
|
|
|
Examples |
|
-------- |
|
Binary case: |
|
|
|
>>> from sklearn.datasets import load_breast_cancer |
|
>>> from sklearn.linear_model import LogisticRegression |
|
>>> from sklearn.metrics import roc_auc_score |
|
>>> X, y = load_breast_cancer(return_X_y=True) |
|
>>> clf = LogisticRegression(solver="liblinear", random_state=0).fit(X, y) |
|
>>> roc_auc_score(y, clf.predict_proba(X)[:, 1]) |
|
np.float64(0.99...) |
|
>>> roc_auc_score(y, clf.decision_function(X)) |
|
np.float64(0.99...) |
|
|
|
Multiclass case: |
|
|
|
>>> from sklearn.datasets import load_iris |
|
>>> X, y = load_iris(return_X_y=True) |
|
>>> clf = LogisticRegression(solver="liblinear").fit(X, y) |
|
>>> roc_auc_score(y, clf.predict_proba(X), multi_class='ovr') |
|
np.float64(0.99...) |
|
|
|
Multilabel case: |
|
|
|
>>> import numpy as np |
|
>>> from sklearn.datasets import make_multilabel_classification |
|
>>> from sklearn.multioutput import MultiOutputClassifier |
|
>>> X, y = make_multilabel_classification(random_state=0) |
|
>>> clf = MultiOutputClassifier(clf).fit(X, y) |
|
>>> # get a list of n_output containing probability arrays of shape |
|
>>> # (n_samples, n_classes) |
|
>>> y_pred = clf.predict_proba(X) |
|
>>> # extract the positive columns for each output |
|
>>> y_pred = np.transpose([pred[:, 1] for pred in y_pred]) |
|
>>> roc_auc_score(y, y_pred, average=None) |
|
array([0.82..., 0.86..., 0.94..., 0.85... , 0.94...]) |
|
>>> from sklearn.linear_model import RidgeClassifierCV |
|
>>> clf = RidgeClassifierCV().fit(X, y) |
|
>>> roc_auc_score(y, clf.decision_function(X), average=None) |
|
array([0.81..., 0.84... , 0.93..., 0.87..., 0.94...]) |
|
""" |
|
|
|
y_type = type_of_target(y_true, input_name="y_true") |
|
y_true = check_array(y_true, ensure_2d=False, dtype=None) |
|
y_score = check_array(y_score, ensure_2d=False) |
|
|
|
if y_type == "multiclass" or ( |
|
y_type == "binary" and y_score.ndim == 2 and y_score.shape[1] > 2 |
|
): |
|
|
|
if max_fpr is not None and max_fpr != 1.0: |
|
raise ValueError( |
|
"Partial AUC computation not available in " |
|
"multiclass setting, 'max_fpr' must be" |
|
" set to `None`, received `max_fpr={0}` " |
|
"instead".format(max_fpr) |
|
) |
|
if multi_class == "raise": |
|
raise ValueError("multi_class must be in ('ovo', 'ovr')") |
|
return _multiclass_roc_auc_score( |
|
y_true, y_score, labels, multi_class, average, sample_weight |
|
) |
|
elif y_type == "binary": |
|
labels = np.unique(y_true) |
|
y_true = label_binarize(y_true, classes=labels)[:, 0] |
|
return _average_binary_score( |
|
partial(_binary_roc_auc_score, max_fpr=max_fpr), |
|
y_true, |
|
y_score, |
|
average, |
|
sample_weight=sample_weight, |
|
) |
|
else: |
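        # multilabel-indicator case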
|
return _average_binary_score( |
|
partial(_binary_roc_auc_score, max_fpr=max_fpr), |
|
y_true, |
|
y_score, |
|
average, |
|
sample_weight=sample_weight, |
|
) |
|
|
|
|
|
def _multiclass_roc_auc_score( |
|
y_true, y_score, labels, multi_class, average, sample_weight |
|
): |
|
"""Multiclass roc auc score. |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like of shape (n_samples,) |
|
True multiclass labels. |
|
|
|
y_score : array-like of shape (n_samples, n_classes) |
|
Target scores corresponding to probability estimates of a sample |
|
belonging to a particular class |
|
|
|
labels : array-like of shape (n_classes,) or None |
|
List of labels to index ``y_score`` used for multiclass. If ``None``, |
|
the lexical order of ``y_true`` is used to index ``y_score``. |
|
|
|
multi_class : {'ovr', 'ovo'} |
|
Determines the type of multiclass configuration to use. |
|
``'ovr'``: |
|
Calculate metrics for the multiclass case using the one-vs-rest |
|
approach. |
|
``'ovo'``: |
|
Calculate metrics for the multiclass case using the one-vs-one |
|
approach. |
|
|
|
average : {'micro', 'macro', 'weighted'} |
|
Determines the type of averaging performed on the pairwise binary |
|
metric scores |
|
``'micro'``: |
|
Calculate metrics for the binarized-raveled classes. Only supported |
|
for `multi_class='ovr'`. |
|
|
|
.. versionadded:: 1.2 |
|
|
|
``'macro'``: |
|
Calculate metrics for each label, and find their unweighted |
|
mean. This does not take label imbalance into account. Classes |
|
are assumed to be uniformly distributed. |
|
``'weighted'``: |
|
Calculate metrics for each label, taking into account the |
|
prevalence of the classes. |
|
|
|
sample_weight : array-like of shape (n_samples,) or None |
|
Sample weights. |
|
|
|
""" |
|
|
|
if not np.allclose(1, y_score.sum(axis=1)): |
|
raise ValueError( |
|
"Target scores need to be probabilities for multiclass " |
|
"roc_auc, i.e. they should sum up to 1.0 over classes" |
|
) |
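    # Validate the combination of the `average` and `multi_class` parameters.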
|
|
|
|
|
average_options = ("macro", "weighted", None) |
|
if multi_class == "ovr": |
|
average_options = ("micro",) + average_options |
|
if average not in average_options: |
|
raise ValueError( |
|
"average must be one of {0} for multiclass problems".format(average_options) |
|
) |
|
|
|
multiclass_options = ("ovo", "ovr") |
|
if multi_class not in multiclass_options: |
|
raise ValueError( |
|
"multi_class='{0}' is not supported " |
|
"for multiclass ROC AUC, multi_class must be " |
|
"in {1}".format(multi_class, multiclass_options) |
|
) |
|
|
|
if average is None and multi_class == "ovo": |
|
raise NotImplementedError( |
|
"average=None is not implemented for multi_class='ovo'." |
|
) |
|
|
|
if labels is not None: |
|
labels = column_or_1d(labels) |
|
classes = _unique(labels) |
|
if len(classes) != len(labels): |
|
raise ValueError("Parameter 'labels' must be unique") |
|
if not np.array_equal(classes, labels): |
|
raise ValueError("Parameter 'labels' must be ordered") |
|
if len(classes) != y_score.shape[1]: |
|
raise ValueError( |
|
"Number of given labels, {0}, not equal to the number " |
|
"of columns in 'y_score', {1}".format(len(classes), y_score.shape[1]) |
|
) |
|
if len(np.setdiff1d(y_true, classes)): |
|
raise ValueError("'y_true' contains labels not in parameter 'labels'") |
|
else: |
|
classes = _unique(y_true) |
|
if len(classes) != y_score.shape[1]: |
|
raise ValueError( |
|
"Number of classes in y_true not equal to the number of " |
|
"columns in 'y_score'" |
|
) |
|
|
|
if multi_class == "ovo": |
|
if sample_weight is not None: |
|
raise ValueError( |
|
"sample_weight is not supported " |
|
"for multiclass one-vs-one ROC AUC, " |
|
"'sample_weight' must be None in this case." |
|
) |
|
y_true_encoded = _encode(y_true, uniques=classes) |
|
|
|
return _average_multiclass_ovo_score( |
|
_binary_roc_auc_score, y_true_encoded, y_score, average=average |
|
) |
|
else: |
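        # One-vs-rest is handled like the multilabel case: binarize y_true and
        # average the per-class binary ROC AUC scores.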
|
|
|
y_true_multilabel = label_binarize(y_true, classes=classes) |
|
return _average_binary_score( |
|
_binary_roc_auc_score, |
|
y_true_multilabel, |
|
y_score, |
|
average, |
|
sample_weight=sample_weight, |
|
) |
|
|
|
|
|
def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): |
|
"""Calculate true and false positives per binary classification threshold. |
|
|
|
Parameters |
|
---------- |
|
y_true : ndarray of shape (n_samples,) |
|
True targets of binary classification. |
|
|
|
y_score : ndarray of shape (n_samples,) |
|
Estimated probabilities or output of a decision function. |
|
|
|
pos_label : int, float, bool or str, default=None |
|
The label of the positive class. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
Returns |
|
------- |
|
fps : ndarray of shape (n_thresholds,) |
|
A count of false positives, at index i being the number of negative |
|
samples assigned a score >= thresholds[i]. The total number of |
|
negative samples is equal to fps[-1] (thus true negatives are given by |
|
fps[-1] - fps). |
|
|
|
tps : ndarray of shape (n_thresholds,) |
|
An increasing count of true positives, at index i being the number |
|
of positive samples assigned a score >= thresholds[i]. The total |
|
number of positive samples is equal to tps[-1] (thus false negatives |
|
are given by tps[-1] - tps). |
|
|
|
thresholds : ndarray of shape (n_thresholds,) |
|
Decreasing score values. |
|
""" |
|
|
|
y_type = type_of_target(y_true, input_name="y_true") |
|
if not (y_type == "binary" or (y_type == "multiclass" and pos_label is not None)): |
|
raise ValueError("{0} format is not supported".format(y_type)) |
|
|
|
check_consistent_length(y_true, y_score, sample_weight) |
|
y_true = column_or_1d(y_true) |
|
y_score = column_or_1d(y_score) |
|
assert_all_finite(y_true) |
|
assert_all_finite(y_score) |
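    # Filter out zero-weighted samples, as they should not impact the result.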
|
|
|
|
|
if sample_weight is not None: |
|
sample_weight = column_or_1d(sample_weight) |
|
sample_weight = _check_sample_weight(sample_weight, y_true) |
|
nonzero_weight_mask = sample_weight != 0 |
|
y_true = y_true[nonzero_weight_mask] |
|
y_score = y_score[nonzero_weight_mask] |
|
sample_weight = sample_weight[nonzero_weight_mask] |
|
|
|
pos_label = _check_pos_label_consistency(pos_label, y_true) |
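    # Make y_true a boolean vector.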
|
|
|
|
|
y_true = y_true == pos_label |
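    # Sort scores and corresponding truth values in decreasing score order.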
|
|
|
|
|
desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1] |
|
y_score = y_score[desc_score_indices] |
|
y_true = y_true[desc_score_indices] |
|
if sample_weight is not None: |
|
weight = sample_weight[desc_score_indices] |
|
else: |
|
weight = 1.0 |
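    # y_score typically has many tied values. Here we extract the indices
    # associated with the distinct values and append the index of the last
    # sample so the end of the curve is included.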
|
|
|
|
|
|
|
|
|
distinct_value_indices = np.where(np.diff(y_score))[0] |
|
threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1] |
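    # Accumulate the true positives with decreasing threshold.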
|
|
|
|
|
tps = stable_cumsum(y_true * weight)[threshold_idxs] |
|
if sample_weight is not None: |
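        # Express fps as a cumsum as well so that it stays increasing even in
        # the presence of floating point errors.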
|
|
|
|
|
fps = stable_cumsum((1 - y_true) * weight)[threshold_idxs] |
|
else: |
|
fps = 1 + threshold_idxs - tps |
|
return fps, tps, y_score[threshold_idxs] |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"y_score": ["array-like", Hidden(None)], |
|
"pos_label": [Real, str, "boolean", None], |
|
"sample_weight": ["array-like", None], |
|
"drop_intermediate": ["boolean"], |
|
"probas_pred": [ |
|
"array-like", |
|
Hidden(StrOptions({"deprecated"})), |
|
], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def precision_recall_curve( |
|
y_true, |
|
y_score=None, |
|
*, |
|
pos_label=None, |
|
sample_weight=None, |
|
drop_intermediate=False, |
|
probas_pred="deprecated", |
|
): |
|
"""Compute precision-recall pairs for different probability thresholds. |
|
|
|
Note: this implementation is restricted to the binary classification task. |
|
|
|
The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of |
|
true positives and ``fp`` the number of false positives. The precision is |
|
intuitively the ability of the classifier not to label as positive a sample |
|
that is negative. |
|
|
|
The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of |
|
true positives and ``fn`` the number of false negatives. The recall is |
|
intuitively the ability of the classifier to find all the positive samples. |
|
|
|
The last precision and recall values are 1. and 0. respectively and do not |
|
have a corresponding threshold. This ensures that the graph starts on the |
|
y axis. |
|
|
|
The first precision and recall values are precision=class balance and recall=1.0 |
|
which corresponds to a classifier that always predicts the positive class. |
|
|
|
Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like of shape (n_samples,) |
|
True binary labels. If labels are not either {-1, 1} or {0, 1}, then |
|
pos_label should be explicitly given. |
|
|
|
y_score : array-like of shape (n_samples,) |
|
Target scores, can either be probability estimates of the positive |
|
class, or non-thresholded measure of decisions (as returned by |
|
`decision_function` on some classifiers). |
|
For :term:`decision_function` scores, values greater than or equal to |
|
zero should indicate the positive class. |
|
|
|
pos_label : int, float, bool or str, default=None |
|
The label of the positive class. |
|
When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1}, |
|
``pos_label`` is set to 1, otherwise an error will be raised. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
drop_intermediate : bool, default=False |
|
Whether to drop some suboptimal thresholds which would not appear |
|
on a plotted precision-recall curve. This is useful in order to create |
|
lighter precision-recall curves. |
|
|
|
.. versionadded:: 1.3 |
|
|
|
probas_pred : array-like of shape (n_samples,) |
|
Target scores, can either be probability estimates of the positive |
|
class, or non-thresholded measure of decisions (as returned by |
|
`decision_function` on some classifiers). |
|
|
|
.. deprecated:: 1.5 |
|
`probas_pred` is deprecated and will be removed in 1.7. Use |
|
`y_score` instead. |
|
|
|
Returns |
|
------- |
|
precision : ndarray of shape (n_thresholds + 1,) |
|
Precision values such that element i is the precision of |
|
predictions with score >= thresholds[i] and the last element is 1. |
|
|
|
recall : ndarray of shape (n_thresholds + 1,) |
|
Decreasing recall values such that element i is the recall of |
|
predictions with score >= thresholds[i] and the last element is 0. |
|
|
|
thresholds : ndarray of shape (n_thresholds,) |
|
Increasing thresholds on the decision function used to compute |
|
        precision and recall where `n_thresholds = len(np.unique(y_score))`.
|
|
|
See Also |
|
-------- |
|
PrecisionRecallDisplay.from_estimator : Plot Precision Recall Curve given |
|
a binary classifier. |
|
PrecisionRecallDisplay.from_predictions : Plot Precision Recall Curve |
|
using predictions from a binary classifier. |
|
average_precision_score : Compute average precision from prediction scores. |
|
det_curve: Compute error rates for different probability thresholds. |
|
roc_curve : Compute Receiver operating characteristic (ROC) curve. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.metrics import precision_recall_curve |
|
>>> y_true = np.array([0, 0, 1, 1]) |
|
>>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) |
|
>>> precision, recall, thresholds = precision_recall_curve( |
|
... y_true, y_scores) |
|
>>> precision |
|
array([0.5 , 0.66666667, 0.5 , 1. , 1. ]) |
|
>>> recall |
|
array([1. , 1. , 0.5, 0.5, 0. ]) |
|
>>> thresholds |
|
array([0.1 , 0.35, 0.4 , 0.8 ]) |
|
""" |
|
|
|
|
|
|
|
if y_score is not None and not isinstance(probas_pred, str): |
|
raise ValueError( |
|
"`probas_pred` and `y_score` cannot be both specified. Please use `y_score`" |
|
" only as `probas_pred` is deprecated in v1.5 and will be removed in v1.7." |
|
) |
|
if y_score is None: |
|
warnings.warn( |
|
( |
|
"probas_pred was deprecated in version 1.5 and will be removed in 1.7." |
|
"Please use ``y_score`` instead." |
|
), |
|
FutureWarning, |
|
) |
|
y_score = probas_pred |
|
|
|
fps, tps, thresholds = _binary_clf_curve( |
|
y_true, y_score, pos_label=pos_label, sample_weight=sample_weight |
|
) |
|
|
|
if drop_intermediate and len(fps) > 2: |
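        # Drop thresholds where the true positive count does not change from
        # the previous or the subsequent point. Such points share the same
        # recall and appear as a vertical line on the plot, so only the first
        # and last point of each run are kept.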
|
|
|
|
|
|
|
|
|
|
|
optimal_idxs = np.where( |
|
np.concatenate( |
|
[[True], np.logical_or(np.diff(tps[:-1]), np.diff(tps[1:])), [True]] |
|
) |
|
)[0] |
|
fps = fps[optimal_idxs] |
|
tps = tps[optimal_idxs] |
|
thresholds = thresholds[optimal_idxs] |
|
|
|
ps = tps + fps |
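    # Initialize the result array with zeros to make sure that precision
    # entries where ps == 0 do not contain uninitialized values.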
|
|
|
|
|
precision = np.zeros_like(tps) |
|
np.divide(tps, ps, out=precision, where=(ps != 0)) |
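    # tps[-1] == 0 means there is no positive label in y_true; in that case
    # recall is set to 1 for all thresholds.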
|
|
|
|
|
|
|
if tps[-1] == 0: |
|
warnings.warn( |
|
"No positive class found in y_true, " |
|
"recall is set to one for all thresholds." |
|
) |
|
recall = np.ones_like(tps) |
|
else: |
|
recall = tps / tps[-1] |
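    # Reverse the outputs so that recall is decreasing.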
|
|
|
|
|
sl = slice(None, None, -1) |
|
return np.hstack((precision[sl], 1)), np.hstack((recall[sl], 0)), thresholds[sl] |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"y_score": ["array-like"], |
|
"pos_label": [Real, str, "boolean", None], |
|
"sample_weight": ["array-like", None], |
|
"drop_intermediate": ["boolean"], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def roc_curve( |
|
y_true, y_score, *, pos_label=None, sample_weight=None, drop_intermediate=True |
|
): |
|
"""Compute Receiver operating characteristic (ROC). |
|
|
|
Note: this implementation is restricted to the binary classification task. |
|
|
|
Read more in the :ref:`User Guide <roc_metrics>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like of shape (n_samples,) |
|
True binary labels. If labels are not either {-1, 1} or {0, 1}, then |
|
pos_label should be explicitly given. |
|
|
|
y_score : array-like of shape (n_samples,) |
|
Target scores, can either be probability estimates of the positive |
|
class, confidence values, or non-thresholded measure of decisions |
|
(as returned by "decision_function" on some classifiers). |
|
For :term:`decision_function` scores, values greater than or equal to |
|
zero should indicate the positive class. |
|
|
|
pos_label : int, float, bool or str, default=None |
|
The label of the positive class. |
|
When ``pos_label=None``, if `y_true` is in {-1, 1} or {0, 1}, |
|
``pos_label`` is set to 1, otherwise an error will be raised. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
drop_intermediate : bool, default=True |
|
Whether to drop some suboptimal thresholds which would not appear |
|
on a plotted ROC curve. This is useful in order to create lighter |
|
ROC curves. |
|
|
|
.. versionadded:: 0.17 |
|
parameter *drop_intermediate*. |
|
|
|
Returns |
|
------- |
|
fpr : ndarray of shape (>2,) |
|
Increasing false positive rates such that element i is the false |
|
positive rate of predictions with score >= `thresholds[i]`. |
|
|
|
tpr : ndarray of shape (>2,) |
|
Increasing true positive rates such that element `i` is the true |
|
positive rate of predictions with score >= `thresholds[i]`. |
|
|
|
thresholds : ndarray of shape (n_thresholds,) |
|
Decreasing thresholds on the decision function used to compute |
|
fpr and tpr. `thresholds[0]` represents no instances being predicted |
|
and is arbitrarily set to `np.inf`. |
|
|
|
See Also |
|
-------- |
|
RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic |
|
(ROC) curve given an estimator and some data. |
|
RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic |
|
(ROC) curve given the true and predicted values. |
|
det_curve: Compute error rates for different probability thresholds. |
|
roc_auc_score : Compute the area under the ROC curve. |
|
|
|
Notes |
|
----- |
|
Since the thresholds are sorted from low to high values, they |
|
are reversed upon returning them to ensure they correspond to both ``fpr`` |
|
and ``tpr``, which are sorted in reversed order during their calculation. |
|
|
|
An arbitrary threshold is added for the case `tpr=0` and `fpr=0` to |
|
ensure that the curve starts at `(0, 0)`. This threshold corresponds to the |
|
`np.inf`. |
|
|
|
References |
|
---------- |
|
.. [1] `Wikipedia entry for the Receiver operating characteristic |
|
<https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_ |
|
|
|
.. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition |
|
Letters, 2006, 27(8):861-874. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn import metrics |
|
>>> y = np.array([1, 1, 2, 2]) |
|
>>> scores = np.array([0.1, 0.4, 0.35, 0.8]) |
|
>>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2) |
|
>>> fpr |
|
array([0. , 0. , 0.5, 0.5, 1. ]) |
|
>>> tpr |
|
array([0. , 0.5, 0.5, 1. , 1. ]) |
|
>>> thresholds |
|
array([ inf, 0.8 , 0.4 , 0.35, 0.1 ]) |
|
""" |
|
fps, tps, thresholds = _binary_clf_curve( |
|
y_true, y_score, pos_label=pos_label, sample_weight=sample_weight |
|
) |
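    # Attempt to drop thresholds corresponding to points in between and
    # collinear with other points. These are always suboptimal and do not
    # appear on a plotted ROC curve (and thus do not affect the AUC).
    # np.diff(_, 2) acts as a second derivative to detect corners; both fps
    # and tps are tested to handle thresholds with multiple data points.
    # Keeping a few extra thresholds is harmless.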
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if drop_intermediate and len(fps) > 2: |
|
optimal_idxs = np.where( |
|
np.r_[True, np.logical_or(np.diff(fps, 2), np.diff(tps, 2)), True] |
|
)[0] |
|
fps = fps[optimal_idxs] |
|
tps = tps[optimal_idxs] |
|
thresholds = thresholds[optimal_idxs] |
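    # Add an extra threshold position so that the curve starts at (0, 0).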
|
|
|
|
|
|
|
tps = np.r_[0, tps] |
|
fps = np.r_[0, fps] |
|
|
|
thresholds = np.r_[np.inf, thresholds] |
|
|
|
if fps[-1] <= 0: |
|
warnings.warn( |
|
"No negative samples in y_true, false positive value should be meaningless", |
|
UndefinedMetricWarning, |
|
) |
|
fpr = np.repeat(np.nan, fps.shape) |
|
else: |
|
fpr = fps / fps[-1] |
|
|
|
if tps[-1] <= 0: |
|
warnings.warn( |
|
"No positive samples in y_true, true positive value should be meaningless", |
|
UndefinedMetricWarning, |
|
) |
|
tpr = np.repeat(np.nan, tps.shape) |
|
else: |
|
tpr = tps / tps[-1] |
|
|
|
return fpr, tpr, thresholds |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like", "sparse matrix"], |
|
"y_score": ["array-like"], |
|
"sample_weight": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def label_ranking_average_precision_score(y_true, y_score, *, sample_weight=None): |
|
"""Compute ranking-based average precision. |
|
|
|
Label ranking average precision (LRAP) is the average over each ground |
|
truth label assigned to each sample, of the ratio of true vs. total |
|
labels with lower score. |
|
|
|
This metric is used in multilabel ranking problem, where the goal |
|
is to give better rank to the labels associated to each sample. |
|
|
|
The obtained score is always strictly greater than 0 and |
|
the best value is 1. |
|
|
|
Read more in the :ref:`User Guide <label_ranking_average_precision>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : {array-like, sparse matrix} of shape (n_samples, n_labels) |
|
True binary labels in binary indicator format. |
|
|
|
y_score : array-like of shape (n_samples, n_labels) |
|
Target scores, can either be probability estimates of the positive |
|
class, confidence values, or non-thresholded measure of decisions |
|
(as returned by "decision_function" on some classifiers). |
|
For :term:`decision_function` scores, values greater than or equal to |
|
zero should indicate the positive class. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
.. versionadded:: 0.20 |
|
|
|
Returns |
|
------- |
|
score : float |
|
Ranking-based average precision score. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.metrics import label_ranking_average_precision_score |
|
>>> y_true = np.array([[1, 0, 0], [0, 0, 1]]) |
|
>>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]]) |
|
>>> label_ranking_average_precision_score(y_true, y_score) |
|
np.float64(0.416...) |
|
""" |
|
check_consistent_length(y_true, y_score, sample_weight) |
|
y_true = check_array(y_true, ensure_2d=False, accept_sparse="csr") |
|
y_score = check_array(y_score, ensure_2d=False) |
|
|
|
if y_true.shape != y_score.shape: |
|
raise ValueError("y_true and y_score have different shape") |
|
|
|
|
|
y_type = type_of_target(y_true, input_name="y_true") |
|
if y_type != "multilabel-indicator" and not ( |
|
y_type == "binary" and y_true.ndim == 2 |
|
): |
|
raise ValueError("{0} format is not supported".format(y_type)) |
|
|
|
if not issparse(y_true): |
|
y_true = csr_matrix(y_true) |
|
|
|
y_score = -y_score |
|
|
|
n_samples, n_labels = y_true.shape |
|
|
|
out = 0.0 |
|
for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])): |
|
relevant = y_true.indices[start:stop] |
|
|
|
if relevant.size == 0 or relevant.size == n_labels: |
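            # If all labels are relevant or all are irrelevant, the ranking is
            # meaningless and the score is defined as 1.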
|
|
|
|
|
aux = 1.0 |
|
else: |
|
scores_i = y_score[i] |
|
rank = rankdata(scores_i, "max")[relevant] |
|
L = rankdata(scores_i[relevant], "max") |
|
aux = (L / rank).mean() |
|
|
|
if sample_weight is not None: |
|
aux = aux * sample_weight[i] |
|
out += aux |
|
|
|
if sample_weight is None: |
|
out /= n_samples |
|
else: |
|
out /= np.sum(sample_weight) |
|
|
|
return out |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"y_score": ["array-like"], |
|
"sample_weight": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def coverage_error(y_true, y_score, *, sample_weight=None): |
|
"""Coverage error measure. |
|
|
|
Compute how far we need to go through the ranked scores to cover all |
|
true labels. The best value is equal to the average number |
|
of labels in ``y_true`` per sample. |
|
|
|
    Ties in ``y_score`` are broken by giving the maximal rank that would have
    been assigned to all tied values.
|
|
|
Note: Our implementation's score is 1 greater than the one given in |
|
Tsoumakas et al., 2010. This extends it to handle the degenerate case |
|
in which an instance has 0 true labels. |
|
|
|
Read more in the :ref:`User Guide <coverage_error>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like of shape (n_samples, n_labels) |
|
True binary labels in binary indicator format. |
|
|
|
y_score : array-like of shape (n_samples, n_labels) |
|
Target scores, can either be probability estimates of the positive |
|
class, confidence values, or non-thresholded measure of decisions |
|
(as returned by "decision_function" on some classifiers). |
|
For :term:`decision_function` scores, values greater than or equal to |
|
zero should indicate the positive class. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
Returns |
|
------- |
|
coverage_error : float |
|
The coverage error. |
|
|
|
References |
|
---------- |
|
.. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010). |
|
Mining multi-label data. In Data mining and knowledge discovery |
|
handbook (pp. 667-685). Springer US. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.metrics import coverage_error |
|
>>> y_true = [[1, 0, 0], [0, 1, 1]] |
|
>>> y_score = [[1, 0, 0], [0, 1, 1]] |
|
>>> coverage_error(y_true, y_score) |
|
np.float64(1.5) |
|
""" |
|
y_true = check_array(y_true, ensure_2d=True) |
|
y_score = check_array(y_score, ensure_2d=True) |
|
check_consistent_length(y_true, y_score, sample_weight) |
|
|
|
y_type = type_of_target(y_true, input_name="y_true") |
|
if y_type != "multilabel-indicator": |
|
raise ValueError("{0} format is not supported".format(y_type)) |
|
|
|
if y_true.shape != y_score.shape: |
|
raise ValueError("y_true and y_score have different shape") |
|
|
|
y_score_mask = np.ma.masked_array(y_score, mask=np.logical_not(y_true)) |
|
y_min_relevant = y_score_mask.min(axis=1).reshape((-1, 1)) |
|
coverage = (y_score >= y_min_relevant).sum(axis=1) |
|
coverage = coverage.filled(0) |
|
|
|
return np.average(coverage, weights=sample_weight) |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like", "sparse matrix"], |
|
"y_score": ["array-like"], |
|
"sample_weight": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def label_ranking_loss(y_true, y_score, *, sample_weight=None): |
|
"""Compute Ranking loss measure. |
|
|
|
Compute the average number of label pairs that are incorrectly ordered |
|
given y_score weighted by the size of the label set and the number of |
|
labels not in the label set. |
|
|
|
This is similar to the error set size, but weighted by the number of |
|
relevant and irrelevant labels. The best performance is achieved with |
|
a ranking loss of zero. |
|
|
|
Read more in the :ref:`User Guide <label_ranking_loss>`. |
|
|
|
.. versionadded:: 0.17 |
|
A function *label_ranking_loss* |
|
|
|
Parameters |
|
---------- |
|
y_true : {array-like, sparse matrix} of shape (n_samples, n_labels) |
|
True binary labels in binary indicator format. |
|
|
|
y_score : array-like of shape (n_samples, n_labels) |
|
Target scores, can either be probability estimates of the positive |
|
class, confidence values, or non-thresholded measure of decisions |
|
(as returned by "decision_function" on some classifiers). |
|
For :term:`decision_function` scores, values greater than or equal to |
|
zero should indicate the positive class. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
Returns |
|
------- |
|
loss : float |
|
Average number of label pairs that are incorrectly ordered given |
|
y_score weighted by the size of the label set and the number of labels not |
|
in the label set. |
|
|
|
References |
|
---------- |
|
.. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010). |
|
Mining multi-label data. In Data mining and knowledge discovery |
|
handbook (pp. 667-685). Springer US. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.metrics import label_ranking_loss |
|
>>> y_true = [[1, 0, 0], [0, 0, 1]] |
|
>>> y_score = [[0.75, 0.5, 1], [1, 0.2, 0.1]] |
|
>>> label_ranking_loss(y_true, y_score) |
|
np.float64(0.75...) |
|
""" |
|
y_true = check_array(y_true, ensure_2d=False, accept_sparse="csr") |
|
y_score = check_array(y_score, ensure_2d=False) |
|
check_consistent_length(y_true, y_score, sample_weight) |
|
|
|
y_type = type_of_target(y_true, input_name="y_true") |
|
if y_type not in ("multilabel-indicator",): |
|
raise ValueError("{0} format is not supported".format(y_type)) |
|
|
|
if y_true.shape != y_score.shape: |
|
raise ValueError("y_true and y_score have different shape") |
|
|
|
n_samples, n_labels = y_true.shape |
|
|
|
y_true = csr_matrix(y_true) |
|
|
|
loss = np.zeros(n_samples) |
|
for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])): |
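        # Sort and bin the label scores.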
|
|
|
unique_scores, unique_inverse = np.unique(y_score[i], return_inverse=True) |
|
true_at_reversed_rank = np.bincount( |
|
unique_inverse[y_true.indices[start:stop]], minlength=len(unique_scores) |
|
) |
|
all_at_reversed_rank = np.bincount(unique_inverse, minlength=len(unique_scores)) |
|
false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank |
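        # Because the scores are binned in sorted order, the number of
        # incorrectly ordered pairs can be counted in linear time by
        # cumulatively counting how many false labels of a given score have a
        # score higher than the accumulated true labels with lower score.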
|
|
|
|
|
|
|
|
|
|
|
loss[i] = np.dot(true_at_reversed_rank.cumsum(), false_at_reversed_rank) |
|
|
|
n_positives = count_nonzero(y_true, axis=1) |
|
with np.errstate(divide="ignore", invalid="ignore"): |
|
loss /= (n_labels - n_positives) * n_positives |
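    # When there are no positive or no negative labels, those samples are
    # considered correct, i.e. the ranking does not matter.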
|
|
|
|
|
|
|
loss[np.logical_or(n_positives == 0, n_positives == n_labels)] = 0.0 |
|
|
|
return np.average(loss, weights=sample_weight) |
|
|
|
|
|
def _dcg_sample_scores(y_true, y_score, k=None, log_base=2, ignore_ties=False): |
|
"""Compute Discounted Cumulative Gain. |
|
|
|
Sum the true scores ranked in the order induced by the predicted scores, |
|
after applying a logarithmic discount. |
|
|
|
This ranking metric yields a high value if true labels are ranked high by |
|
``y_score``. |
|
|
|
Parameters |
|
---------- |
|
y_true : ndarray of shape (n_samples, n_labels) |
|
True targets of multilabel classification, or true scores of entities |
|
to be ranked. |
|
|
|
y_score : ndarray of shape (n_samples, n_labels) |
|
Target scores, can either be probability estimates, confidence values, |
|
or non-thresholded measure of decisions (as returned by |
|
"decision_function" on some classifiers). |
|
|
|
k : int, default=None |
|
Only consider the highest k scores in the ranking. If `None`, use all |
|
outputs. |
|
|
|
log_base : float, default=2 |
|
Base of the logarithm used for the discount. A low value means a |
|
sharper discount (top results are more important). |
|
|
|
ignore_ties : bool, default=False |
|
Assume that there are no ties in y_score (which is likely to be the |
|
case if y_score is continuous) for efficiency gains. |
|
|
|
Returns |
|
------- |
|
discounted_cumulative_gain : ndarray of shape (n_samples,) |
|
The DCG score for each sample. |
|
|
|
See Also |
|
-------- |
|
ndcg_score : The Discounted Cumulative Gain divided by the Ideal Discounted |
|
Cumulative Gain (the DCG obtained for a perfect ranking), in order to |
|
have a score between 0 and 1. |
|
""" |
|
discount = 1 / (np.log(np.arange(y_true.shape[1]) + 2) / np.log(log_base)) |
|
if k is not None: |
|
discount[k:] = 0 |
|
if ignore_ties: |
|
ranking = np.argsort(y_score)[:, ::-1] |
|
ranked = y_true[np.arange(ranking.shape[0])[:, np.newaxis], ranking] |
|
cumulative_gains = discount.dot(ranked.T) |
|
else: |
|
discount_cumsum = np.cumsum(discount) |
|
cumulative_gains = [ |
|
_tie_averaged_dcg(y_t, y_s, discount_cumsum) |
|
for y_t, y_s in zip(y_true, y_score) |
|
] |
|
cumulative_gains = np.asarray(cumulative_gains) |
|
return cumulative_gains |
|
|
|
|
|
def _tie_averaged_dcg(y_true, y_score, discount_cumsum): |
|
""" |
|
Compute DCG by averaging over possible permutations of ties. |
|
|
|
The gain (`y_true`) of an index falling inside a tied group (in the order |
|
induced by `y_score`) is replaced by the average gain within this group. |
|
The discounted gain for a tied group is then the average `y_true` within |
|
this group times the sum of discounts of the corresponding ranks. |
|
|
|
This amounts to averaging scores for all possible orderings of the tied |
|
groups. |
|
|
|
(note in the case of dcg@k the discount is 0 after index k) |
|
|
|
Parameters |
|
---------- |
|
y_true : ndarray |
|
The true relevance scores. |
|
|
|
y_score : ndarray |
|
Predicted scores. |
|
|
|
discount_cumsum : ndarray |
|
Precomputed cumulative sum of the discounts. |
|
|
|
Returns |
|
------- |
|
discounted_cumulative_gain : float |
|
The discounted cumulative gain. |
|
|
|
References |
|
---------- |
|
McSherry, F., & Najork, M. (2008, March). Computing information retrieval |
|
performance measures efficiently in the presence of tied scores. In |
|
European conference on information retrieval (pp. 414-421). Springer, |
|
Berlin, Heidelberg. |
|
""" |
|
_, inv, counts = np.unique(-y_score, return_inverse=True, return_counts=True) |
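    # `ranked` holds the average true gain within each group of tied scores;
    # `groups` indexes the last rank of each group so `discount_sums` is the
    # sum of discounts covered by that group.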
|
ranked = np.zeros(len(counts)) |
|
np.add.at(ranked, inv, y_true) |
|
ranked /= counts |
|
groups = np.cumsum(counts) - 1 |
|
discount_sums = np.empty(len(counts)) |
|
discount_sums[0] = discount_cumsum[groups[0]] |
|
discount_sums[1:] = np.diff(discount_cumsum[groups]) |
|
return (ranked * discount_sums).sum() |
|
|
|
|
|
def _check_dcg_target_type(y_true): |
|
y_type = type_of_target(y_true, input_name="y_true") |
|
supported_fmt = ( |
|
"multilabel-indicator", |
|
"continuous-multioutput", |
|
"multiclass-multioutput", |
|
) |
|
if y_type not in supported_fmt: |
|
raise ValueError( |
|
"Only {} formats are supported. Got {} instead".format( |
|
supported_fmt, y_type |
|
) |
|
) |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"y_score": ["array-like"], |
|
"k": [Interval(Integral, 1, None, closed="left"), None], |
|
"log_base": [Interval(Real, 0.0, None, closed="neither")], |
|
"sample_weight": ["array-like", None], |
|
"ignore_ties": ["boolean"], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def dcg_score( |
|
y_true, y_score, *, k=None, log_base=2, sample_weight=None, ignore_ties=False |
|
): |
|
"""Compute Discounted Cumulative Gain. |
|
|
|
Sum the true scores ranked in the order induced by the predicted scores, |
|
after applying a logarithmic discount. |
|
|
|
This ranking metric yields a high value if true labels are ranked high by |
|
``y_score``. |
|
|
|
Usually the Normalized Discounted Cumulative Gain (NDCG, computed by |
|
ndcg_score) is preferred. |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like of shape (n_samples, n_labels) |
|
True targets of multilabel classification, or true scores of entities |
|
to be ranked. |
|
|
|
y_score : array-like of shape (n_samples, n_labels) |
|
Target scores, can either be probability estimates, confidence values, |
|
or non-thresholded measure of decisions (as returned by |
|
"decision_function" on some classifiers). |
|
|
|
k : int, default=None |
|
Only consider the highest k scores in the ranking. If None, use all |
|
outputs. |
|
|
|
log_base : float, default=2 |
|
Base of the logarithm used for the discount. A low value means a |
|
sharper discount (top results are more important). |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. If `None`, all samples are given the same weight. |
|
|
|
ignore_ties : bool, default=False |
|
Assume that there are no ties in y_score (which is likely to be the |
|
case if y_score is continuous) for efficiency gains. |
|
|
|
Returns |
|
------- |
|
discounted_cumulative_gain : float |
|
The averaged sample DCG scores. |
|
|
|
See Also |
|
-------- |
|
ndcg_score : The Discounted Cumulative Gain divided by the Ideal Discounted |
|
Cumulative Gain (the DCG obtained for a perfect ranking), in order to |
|
have a score between 0 and 1. |
|
|
|
References |
|
---------- |
|
`Wikipedia entry for Discounted Cumulative Gain |
|
<https://en.wikipedia.org/wiki/Discounted_cumulative_gain>`_. |
|
|
|
Jarvelin, K., & Kekalainen, J. (2002). |
|
Cumulated gain-based evaluation of IR techniques. ACM Transactions on |
|
Information Systems (TOIS), 20(4), 422-446. |
|
|
|
Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May). |
|
A theoretical analysis of NDCG ranking measures. In Proceedings of the 26th |
|
Annual Conference on Learning Theory (COLT 2013). |
|
|
|
McSherry, F., & Najork, M. (2008, March). Computing information retrieval |
|
performance measures efficiently in the presence of tied scores. In |
|
European conference on information retrieval (pp. 414-421). Springer, |
|
Berlin, Heidelberg. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.metrics import dcg_score |
|
>>> # we have ground-truth relevance of some answers to a query: |
|
>>> true_relevance = np.asarray([[10, 0, 0, 1, 5]]) |
|
>>> # we predict scores for the answers |
|
>>> scores = np.asarray([[.1, .2, .3, 4, 70]]) |
|
>>> dcg_score(true_relevance, scores) |
|
np.float64(9.49...) |
|
>>> # we can set k to truncate the sum; only top k answers contribute |
|
>>> dcg_score(true_relevance, scores, k=2) |
|
np.float64(5.63...) |
|
>>> # now we have some ties in our prediction |
|
>>> scores = np.asarray([[1, 0, 0, 0, 1]]) |
|
>>> # by default ties are averaged, so here we get the average true |
|
>>> # relevance of our top predictions: (10 + 5) / 2 = 7.5 |
|
>>> dcg_score(true_relevance, scores, k=1) |
|
np.float64(7.5) |
|
>>> # we can choose to ignore ties for faster results, but only |
|
>>> # if we know there aren't ties in our scores, otherwise we get |
|
>>> # wrong results: |
|
>>> dcg_score(true_relevance, |
|
... scores, k=1, ignore_ties=True) |
|
np.float64(5.0) |
|
""" |
|
y_true = check_array(y_true, ensure_2d=False) |
|
y_score = check_array(y_score, ensure_2d=False) |
|
check_consistent_length(y_true, y_score, sample_weight) |
|
_check_dcg_target_type(y_true) |
|
return np.average( |
|
_dcg_sample_scores( |
|
y_true, y_score, k=k, log_base=log_base, ignore_ties=ignore_ties |
|
), |
|
weights=sample_weight, |
|
) |
|
|
|
|
|
def _ndcg_sample_scores(y_true, y_score, k=None, ignore_ties=False): |
|
"""Compute Normalized Discounted Cumulative Gain. |
|
|
|
Sum the true scores ranked in the order induced by the predicted scores, |
|
after applying a logarithmic discount. Then divide by the best possible |
|
score (Ideal DCG, obtained for a perfect ranking) to obtain a score between |
|
0 and 1. |
|
|
|
This ranking metric yields a high value if true labels are ranked high by |
|
``y_score``. |
|
|
|
Parameters |
|
---------- |
|
y_true : ndarray of shape (n_samples, n_labels) |
|
True targets of multilabel classification, or true scores of entities |
|
to be ranked. |
|
|
|
y_score : ndarray of shape (n_samples, n_labels) |
|
        Target scores, which can be probability estimates, confidence values,
        or non-thresholded measures of decisions (as returned by
        "decision_function" on some classifiers).
|
|
|
k : int, default=None |
|
Only consider the highest k scores in the ranking. If None, use all |
|
outputs. |
|
|
|
ignore_ties : bool, default=False |
|
Assume that there are no ties in y_score (which is likely to be the |
|
case if y_score is continuous) for efficiency gains. |
|
|
|
Returns |
|
------- |
|
normalized_discounted_cumulative_gain : ndarray of shape (n_samples,) |
|
The NDCG score for each sample (float in [0., 1.]). |
|
|
|
See Also |
|
-------- |
|
dcg_score : Discounted Cumulative Gain (not normalized). |
|
|
|
""" |
|
    gain = _dcg_sample_scores(y_true, y_score, k, ignore_ties=ignore_ties)
    # Here we use the order induced by y_true so we can ignore ties, since
    # the gain associated with tied indices is the same (permuting tied
    # entries does not change the re-ordered y_true).
normalizing_gain = _dcg_sample_scores(y_true, y_true, k, ignore_ties=True) |
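    # An ideal DCG of 0 means the sample has no relevant labels; give such
    # samples an NDCG of 0 below instead of dividing by zero.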
|
all_irrelevant = normalizing_gain == 0 |
|
gain[all_irrelevant] = 0 |
|
gain[~all_irrelevant] /= normalizing_gain[~all_irrelevant] |
|
return gain |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"y_score": ["array-like"], |
|
"k": [Interval(Integral, 1, None, closed="left"), None], |
|
"sample_weight": ["array-like", None], |
|
"ignore_ties": ["boolean"], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def ndcg_score(y_true, y_score, *, k=None, sample_weight=None, ignore_ties=False): |
|
"""Compute Normalized Discounted Cumulative Gain. |
|
|
|
Sum the true scores ranked in the order induced by the predicted scores, |
|
after applying a logarithmic discount. Then divide by the best possible |
|
score (Ideal DCG, obtained for a perfect ranking) to obtain a score between |
|
0 and 1. |
|
|
|
This ranking metric returns a high value if true labels are ranked high by |
|
``y_score``. |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like of shape (n_samples, n_labels) |
|
True targets of multilabel classification, or true scores of entities |
|
to be ranked. Negative values in `y_true` may result in an output |
|
that is not between 0 and 1. |
|
|
|
y_score : array-like of shape (n_samples, n_labels) |
|
        Target scores, which can be probability estimates, confidence values,
        or non-thresholded measures of decisions (as returned by
        "decision_function" on some classifiers).
|
|
|
k : int, default=None |
|
Only consider the highest k scores in the ranking. If `None`, use all |
|
outputs. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. If `None`, all samples are given the same weight. |
|
|
|
ignore_ties : bool, default=False |
|
Assume that there are no ties in y_score (which is likely to be the |
|
case if y_score is continuous) for efficiency gains. |
|
|
|
Returns |
|
------- |
|
normalized_discounted_cumulative_gain : float in [0., 1.] |
|
        The average of the per-sample NDCG scores.
|
|
|
See Also |
|
-------- |
|
dcg_score : Discounted Cumulative Gain (not normalized). |
|
|
|
References |
|
---------- |
|
`Wikipedia entry for Discounted Cumulative Gain |
|
<https://en.wikipedia.org/wiki/Discounted_cumulative_gain>`_ |
|
|
|
Jarvelin, K., & Kekalainen, J. (2002). |
|
Cumulated gain-based evaluation of IR techniques. ACM Transactions on |
|
Information Systems (TOIS), 20(4), 422-446. |
|
|
|
Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May). |
|
A theoretical analysis of NDCG ranking measures. In Proceedings of the 26th |
|
Annual Conference on Learning Theory (COLT 2013) |
|
|
|
McSherry, F., & Najork, M. (2008, March). Computing information retrieval |
|
performance measures efficiently in the presence of tied scores. In |
|
European conference on information retrieval (pp. 414-421). Springer, |
|
Berlin, Heidelberg. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.metrics import ndcg_score |
|
>>> # we have ground-truth relevance of some answers to a query: |
|
>>> true_relevance = np.asarray([[10, 0, 0, 1, 5]]) |
|
>>> # we predict some scores (relevance) for the answers |
|
>>> scores = np.asarray([[.1, .2, .3, 4, 70]]) |
|
>>> ndcg_score(true_relevance, scores) |
|
np.float64(0.69...) |
|
>>> scores = np.asarray([[.05, 1.1, 1., .5, .0]]) |
|
>>> ndcg_score(true_relevance, scores) |
|
np.float64(0.49...) |
|
>>> # we can set k to truncate the sum; only top k answers contribute. |
|
>>> ndcg_score(true_relevance, scores, k=4) |
|
np.float64(0.35...) |
|
>>> # the normalization takes k into account so a perfect answer |
|
>>> # would still get 1.0 |
|
>>> ndcg_score(true_relevance, true_relevance, k=4) |
|
np.float64(1.0...) |
|
>>> # now we have some ties in our prediction |
|
>>> scores = np.asarray([[1, 0, 0, 0, 1]]) |
|
>>> # by default ties are averaged, so here we get the average (normalized) |
|
>>> # true relevance of our top predictions: (10 / 10 + 5 / 10) / 2 = .75 |
|
>>> ndcg_score(true_relevance, scores, k=1) |
|
np.float64(0.75...) |
|
>>> # we can choose to ignore ties for faster results, but only |
|
>>> # if we know there aren't ties in our scores, otherwise we get |
|
>>> # wrong results: |
|
>>> ndcg_score(true_relevance, |
|
... scores, k=1, ignore_ties=True) |
|
np.float64(0.5...) |
|
""" |
|
y_true = check_array(y_true, ensure_2d=False) |
|
y_score = check_array(y_score, ensure_2d=False) |
|
check_consistent_length(y_true, y_score, sample_weight) |
|
|
|
if y_true.min() < 0: |
|
raise ValueError("ndcg_score should not be used on negative y_true values.") |
|
if y_true.ndim > 1 and y_true.shape[1] <= 1: |
|
raise ValueError( |
|
"Computing NDCG is only meaningful when there is more than 1 document. " |
|
f"Got {y_true.shape[1]} instead." |
|
) |
|
_check_dcg_target_type(y_true) |
|
gain = _ndcg_sample_scores(y_true, y_score, k=k, ignore_ties=ignore_ties) |
|
return np.average(gain, weights=sample_weight) |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"y_score": ["array-like"], |
|
"k": [Interval(Integral, 1, None, closed="left")], |
|
"normalize": ["boolean"], |
|
"sample_weight": ["array-like", None], |
|
"labels": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def top_k_accuracy_score( |
|
y_true, y_score, *, k=2, normalize=True, sample_weight=None, labels=None |
|
): |
|
"""Top-k Accuracy classification score. |
|
|
|
This metric computes the number of times where the correct label is among |
|
the top `k` labels predicted (ranked by predicted scores). Note that the |
|
multilabel case isn't covered here. |
|
|
|
    Read more in the :ref:`User Guide <top_k_accuracy_score>`.
|
|
|
Parameters |
|
---------- |
|
y_true : array-like of shape (n_samples,) |
|
True labels. |
|
|
|
y_score : array-like of shape (n_samples,) or (n_samples, n_classes) |
|
Target scores. These can be either probability estimates or |
|
non-thresholded decision values (as returned by |
|
:term:`decision_function` on some classifiers). |
|
The binary case expects scores with shape (n_samples,) while the |
|
multiclass case expects scores with shape (n_samples, n_classes). |
|
In the multiclass case, the order of the class scores must |
|
correspond to the order of ``labels``, if provided, or else to |
|
the numerical or lexicographical order of the labels in ``y_true``. |
|
If ``y_true`` does not contain all the labels, ``labels`` must be |
|
provided. |
|
|
|
k : int, default=2 |
|
Number of most likely outcomes considered to find the correct label. |
|
|
|
normalize : bool, default=True |
|
If `True`, return the fraction of correctly classified samples. |
|
Otherwise, return the number of correctly classified samples. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. If `None`, all samples are given the same weight. |
|
|
|
labels : array-like of shape (n_classes,), default=None |
|
Multiclass only. List of labels that index the classes in ``y_score``. |
|
If ``None``, the numerical or lexicographical order of the labels in |
|
``y_true`` is used. If ``y_true`` does not contain all the labels, |
|
``labels`` must be provided. |
|
|
|
Returns |
|
------- |
|
score : float |
|
The top-k accuracy score. The best performance is 1 with |
|
`normalize == True` and the number of samples with |
|
`normalize == False`. |
|
|
|
See Also |
|
-------- |
|
accuracy_score : Compute the accuracy score. By default, the function will |
|
return the fraction of correct predictions divided by the total number |
|
of predictions. |
|
|
|
Notes |
|
----- |
|
In cases where two or more labels are assigned equal predicted scores, |
|
the labels with the highest indices will be chosen first. This might |
|
    impact the result if the correct label is pushed out of the top `k`
    as a consequence.
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.metrics import top_k_accuracy_score |
|
>>> y_true = np.array([0, 1, 2, 2]) |
|
>>> y_score = np.array([[0.5, 0.2, 0.2], # 0 is in top 2 |
|
... [0.3, 0.4, 0.2], # 1 is in top 2 |
|
... [0.2, 0.4, 0.3], # 2 is in top 2 |
|
... [0.7, 0.2, 0.1]]) # 2 isn't in top 2 |
|
>>> top_k_accuracy_score(y_true, y_score, k=2) |
|
np.float64(0.75) |
|
>>> # Not normalizing gives the number of "correctly" classified samples |
|
>>> top_k_accuracy_score(y_true, y_score, k=2, normalize=False) |
|
np.int64(3) |
|
""" |
|
y_true = check_array(y_true, ensure_2d=False, dtype=None) |
|
y_true = column_or_1d(y_true) |
|
y_type = type_of_target(y_true, input_name="y_true") |
|
if y_type == "binary" and labels is not None and len(labels) > 2: |
|
y_type = "multiclass" |
|
if y_type not in {"binary", "multiclass"}: |
|
raise ValueError( |
|
f"y type must be 'binary' or 'multiclass', got '{y_type}' instead." |
|
) |
|
y_score = check_array(y_score, ensure_2d=False) |
|
if y_type == "binary": |
|
if y_score.ndim == 2 and y_score.shape[1] != 1: |
|
raise ValueError( |
|
"`y_true` is binary while y_score is 2d with" |
|
f" {y_score.shape[1]} classes. If `y_true` does not contain all the" |
|
" labels, `labels` must be provided." |
|
) |
|
y_score = column_or_1d(y_score) |
|
|
|
check_consistent_length(y_true, y_score, sample_weight) |
|
y_score_n_classes = y_score.shape[1] if y_score.ndim == 2 else 2 |
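    # A 1d y_score carries only the positive-class score, which implies
    # exactly two classes.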
|
|
|
if labels is None: |
|
classes = _unique(y_true) |
|
n_classes = len(classes) |
|
|
|
if n_classes != y_score_n_classes: |
|
raise ValueError( |
|
f"Number of classes in 'y_true' ({n_classes}) not equal " |
|
f"to the number of classes in 'y_score' ({y_score_n_classes})." |
|
"You can provide a list of all known classes by assigning it " |
|
"to the `labels` parameter." |
|
) |
|
else: |
|
labels = column_or_1d(labels) |
|
classes = _unique(labels) |
|
n_labels = len(labels) |
|
n_classes = len(classes) |
|
|
|
if n_classes != n_labels: |
|
raise ValueError("Parameter 'labels' must be unique.") |
|
|
|
if not np.array_equal(classes, labels): |
|
raise ValueError("Parameter 'labels' must be ordered.") |
|
|
|
if n_classes != y_score_n_classes: |
|
raise ValueError( |
|
f"Number of given labels ({n_classes}) not equal to the " |
|
f"number of classes in 'y_score' ({y_score_n_classes})." |
|
) |
|
|
|
if len(np.setdiff1d(y_true, classes)): |
|
raise ValueError("'y_true' contains labels not in parameter 'labels'.") |
|
|
|
if k >= n_classes: |
|
warnings.warn( |
|
( |
|
f"'k' ({k}) greater than or equal to 'n_classes' ({n_classes}) " |
|
"will result in a perfect score and is therefore meaningless." |
|
), |
|
UndefinedMetricWarning, |
|
) |
|
|
|
y_true_encoded = _encode(y_true, uniques=classes) |
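    # Encode true labels as their positions in `classes` so they can be
    # compared with ranked class indices below.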
|
|
|
if y_type == "binary": |
|
if k == 1: |
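            # Scores that all lie in [0, 1] are treated as probabilities and
            # thresholded at 0.5; otherwise they are treated as
            # decision-function values and thresholded at 0.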
|
threshold = 0.5 if y_score.min() >= 0 and y_score.max() <= 1 else 0 |
|
y_pred = (y_score > threshold).astype(np.int64) |
|
hits = y_pred == y_true_encoded |
|
else: |
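            # With only two classes, any k >= 2 necessarily includes the true
            # label (the perfect-score warning was raised above).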
|
hits = np.ones_like(y_score, dtype=np.bool_) |
|
elif y_type == "multiclass": |
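        # Sort class indices by decreasing score (stable sort, so tied scores
        # favor the higher class index) and check whether the encoded true
        # label appears among the first k columns.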
|
sorted_pred = np.argsort(y_score, axis=1, kind="mergesort")[:, ::-1] |
|
hits = (y_true_encoded == sorted_pred[:, :k].T).any(axis=0) |
|
|
|
if normalize: |
|
return np.average(hits, weights=sample_weight) |
|
elif sample_weight is None: |
|
return np.sum(hits) |
|
else: |
|
return np.dot(hits, sample_weight) |
|
|