|
"""Metrics to assess performance on classification task given class prediction. |
|
|
|
Functions named as ``*_score`` return a scalar value to maximize: the higher |
|
the better. |
|
|
|
Functions named as ``*_error`` or ``*_loss`` return a scalar value to minimize:
|
the lower the better. |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
import warnings |
|
from numbers import Integral, Real |
|
|
|
import numpy as np |
|
from scipy.sparse import coo_matrix, csr_matrix, issparse |
|
from scipy.special import xlogy |
|
|
|
from ..exceptions import UndefinedMetricWarning |
|
from ..preprocessing import LabelBinarizer, LabelEncoder |
|
from ..utils import ( |
|
assert_all_finite, |
|
check_array, |
|
check_consistent_length, |
|
column_or_1d, |
|
) |
|
from ..utils._array_api import ( |
|
_average, |
|
_bincount, |
|
_count_nonzero, |
|
_find_matching_floating_dtype, |
|
_is_numpy_namespace, |
|
_searchsorted, |
|
_setdiff1d, |
|
_tolist, |
|
_union1d, |
|
device, |
|
get_namespace, |
|
get_namespace_and_device, |
|
) |
|
from ..utils._param_validation import ( |
|
Hidden, |
|
Interval, |
|
Options, |
|
StrOptions, |
|
validate_params, |
|
) |
|
from ..utils._unique import attach_unique |
|
from ..utils.extmath import _nanaverage |
|
from ..utils.multiclass import type_of_target, unique_labels |
|
from ..utils.sparsefuncs import count_nonzero |
|
from ..utils.validation import ( |
|
_check_pos_label_consistency, |
|
_check_sample_weight, |
|
_num_samples, |
|
) |
|
|
|
|
|
def _check_zero_division(zero_division): |
|
if isinstance(zero_division, str) and zero_division == "warn": |
|
return np.float64(0.0) |
|
elif isinstance(zero_division, (int, float)) and zero_division in [0, 1]: |
|
return np.float64(zero_division) |
|
else: |
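        # at this point zero_division is np.nan (the only other accepted
        # value), so propagate NaN for ill-defined results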
|
return np.nan |
|
|
|
|
|
def _check_targets(y_true, y_pred): |
|
"""Check that y_true and y_pred belong to the same classification task. |
|
|
|
This converts multiclass or binary types to a common shape, and raises a |
|
ValueError for a mix of multilabel and multiclass targets, a mix of |
|
multilabel formats, for the presence of continuous-valued or multioutput |
|
targets, or for targets of different lengths. |
|
|
|
Column vectors are squeezed to 1d, while multilabel formats are returned |
|
as CSR sparse label indicators. |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like |
|
|
|
y_pred : array-like |
|
|
|
Returns |
|
------- |
|
type_true : one of {'multilabel-indicator', 'multiclass', 'binary'} |
|
The type of the true target data, as output by |
|
``utils.multiclass.type_of_target``. |
|
|
|
y_true : array or indicator matrix |
|
|
|
y_pred : array or indicator matrix |
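
    Examples
    --------
    A minimal sketch of the type resolution performed by this private helper,
    shown for illustration only:

    >>> _check_targets([0, 1, 1], [1, 1, 0])[0]
    'binary'
    >>> _check_targets([0, 1, 2], [1, 1, 0])[0]
    'multiclass'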
|
""" |
|
xp, _ = get_namespace(y_true, y_pred) |
|
check_consistent_length(y_true, y_pred) |
|
type_true = type_of_target(y_true, input_name="y_true") |
|
type_pred = type_of_target(y_pred, input_name="y_pred") |
|
|
|
y_type = {type_true, type_pred} |
|
if y_type == {"binary", "multiclass"}: |
|
y_type = {"multiclass"} |
|
|
|
if len(y_type) > 1: |
|
raise ValueError( |
|
"Classification metrics can't handle a mix of {0} and {1} targets".format( |
|
type_true, type_pred |
|
) |
|
) |
|
|
|
|
|
y_type = y_type.pop() |
|
|
|
|
|
if y_type not in ["binary", "multiclass", "multilabel-indicator"]: |
|
raise ValueError("{0} is not supported".format(y_type)) |
|
|
|
if y_type in ["binary", "multiclass"]: |
|
xp, _ = get_namespace(y_true, y_pred) |
|
y_true = column_or_1d(y_true) |
|
y_pred = column_or_1d(y_pred) |
|
if y_type == "binary": |
|
try: |
|
unique_values = _union1d(y_true, y_pred, xp) |
|
except TypeError as e: |
|
|
|
|
|
|
|
|
|
raise TypeError( |
|
"Labels in y_true and y_pred should be of the same type. " |
|
f"Got y_true={xp.unique(y_true)} and " |
|
f"y_pred={xp.unique(y_pred)}. Make sure that the " |
|
"predictions provided by the classifier coincides with " |
|
"the true labels." |
|
) from e |
|
if unique_values.shape[0] > 2: |
|
y_type = "multiclass" |
|
|
|
if y_type.startswith("multilabel"): |
|
if _is_numpy_namespace(xp): |
|
|
|
|
|
|
|
|
|
y_true = csr_matrix(y_true) |
|
y_pred = csr_matrix(y_pred) |
|
y_type = "multilabel-indicator" |
|
|
|
return y_type, y_true, y_pred |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like", "sparse matrix"], |
|
"y_pred": ["array-like", "sparse matrix"], |
|
"normalize": ["boolean"], |
|
"sample_weight": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None): |
|
"""Accuracy classification score. |
|
|
|
In multilabel classification, this function computes subset accuracy: |
|
the set of labels predicted for a sample must *exactly* match the |
|
corresponding set of labels in y_true. |
|
|
|
Read more in the :ref:`User Guide <accuracy_score>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : 1d array-like, or label indicator array / sparse matrix |
|
Ground truth (correct) labels. |
|
|
|
y_pred : 1d array-like, or label indicator array / sparse matrix |
|
Predicted labels, as returned by a classifier. |
|
|
|
normalize : bool, default=True |
|
If ``False``, return the number of correctly classified samples. |
|
Otherwise, return the fraction of correctly classified samples. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
Returns |
|
------- |
|
    score : float

        If ``normalize == True``, return the fraction of correctly

        classified samples; otherwise, return the number of correctly

        classified samples.
|
|
|
The best performance is 1 with ``normalize == True`` and the number |
|
of samples with ``normalize == False``. |
|
|
|
See Also |
|
-------- |
|
balanced_accuracy_score : Compute the balanced accuracy to deal with |
|
imbalanced datasets. |
|
jaccard_score : Compute the Jaccard similarity coefficient score. |
|
hamming_loss : Compute the average Hamming loss or Hamming distance between |
|
two sets of samples. |
|
zero_one_loss : Compute the Zero-one classification loss. By default, the |
|
        function will return the fraction of imperfectly predicted subsets.
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.metrics import accuracy_score |
|
>>> y_pred = [0, 2, 1, 3] |
|
>>> y_true = [0, 1, 2, 3] |
|
>>> accuracy_score(y_true, y_pred) |
|
0.5 |
|
>>> accuracy_score(y_true, y_pred, normalize=False) |
|
2.0 |
|
|
|
In the multilabel case with binary label indicators: |
|
|
|
>>> import numpy as np |
|
>>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2))) |
|
0.5 |
|
""" |
|
xp, _, device = get_namespace_and_device(y_true, y_pred, sample_weight) |
|
|
|
y_true, y_pred = attach_unique(y_true, y_pred) |
|
y_type, y_true, y_pred = _check_targets(y_true, y_pred) |
|
check_consistent_length(y_true, y_pred, sample_weight) |
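    # In the multilabel case, a sample counts as correct only if every label
    # matches, i.e. the row-wise difference between y_true and y_pred has no
    # nonzero entries.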
|
|
|
if y_type.startswith("multilabel"): |
|
if _is_numpy_namespace(xp): |
|
differing_labels = count_nonzero(y_true - y_pred, axis=1) |
|
else: |
|
differing_labels = _count_nonzero( |
|
y_true - y_pred, xp=xp, device=device, axis=1 |
|
) |
|
score = xp.asarray(differing_labels == 0, device=device) |
|
else: |
|
score = y_true == y_pred |
|
|
|
return float(_average(score, weights=sample_weight, normalize=normalize)) |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"y_pred": ["array-like"], |
|
"labels": ["array-like", None], |
|
"sample_weight": ["array-like", None], |
|
"normalize": [StrOptions({"true", "pred", "all"}), None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def confusion_matrix( |
|
y_true, y_pred, *, labels=None, sample_weight=None, normalize=None |
|
): |
|
"""Compute confusion matrix to evaluate the accuracy of a classification. |
|
|
|
By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}` |
|
is equal to the number of observations known to be in group :math:`i` and |
|
predicted to be in group :math:`j`. |
|
|
|
Thus in binary classification, the count of true negatives is |
|
:math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is |
|
:math:`C_{1,1}` and false positives is :math:`C_{0,1}`. |
|
|
|
Read more in the :ref:`User Guide <confusion_matrix>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like of shape (n_samples,) |
|
Ground truth (correct) target values. |
|
|
|
y_pred : array-like of shape (n_samples,) |
|
Estimated targets as returned by a classifier. |
|
|
|
    labels : array-like of shape (n_classes,), default=None
|
List of labels to index the matrix. This may be used to reorder |
|
or select a subset of labels. |
|
If ``None`` is given, those that appear at least once |
|
in ``y_true`` or ``y_pred`` are used in sorted order. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
.. versionadded:: 0.18 |
|
|
|
normalize : {'true', 'pred', 'all'}, default=None |
|
Normalizes confusion matrix over the true (rows), predicted (columns) |
|
conditions or all the population. If None, confusion matrix will not be |
|
normalized. |
|
|
|
Returns |
|
------- |
|
C : ndarray of shape (n_classes, n_classes) |
|
Confusion matrix whose i-th row and j-th |
|
column entry indicates the number of |
|
samples with true label being i-th class |
|
and predicted label being j-th class. |
|
|
|
See Also |
|
-------- |
|
ConfusionMatrixDisplay.from_estimator : Plot the confusion matrix |
|
given an estimator, the data, and the label. |
|
ConfusionMatrixDisplay.from_predictions : Plot the confusion matrix |
|
given the true and predicted labels. |
|
ConfusionMatrixDisplay : Confusion Matrix visualization. |
|
|
|
References |
|
---------- |
|
.. [1] `Wikipedia entry for the Confusion matrix |
|
<https://en.wikipedia.org/wiki/Confusion_matrix>`_ |
|
(Wikipedia and other references may use a different |
|
convention for axes). |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.metrics import confusion_matrix |
|
>>> y_true = [2, 0, 2, 2, 0, 1] |
|
>>> y_pred = [0, 0, 2, 2, 0, 2] |
|
>>> confusion_matrix(y_true, y_pred) |
|
array([[2, 0, 0], |
|
[0, 0, 1], |
|
[1, 0, 2]]) |
|
|
|
>>> y_true = ["cat", "ant", "cat", "cat", "ant", "bird"] |
|
>>> y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"] |
|
>>> confusion_matrix(y_true, y_pred, labels=["ant", "bird", "cat"]) |
|
array([[2, 0, 0], |
|
[0, 0, 1], |
|
[1, 0, 2]]) |
|
|
|
In the binary case, we can extract true positives, etc. as follows: |
|
|
|
>>> tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel() |
|
>>> (tn, fp, fn, tp) |
|
(np.int64(0), np.int64(2), np.int64(1), np.int64(1)) |
|
""" |
|
y_true, y_pred = attach_unique(y_true, y_pred) |
|
y_type, y_true, y_pred = _check_targets(y_true, y_pred) |
|
if y_type not in ("binary", "multiclass"): |
|
raise ValueError("%s is not supported" % y_type) |
|
|
|
if labels is None: |
|
labels = unique_labels(y_true, y_pred) |
|
else: |
|
labels = np.asarray(labels) |
|
n_labels = labels.size |
|
if n_labels == 0: |
|
raise ValueError("'labels' should contains at least one label.") |
|
elif y_true.size == 0: |
|
return np.zeros((n_labels, n_labels), dtype=int) |
|
elif len(np.intersect1d(y_true, labels)) == 0: |
|
raise ValueError("At least one label specified must be in y_true") |
|
|
|
if sample_weight is None: |
|
sample_weight = np.ones(y_true.shape[0], dtype=np.int64) |
|
else: |
|
sample_weight = np.asarray(sample_weight) |
|
|
|
check_consistent_length(y_true, y_pred, sample_weight) |
|
|
|
n_labels = labels.size |
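    # If the labels are not consecutive integers starting from zero, convert
    # y_true and y_pred to label indices; values absent from `labels` are
    # mapped to the out-of-range index ``n_labels + 1`` and filtered out below.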
|
|
|
|
|
need_index_conversion = not ( |
|
labels.dtype.kind in {"i", "u", "b"} |
|
and np.all(labels == np.arange(n_labels)) |
|
and y_true.min() >= 0 |
|
and y_pred.min() >= 0 |
|
) |
|
if need_index_conversion: |
|
label_to_ind = {y: x for x, y in enumerate(labels)} |
|
y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred]) |
|
y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true]) |
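    # Keep only the samples whose true and predicted labels both fall inside
    # the requested label set; the corresponding weights are dropped as well.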
|
|
|
|
|
ind = np.logical_and(y_pred < n_labels, y_true < n_labels) |
|
if not np.all(ind): |
|
y_pred = y_pred[ind] |
|
y_true = y_true[ind] |
|
|
|
sample_weight = sample_weight[ind] |
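    # Accumulate integer counts when the weights are integral (or absent) and
    # fall back to float accumulation otherwise.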
|
|
|
|
|
if sample_weight.dtype.kind in {"i", "u", "b"}: |
|
dtype = np.int64 |
|
else: |
|
dtype = np.float64 |
|
|
|
cm = coo_matrix( |
|
(sample_weight, (y_true, y_pred)), |
|
shape=(n_labels, n_labels), |
|
dtype=dtype, |
|
).toarray() |
|
|
|
with np.errstate(all="ignore"): |
|
if normalize == "true": |
|
cm = cm / cm.sum(axis=1, keepdims=True) |
|
elif normalize == "pred": |
|
cm = cm / cm.sum(axis=0, keepdims=True) |
|
elif normalize == "all": |
|
cm = cm / cm.sum() |
|
cm = np.nan_to_num(cm) |
|
|
|
if cm.shape == (1, 1): |
|
warnings.warn( |
|
( |
|
"A single label was found in 'y_true' and 'y_pred'. For the confusion " |
|
"matrix to have the correct shape, use the 'labels' parameter to pass " |
|
"all known labels." |
|
), |
|
UserWarning, |
|
) |
|
|
|
return cm |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like", "sparse matrix"], |
|
"y_pred": ["array-like", "sparse matrix"], |
|
"sample_weight": ["array-like", None], |
|
"labels": ["array-like", None], |
|
"samplewise": ["boolean"], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def multilabel_confusion_matrix( |
|
y_true, y_pred, *, sample_weight=None, labels=None, samplewise=False |
|
): |
|
"""Compute a confusion matrix for each class or sample. |
|
|
|
.. versionadded:: 0.21 |
|
|
|
Compute class-wise (default) or sample-wise (samplewise=True) multilabel |
|
confusion matrix to evaluate the accuracy of a classification, and output |
|
confusion matrices for each class or sample. |
|
|
|
In multilabel confusion matrix :math:`MCM`, the count of true negatives |
|
is :math:`MCM_{:,0,0}`, false negatives is :math:`MCM_{:,1,0}`, |
|
true positives is :math:`MCM_{:,1,1}` and false positives is |
|
:math:`MCM_{:,0,1}`. |
|
|
|
Multiclass data will be treated as if binarized under a one-vs-rest |
|
transformation. Returned confusion matrices will be in the order of |
|
sorted unique labels in the union of (y_true, y_pred). |
|
|
|
Read more in the :ref:`User Guide <multilabel_confusion_matrix>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : {array-like, sparse matrix} of shape (n_samples, n_outputs) or \ |
|
(n_samples,) |
|
Ground truth (correct) target values. |
|
|
|
y_pred : {array-like, sparse matrix} of shape (n_samples, n_outputs) or \ |
|
(n_samples,) |
|
Estimated targets as returned by a classifier. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
labels : array-like of shape (n_classes,), default=None |
|
A list of classes or column indices to select some (or to force |
|
inclusion of classes absent from the data). |
|
|
|
samplewise : bool, default=False |
|
In the multilabel case, this calculates a confusion matrix per sample. |
|
|
|
Returns |
|
------- |
|
multi_confusion : ndarray of shape (n_outputs, 2, 2) |
|
A 2x2 confusion matrix corresponding to each output in the input. |
|
When calculating class-wise multi_confusion (default), then |
|
n_outputs = n_labels; when calculating sample-wise multi_confusion |
|
(samplewise=True), n_outputs = n_samples. If ``labels`` is defined, |
|
the results will be returned in the order specified in ``labels``, |
|
otherwise the results will be returned in sorted order by default. |
|
|
|
See Also |
|
-------- |
|
confusion_matrix : Compute confusion matrix to evaluate the accuracy of a |
|
classifier. |
|
|
|
Notes |
|
----- |
|
The `multilabel_confusion_matrix` calculates class-wise or sample-wise |
|
multilabel confusion matrices, and in multiclass tasks, labels are |
|
    binarized in a one-vs-rest way, while
|
:func:`~sklearn.metrics.confusion_matrix` calculates one confusion matrix |
|
for confusion between every two classes. |
|
|
|
Examples |
|
-------- |
|
Multilabel-indicator case: |
|
|
|
>>> import numpy as np |
|
>>> from sklearn.metrics import multilabel_confusion_matrix |
|
>>> y_true = np.array([[1, 0, 1], |
|
... [0, 1, 0]]) |
|
>>> y_pred = np.array([[1, 0, 0], |
|
... [0, 1, 1]]) |
|
>>> multilabel_confusion_matrix(y_true, y_pred) |
|
array([[[1, 0], |
|
[0, 1]], |
|
<BLANKLINE> |
|
[[1, 0], |
|
[0, 1]], |
|
<BLANKLINE> |
|
[[0, 1], |
|
[1, 0]]]) |
|
|
|
Multiclass case: |
|
|
|
>>> y_true = ["cat", "ant", "cat", "cat", "ant", "bird"] |
|
>>> y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"] |
|
>>> multilabel_confusion_matrix(y_true, y_pred, |
|
... labels=["ant", "bird", "cat"]) |
|
array([[[3, 1], |
|
[0, 2]], |
|
<BLANKLINE> |
|
[[5, 0], |
|
[1, 0]], |
|
<BLANKLINE> |
|
[[2, 1], |
|
[1, 2]]]) |
|
""" |
|
y_true, y_pred = attach_unique(y_true, y_pred) |
|
xp, _ = get_namespace(y_true, y_pred) |
|
device_ = device(y_true, y_pred) |
|
y_type, y_true, y_pred = _check_targets(y_true, y_pred) |
|
if sample_weight is not None: |
|
sample_weight = column_or_1d(sample_weight, device=device_) |
|
check_consistent_length(y_true, y_pred, sample_weight) |
|
|
|
if y_type not in ("binary", "multiclass", "multilabel-indicator"): |
|
raise ValueError("%s is not supported" % y_type) |
|
|
|
present_labels = unique_labels(y_true, y_pred) |
|
if labels is None: |
|
labels = present_labels |
|
n_labels = None |
|
else: |
|
labels = xp.asarray(labels, device=device_) |
|
n_labels = labels.shape[0] |
|
labels = xp.concat( |
|
[labels, _setdiff1d(present_labels, labels, assume_unique=True, xp=xp)], |
|
axis=-1, |
|
) |
|
|
|
if y_true.ndim == 1: |
|
if samplewise: |
|
raise ValueError( |
|
"Samplewise metrics are not available outside of " |
|
"multilabel classification." |
|
) |
|
|
|
le = LabelEncoder() |
|
le.fit(labels) |
|
y_true = le.transform(y_true) |
|
y_pred = le.transform(y_pred) |
|
sorted_labels = le.classes_ |
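        # True positives: bin the true labels of the samples whose prediction
        # matches, optionally weighting each sample.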
|
|
|
|
|
tp = y_true == y_pred |
|
tp_bins = y_true[tp] |
|
if sample_weight is not None: |
|
tp_bins_weights = sample_weight[tp] |
|
else: |
|
tp_bins_weights = None |
|
|
|
if tp_bins.shape[0]: |
|
tp_sum = _bincount( |
|
tp_bins, weights=tp_bins_weights, minlength=labels.shape[0], xp=xp |
|
) |
|
else: |
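            # Pathological case: no true positives at all. Initialise every
            # per-class sum to zero; pred_sum and true_sum are recomputed
            # below whenever y_pred / y_true are non-empty.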
|
|
|
true_sum = pred_sum = tp_sum = xp.zeros(labels.shape[0]) |
|
if y_pred.shape[0]: |
|
pred_sum = _bincount( |
|
y_pred, weights=sample_weight, minlength=labels.shape[0], xp=xp |
|
) |
|
if y_true.shape[0]: |
|
true_sum = _bincount( |
|
y_true, weights=sample_weight, minlength=labels.shape[0], xp=xp |
|
) |
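        # Reorder the per-class sums to follow ``labels[:n_labels]`` rather
        # than the encoder's sorted class order.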
|
|
|
|
|
indices = _searchsorted(sorted_labels, labels[:n_labels], xp=xp) |
|
tp_sum = xp.take(tp_sum, indices, axis=0) |
|
true_sum = xp.take(true_sum, indices, axis=0) |
|
pred_sum = xp.take(pred_sum, indices, axis=0) |
|
|
|
else: |
|
sum_axis = 1 if samplewise else 0 |
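        # For multilabel indicators, ``labels`` must be valid column indices;
        # out-of-range or negative values are rejected below.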
|
|
|
|
|
|
|
if labels.shape != present_labels.shape or xp.any( |
|
xp.not_equal(labels, present_labels) |
|
): |
|
if xp.max(labels) > xp.max(present_labels): |
|
raise ValueError( |
|
"All labels must be in [0, n labels) for " |
|
"multilabel targets. " |
|
"Got %d > %d" % (xp.max(labels), xp.max(present_labels)) |
|
) |
|
if xp.min(labels) < 0: |
|
raise ValueError( |
|
"All labels must be in [0, n labels) for " |
|
"multilabel targets. " |
|
"Got %d < 0" % xp.min(labels) |
|
) |
|
|
|
if n_labels is not None: |
|
y_true = y_true[:, labels[:n_labels]] |
|
y_pred = y_pred[:, labels[:n_labels]] |
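        # A true positive requires the indicator to be set in both y_true and
        # y_pred, i.e. a nonzero entry in their element-wise product.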
|
|
|
if issparse(y_true) or issparse(y_pred): |
|
true_and_pred = y_true.multiply(y_pred) |
|
else: |
|
true_and_pred = xp.multiply(y_true, y_pred) |
|
|
|
|
|
tp_sum = _count_nonzero( |
|
true_and_pred, |
|
axis=sum_axis, |
|
sample_weight=sample_weight, |
|
xp=xp, |
|
device=device_, |
|
) |
|
pred_sum = _count_nonzero( |
|
y_pred, |
|
axis=sum_axis, |
|
sample_weight=sample_weight, |
|
xp=xp, |
|
device=device_, |
|
) |
|
true_sum = _count_nonzero( |
|
y_true, |
|
axis=sum_axis, |
|
sample_weight=sample_weight, |
|
xp=xp, |
|
device=device_, |
|
) |
|
|
|
fp = pred_sum - tp_sum |
|
fn = true_sum - tp_sum |
|
tp = tp_sum |
|
|
|
if sample_weight is not None and samplewise: |
|
tp = xp.asarray(tp) |
|
fp = xp.asarray(fp) |
|
fn = xp.asarray(fn) |
|
tn = sample_weight * y_true.shape[1] - tp - fp - fn |
|
elif sample_weight is not None: |
|
tn = xp.sum(sample_weight) - tp - fp - fn |
|
elif samplewise: |
|
tn = y_true.shape[1] - tp - fp - fn |
|
else: |
|
tn = y_true.shape[0] - tp - fp - fn |
|
|
|
return xp.reshape(xp.stack([tn, fp, fn, tp]).T, (-1, 2, 2)) |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y1": ["array-like"], |
|
"y2": ["array-like"], |
|
"labels": ["array-like", None], |
|
"weights": [StrOptions({"linear", "quadratic"}), None], |
|
"sample_weight": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def cohen_kappa_score(y1, y2, *, labels=None, weights=None, sample_weight=None): |
|
r"""Compute Cohen's kappa: a statistic that measures inter-annotator agreement. |
|
|
|
This function computes Cohen's kappa [1]_, a score that expresses the level |
|
of agreement between two annotators on a classification problem. It is |
|
defined as |
|
|
|
.. math:: |
|
\kappa = (p_o - p_e) / (1 - p_e) |
|
|
|
where :math:`p_o` is the empirical probability of agreement on the label |
|
assigned to any sample (the observed agreement ratio), and :math:`p_e` is |
|
the expected agreement when both annotators assign labels randomly. |
|
:math:`p_e` is estimated using a per-annotator empirical prior over the |
|
class labels [2]_. |
|
|
|
Read more in the :ref:`User Guide <cohen_kappa>`. |
|
|
|
Parameters |
|
---------- |
|
y1 : array-like of shape (n_samples,) |
|
Labels assigned by the first annotator. |
|
|
|
y2 : array-like of shape (n_samples,) |
|
Labels assigned by the second annotator. The kappa statistic is |
|
symmetric, so swapping ``y1`` and ``y2`` doesn't change the value. |
|
|
|
labels : array-like of shape (n_classes,), default=None |
|
List of labels to index the matrix. This may be used to select a |
|
subset of labels. If `None`, all labels that appear at least once in |
|
``y1`` or ``y2`` are used. |
|
|
|
weights : {'linear', 'quadratic'}, default=None |
|
Weighting type to calculate the score. `None` means not weighted; |
|
"linear" means linear weighting; "quadratic" means quadratic weighting. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
Returns |
|
------- |
|
kappa : float |
|
The kappa statistic, which is a number between -1 and 1. The maximum |
|
value means complete agreement; zero or lower means chance agreement. |
|
|
|
References |
|
---------- |
|
.. [1] :doi:`J. Cohen (1960). "A coefficient of agreement for nominal scales". |
|
Educational and Psychological Measurement 20(1):37-46. |
|
<10.1177/001316446002000104>` |
|
.. [2] `R. Artstein and M. Poesio (2008). "Inter-coder agreement for |
|
computational linguistics". Computational Linguistics 34(4):555-596 |
|
<https://www.mitpressjournals.org/doi/pdf/10.1162/coli.07-034-R2>`_. |
|
.. [3] `Wikipedia entry for the Cohen's kappa |
|
<https://en.wikipedia.org/wiki/Cohen%27s_kappa>`_. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.metrics import cohen_kappa_score |
|
>>> y1 = ["negative", "positive", "negative", "neutral", "positive"] |
|
>>> y2 = ["negative", "positive", "negative", "neutral", "negative"] |
|
>>> cohen_kappa_score(y1, y2) |
|
np.float64(0.6875) |
|
""" |
|
confusion = confusion_matrix(y1, y2, labels=labels, sample_weight=sample_weight) |
|
n_classes = confusion.shape[0] |
|
sum0 = np.sum(confusion, axis=0) |
|
sum1 = np.sum(confusion, axis=1) |
|
expected = np.outer(sum0, sum1) / np.sum(sum0) |
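    # Disagreement weights: complement of the identity for unweighted kappa,
    # |i - j| for linear weights and (i - j) ** 2 for quadratic weights.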
|
|
|
if weights is None: |
|
w_mat = np.ones([n_classes, n_classes], dtype=int) |
|
w_mat.flat[:: n_classes + 1] = 0 |
|
else: |
|
w_mat = np.zeros([n_classes, n_classes], dtype=int) |
|
w_mat += np.arange(n_classes) |
|
if weights == "linear": |
|
w_mat = np.abs(w_mat - w_mat.T) |
|
else: |
|
w_mat = (w_mat - w_mat.T) ** 2 |
|
|
|
k = np.sum(w_mat * confusion) / np.sum(w_mat * expected) |
|
return 1 - k |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like", "sparse matrix"], |
|
"y_pred": ["array-like", "sparse matrix"], |
|
"labels": ["array-like", None], |
|
"pos_label": [Real, str, "boolean", None], |
|
"average": [ |
|
StrOptions({"micro", "macro", "samples", "weighted", "binary"}), |
|
None, |
|
], |
|
"sample_weight": ["array-like", None], |
|
"zero_division": [ |
|
Options(Real, {0, 1}), |
|
StrOptions({"warn"}), |
|
], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def jaccard_score( |
|
y_true, |
|
y_pred, |
|
*, |
|
labels=None, |
|
pos_label=1, |
|
average="binary", |
|
sample_weight=None, |
|
zero_division="warn", |
|
): |
|
"""Jaccard similarity coefficient score. |
|
|
|
The Jaccard index [1], or Jaccard similarity coefficient, defined as |
|
the size of the intersection divided by the size of the union of two label |
|
    sets, is used to compare the set of predicted labels for a sample to the
|
corresponding set of labels in ``y_true``. |
|
|
|
    Support beyond :term:`binary` targets is achieved by treating :term:`multiclass`
|
and :term:`multilabel` data as a collection of binary problems, one for each |
|
label. For the :term:`binary` case, setting `average='binary'` will return the |
|
Jaccard similarity coefficient for `pos_label`. If `average` is not `'binary'`, |
|
`pos_label` is ignored and scores for both classes are computed, then averaged or |
|
both returned (when `average=None`). Similarly, for :term:`multiclass` and |
|
:term:`multilabel` targets, scores for all `labels` are either returned or |
|
    averaged depending on the `average` parameter. Use `labels` to specify the set of
|
labels to calculate the score for. |
|
|
|
Read more in the :ref:`User Guide <jaccard_similarity_score>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : 1d array-like, or label indicator array / sparse matrix |
|
Ground truth (correct) labels. |
|
|
|
y_pred : 1d array-like, or label indicator array / sparse matrix |
|
Predicted labels, as returned by a classifier. |
|
|
|
labels : array-like of shape (n_classes,), default=None |
|
The set of labels to include when `average != 'binary'`, and their |
|
order if `average is None`. Labels present in the data can be |
|
excluded, for example in multiclass classification to exclude a "negative |
|
class". Labels not present in the data can be included and will be |
|
"assigned" 0 samples. For multilabel targets, labels are column indices. |
|
By default, all labels in `y_true` and `y_pred` are used in sorted order. |
|
|
|
pos_label : int, float, bool or str, default=1 |
|
The class to report if `average='binary'` and the data is binary, |
|
otherwise this parameter is ignored. |
|
For multiclass or multilabel targets, set `labels=[pos_label]` and |
|
`average != 'binary'` to report metrics for one label only. |
|
|
|
average : {'micro', 'macro', 'samples', 'weighted', \ |
|
'binary'} or None, default='binary' |
|
If ``None``, the scores for each class are returned. Otherwise, this |
|
determines the type of averaging performed on the data: |
|
|
|
``'binary'``: |
|
Only report results for the class specified by ``pos_label``. |
|
This is applicable only if targets (``y_{true,pred}``) are binary. |
|
``'micro'``: |
|
Calculate metrics globally by counting the total true positives, |
|
false negatives and false positives. |
|
``'macro'``: |
|
Calculate metrics for each label, and find their unweighted |
|
mean. This does not take label imbalance into account. |
|
``'weighted'``: |
|
Calculate metrics for each label, and find their average, weighted |
|
by support (the number of true instances for each label). This |
|
alters 'macro' to account for label imbalance. |
|
``'samples'``: |
|
Calculate metrics for each instance, and find their average (only |
|
meaningful for multilabel classification). |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
zero_division : "warn", {0.0, 1.0}, default="warn" |
|
        Sets the value to return when there is a zero division, i.e. when

        there are no positive values in predictions and labels. If set to
|
"warn", this acts like 0, but a warning is also raised. |
|
|
|
.. versionadded:: 0.24 |
|
|
|
Returns |
|
------- |
|
score : float or ndarray of shape (n_unique_labels,), dtype=np.float64 |
|
The Jaccard score. When `average` is not `None`, a single scalar is |
|
returned. |
|
|
|
See Also |
|
-------- |
|
accuracy_score : Function for calculating the accuracy score. |
|
f1_score : Function for calculating the F1 score. |
|
multilabel_confusion_matrix : Function for computing a confusion matrix\ |
|
for each class or sample. |
|
|
|
Notes |
|
----- |
|
:func:`jaccard_score` may be a poor metric if there are no |
|
positives for some samples or classes. Jaccard is undefined if there are |
|
no true or predicted labels, and our implementation will return a score |
|
of 0 with a warning. |
|
|
|
References |
|
---------- |
|
.. [1] `Wikipedia entry for the Jaccard index |
|
<https://en.wikipedia.org/wiki/Jaccard_index>`_. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.metrics import jaccard_score |
|
>>> y_true = np.array([[0, 1, 1], |
|
... [1, 1, 0]]) |
|
>>> y_pred = np.array([[1, 1, 1], |
|
... [1, 0, 0]]) |
|
|
|
In the binary case: |
|
|
|
>>> jaccard_score(y_true[0], y_pred[0]) |
|
np.float64(0.6666...) |
|
|
|
In the 2D comparison case (e.g. image similarity): |
|
|
|
>>> jaccard_score(y_true, y_pred, average="micro") |
|
np.float64(0.6) |
|
|
|
In the multilabel case: |
|
|
|
>>> jaccard_score(y_true, y_pred, average='samples') |
|
np.float64(0.5833...) |
|
>>> jaccard_score(y_true, y_pred, average='macro') |
|
np.float64(0.6666...) |
|
>>> jaccard_score(y_true, y_pred, average=None) |
|
array([0.5, 0.5, 1. ]) |
|
|
|
In the multiclass case: |
|
|
|
>>> y_pred = [0, 2, 1, 2] |
|
>>> y_true = [0, 1, 2, 2] |
|
>>> jaccard_score(y_true, y_pred, average=None) |
|
array([1. , 0. , 0.33...]) |
|
""" |
|
labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label) |
|
samplewise = average == "samples" |
|
MCM = multilabel_confusion_matrix( |
|
y_true, |
|
y_pred, |
|
sample_weight=sample_weight, |
|
labels=labels, |
|
samplewise=samplewise, |
|
) |
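    # Jaccard = TP / (TP + FP + FN), taken from each per-label 2x2 matrix.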
|
numerator = MCM[:, 1, 1] |
|
denominator = MCM[:, 1, 1] + MCM[:, 0, 1] + MCM[:, 1, 0] |
|
|
|
if average == "micro": |
|
numerator = np.array([numerator.sum()]) |
|
denominator = np.array([denominator.sum()]) |
|
|
|
jaccard = _prf_divide( |
|
numerator, |
|
denominator, |
|
"jaccard", |
|
"true or predicted", |
|
average, |
|
("jaccard",), |
|
zero_division=zero_division, |
|
) |
|
if average is None: |
|
return jaccard |
|
if average == "weighted": |
|
weights = MCM[:, 1, 0] + MCM[:, 1, 1] |
|
if not np.any(weights): |
|
|
|
weights = None |
|
elif average == "samples" and sample_weight is not None: |
|
weights = sample_weight |
|
else: |
|
weights = None |
|
return np.average(jaccard, weights=weights) |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"y_pred": ["array-like"], |
|
"sample_weight": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def matthews_corrcoef(y_true, y_pred, *, sample_weight=None): |
|
"""Compute the Matthews correlation coefficient (MCC). |
|
|
|
The Matthews correlation coefficient is used in machine learning as a |
|
measure of the quality of binary and multiclass classifications. It takes |
|
into account true and false positives and negatives and is generally |
|
regarded as a balanced measure which can be used even if the classes are of |
|
very different sizes. The MCC is in essence a correlation coefficient value |
|
between -1 and +1. A coefficient of +1 represents a perfect prediction, 0 |
|
an average random prediction and -1 an inverse prediction. The statistic |
|
is also known as the phi coefficient. [source: Wikipedia] |
|
|
|
Binary and multiclass labels are supported. Only in the binary case does |
|
this relate to information about true and false positives and negatives. |
|
See references below. |
|
|
|
Read more in the :ref:`User Guide <matthews_corrcoef>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like of shape (n_samples,) |
|
Ground truth (correct) target values. |
|
|
|
y_pred : array-like of shape (n_samples,) |
|
Estimated targets as returned by a classifier. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
.. versionadded:: 0.18 |
|
|
|
Returns |
|
------- |
|
mcc : float |
|
The Matthews correlation coefficient (+1 represents a perfect |
|
        prediction, 0 an average random prediction and -1 an inverse
|
prediction). |
|
|
|
References |
|
---------- |
|
.. [1] :doi:`Baldi, Brunak, Chauvin, Andersen and Nielsen, (2000). Assessing the |
|
accuracy of prediction algorithms for classification: an overview. |
|
<10.1093/bioinformatics/16.5.412>` |
|
|
|
.. [2] `Wikipedia entry for the Matthews Correlation Coefficient (phi coefficient) |
|
<https://en.wikipedia.org/wiki/Phi_coefficient>`_. |
|
|
|
.. [3] `Gorodkin, (2004). Comparing two K-category assignments by a |
|
K-category correlation coefficient |
|
<https://www.sciencedirect.com/science/article/pii/S1476927104000799>`_. |
|
|
|
.. [4] `Jurman, Riccadonna, Furlanello, (2012). A Comparison of MCC and CEN |
|
Error Measures in MultiClass Prediction |
|
<https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0041882>`_. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.metrics import matthews_corrcoef |
|
>>> y_true = [+1, +1, +1, -1] |
|
>>> y_pred = [+1, -1, +1, +1] |
|
>>> matthews_corrcoef(y_true, y_pred) |
|
np.float64(-0.33...) |
|
""" |
|
y_true, y_pred = attach_unique(y_true, y_pred) |
|
y_type, y_true, y_pred = _check_targets(y_true, y_pred) |
|
check_consistent_length(y_true, y_pred, sample_weight) |
|
if y_type not in {"binary", "multiclass"}: |
|
raise ValueError("%s is not supported" % y_type) |
|
|
|
lb = LabelEncoder() |
|
lb.fit(np.hstack([y_true, y_pred])) |
|
y_true = lb.transform(y_true) |
|
y_pred = lb.transform(y_pred) |
|
|
|
C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) |
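    # Generalised (multiclass) MCC following Gorodkin (2004): a correlation
    # computed from the row sums, column sums and trace of the confusion matrix.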
|
t_sum = C.sum(axis=1, dtype=np.float64) |
|
p_sum = C.sum(axis=0, dtype=np.float64) |
|
n_correct = np.trace(C, dtype=np.float64) |
|
n_samples = p_sum.sum() |
|
cov_ytyp = n_correct * n_samples - np.dot(t_sum, p_sum) |
|
cov_ypyp = n_samples**2 - np.dot(p_sum, p_sum) |
|
cov_ytyt = n_samples**2 - np.dot(t_sum, t_sum) |
|
|
|
if cov_ypyp * cov_ytyt == 0: |
|
return 0.0 |
|
else: |
|
return cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like", "sparse matrix"], |
|
"y_pred": ["array-like", "sparse matrix"], |
|
"normalize": ["boolean"], |
|
"sample_weight": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None): |
|
"""Zero-one classification loss. |
|
|
|
If normalize is ``True``, return the fraction of misclassifications |
|
    (float), else return the number of misclassifications (float). The best
|
performance is 0. |
|
|
|
Read more in the :ref:`User Guide <zero_one_loss>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : 1d array-like, or label indicator array / sparse matrix |
|
Ground truth (correct) labels. |
|
|
|
y_pred : 1d array-like, or label indicator array / sparse matrix |
|
Predicted labels, as returned by a classifier. |
|
|
|
normalize : bool, default=True |
|
If ``False``, return the number of misclassifications. |
|
Otherwise, return the fraction of misclassifications. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
Returns |
|
------- |
|
    loss : float

        If ``normalize == True``, return the fraction of misclassifications

        (float), else return the number of misclassifications (float).
|
|
|
See Also |
|
-------- |
|
accuracy_score : Compute the accuracy score. By default, the function will |
|
        return the number of correct predictions divided by the total number
|
of predictions. |
|
hamming_loss : Compute the average Hamming loss or Hamming distance between |
|
two sets of samples. |
|
jaccard_score : Compute the Jaccard similarity coefficient score. |
|
|
|
Notes |
|
----- |
|
In multilabel classification, the zero_one_loss function corresponds to |
|
the subset zero-one loss: for each sample, the entire set of labels must be |
|
correctly predicted, otherwise the loss for that sample is equal to one. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.metrics import zero_one_loss |
|
>>> y_pred = [1, 2, 3, 4] |
|
>>> y_true = [2, 2, 3, 4] |
|
>>> zero_one_loss(y_true, y_pred) |
|
0.25 |
|
>>> zero_one_loss(y_true, y_pred, normalize=False) |
|
1.0 |
|
|
|
In the multilabel case with binary label indicators: |
|
|
|
>>> import numpy as np |
|
>>> zero_one_loss(np.array([[0, 1], [1, 1]]), np.ones((2, 2))) |
|
0.5 |
|
""" |
|
xp, _ = get_namespace(y_true, y_pred) |
|
score = accuracy_score( |
|
y_true, y_pred, normalize=normalize, sample_weight=sample_weight |
|
) |
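    # The zero-one loss is the complement of accuracy: 1 - accuracy when
    # normalized, otherwise the (possibly weighted) number of samples minus
    # the number of correct predictions.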
|
|
|
if normalize: |
|
return 1 - score |
|
else: |
|
if sample_weight is not None: |
|
n_samples = xp.sum(sample_weight) |
|
else: |
|
n_samples = _num_samples(y_true) |
|
return n_samples - score |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like", "sparse matrix"], |
|
"y_pred": ["array-like", "sparse matrix"], |
|
"labels": ["array-like", None], |
|
"pos_label": [Real, str, "boolean", None], |
|
"average": [ |
|
StrOptions({"micro", "macro", "samples", "weighted", "binary"}), |
|
None, |
|
], |
|
"sample_weight": ["array-like", None], |
|
"zero_division": [ |
|
Options(Real, {0.0, 1.0}), |
|
"nan", |
|
StrOptions({"warn"}), |
|
], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def f1_score( |
|
y_true, |
|
y_pred, |
|
*, |
|
labels=None, |
|
pos_label=1, |
|
average="binary", |
|
sample_weight=None, |
|
zero_division="warn", |
|
): |
|
"""Compute the F1 score, also known as balanced F-score or F-measure. |
|
|
|
The F1 score can be interpreted as a harmonic mean of the precision and |
|
recall, where an F1 score reaches its best value at 1 and worst score at 0. |
|
The relative contribution of precision and recall to the F1 score are |
|
equal. The formula for the F1 score is: |
|
|
|
.. math:: |
|
\\text{F1} = \\frac{2 * \\text{TP}}{2 * \\text{TP} + \\text{FP} + \\text{FN}} |
|
|
|
Where :math:`\\text{TP}` is the number of true positives, :math:`\\text{FN}` is the |
|
number of false negatives, and :math:`\\text{FP}` is the number of false positives. |
|
F1 is by default |
|
calculated as 0.0 when there are no true positives, false negatives, or |
|
false positives. |
|
|
|
Support beyond :term:`binary` targets is achieved by treating :term:`multiclass` |
|
and :term:`multilabel` data as a collection of binary problems, one for each |
|
label. For the :term:`binary` case, setting `average='binary'` will return |
|
    F1 score for `pos_label`. If `average` is not `'binary'`, `pos_label` is ignored

    and F1 scores for both classes are computed, then averaged or both returned (when

    `average=None`). Similarly, for :term:`multiclass` and :term:`multilabel` targets,

    F1 scores for all `labels` are either returned or averaged depending on the

    `average` parameter. Use `labels` to specify the set of labels to calculate F1

    scores for.
|
|
|
Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : 1d array-like, or label indicator array / sparse matrix |
|
Ground truth (correct) target values. |
|
|
|
y_pred : 1d array-like, or label indicator array / sparse matrix |
|
Estimated targets as returned by a classifier. |
|
|
|
labels : array-like, default=None |
|
The set of labels to include when `average != 'binary'`, and their |
|
order if `average is None`. Labels present in the data can be |
|
excluded, for example in multiclass classification to exclude a "negative |
|
class". Labels not present in the data can be included and will be |
|
"assigned" 0 samples. For multilabel targets, labels are column indices. |
|
By default, all labels in `y_true` and `y_pred` are used in sorted order. |
|
|
|
.. versionchanged:: 0.17 |
|
Parameter `labels` improved for multiclass problem. |
|
|
|
pos_label : int, float, bool or str, default=1 |
|
The class to report if `average='binary'` and the data is binary, |
|
otherwise this parameter is ignored. |
|
For multiclass or multilabel targets, set `labels=[pos_label]` and |
|
`average != 'binary'` to report metrics for one label only. |
|
|
|
average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ |
|
default='binary' |
|
This parameter is required for multiclass/multilabel targets. |
|
If ``None``, the metrics for each class are returned. Otherwise, this |
|
determines the type of averaging performed on the data: |
|
|
|
``'binary'``: |
|
Only report results for the class specified by ``pos_label``. |
|
This is applicable only if targets (``y_{true,pred}``) are binary. |
|
``'micro'``: |
|
Calculate metrics globally by counting the total true positives, |
|
false negatives and false positives. |
|
``'macro'``: |
|
Calculate metrics for each label, and find their unweighted |
|
mean. This does not take label imbalance into account. |
|
``'weighted'``: |
|
Calculate metrics for each label, and find their average weighted |
|
by support (the number of true instances for each label). This |
|
alters 'macro' to account for label imbalance; it can result in an |
|
F-score that is not between precision and recall. |
|
``'samples'``: |
|
Calculate metrics for each instance, and find their average (only |
|
meaningful for multilabel classification where this differs from |
|
:func:`accuracy_score`). |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" |
|
Sets the value to return when there is a zero division, i.e. when all |
|
predictions and labels are negative. |
|
|
|
Notes: |
|
- If set to "warn", this acts like 0, but a warning is also raised. |
|
- If set to `np.nan`, such values will be excluded from the average. |
|
|
|
.. versionadded:: 1.3 |
|
`np.nan` option was added. |
|
|
|
Returns |
|
------- |
|
f1_score : float or array of float, shape = [n_unique_labels] |
|
F1 score of the positive class in binary classification or weighted |
|
average of the F1 scores of each class for the multiclass task. |
|
|
|
See Also |
|
-------- |
|
fbeta_score : Compute the F-beta score. |
|
precision_recall_fscore_support : Compute the precision, recall, F-score, |
|
and support. |
|
jaccard_score : Compute the Jaccard similarity coefficient score. |
|
multilabel_confusion_matrix : Compute a confusion matrix for each class or |
|
sample. |
|
|
|
Notes |
|
----- |
|
When ``true positive + false positive + false negative == 0`` (i.e. a class |
|
    is completely absent from both ``y_true`` and ``y_pred``), f-score is
|
undefined. In such cases, by default f-score will be set to 0.0, and |
|
``UndefinedMetricWarning`` will be raised. This behavior can be modified by |
|
setting the ``zero_division`` parameter. |
|
|
|
References |
|
---------- |
|
.. [1] `Wikipedia entry for the F1-score |
|
<https://en.wikipedia.org/wiki/F1_score>`_. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.metrics import f1_score |
|
>>> y_true = [0, 1, 2, 0, 1, 2] |
|
>>> y_pred = [0, 2, 1, 0, 0, 1] |
|
>>> f1_score(y_true, y_pred, average='macro') |
|
0.26... |
|
>>> f1_score(y_true, y_pred, average='micro') |
|
0.33... |
|
>>> f1_score(y_true, y_pred, average='weighted') |
|
0.26... |
|
>>> f1_score(y_true, y_pred, average=None) |
|
array([0.8, 0. , 0. ]) |
|
|
|
>>> # binary classification |
|
>>> y_true_empty = [0, 0, 0, 0, 0, 0] |
|
>>> y_pred_empty = [0, 0, 0, 0, 0, 0] |
|
>>> f1_score(y_true_empty, y_pred_empty) |
|
0.0... |
|
>>> f1_score(y_true_empty, y_pred_empty, zero_division=1.0) |
|
1.0... |
|
>>> f1_score(y_true_empty, y_pred_empty, zero_division=np.nan) |
|
nan... |
|
|
|
>>> # multilabel classification |
|
>>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]] |
|
>>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]] |
|
>>> f1_score(y_true, y_pred, average=None) |
|
array([0.66666667, 1. , 0.66666667]) |
|
""" |
|
return fbeta_score( |
|
y_true, |
|
y_pred, |
|
beta=1, |
|
labels=labels, |
|
pos_label=pos_label, |
|
average=average, |
|
sample_weight=sample_weight, |
|
zero_division=zero_division, |
|
) |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like", "sparse matrix"], |
|
"y_pred": ["array-like", "sparse matrix"], |
|
"beta": [Interval(Real, 0.0, None, closed="both")], |
|
"labels": ["array-like", None], |
|
"pos_label": [Real, str, "boolean", None], |
|
"average": [ |
|
StrOptions({"micro", "macro", "samples", "weighted", "binary"}), |
|
None, |
|
], |
|
"sample_weight": ["array-like", None], |
|
"zero_division": [ |
|
Options(Real, {0.0, 1.0}), |
|
"nan", |
|
StrOptions({"warn"}), |
|
], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def fbeta_score( |
|
y_true, |
|
y_pred, |
|
*, |
|
beta, |
|
labels=None, |
|
pos_label=1, |
|
average="binary", |
|
sample_weight=None, |
|
zero_division="warn", |
|
): |
|
"""Compute the F-beta score. |
|
|
|
The F-beta score is the weighted harmonic mean of precision and recall, |
|
reaching its optimal value at 1 and its worst value at 0. |
|
|
|
The `beta` parameter represents the ratio of recall importance to |
|
precision importance. `beta > 1` gives more weight to recall, while |
|
`beta < 1` favors precision. For example, `beta = 2` makes recall twice |
|
as important as precision, while `beta = 0.5` does the opposite. |
|
Asymptotically, `beta -> +inf` considers only recall, and `beta -> 0` |
|
only precision. |
|
|
|
The formula for F-beta score is: |
|
|
|
.. math:: |
|
|
|
F_\\beta = \\frac{(1 + \\beta^2) \\text{tp}} |
|
{(1 + \\beta^2) \\text{tp} + \\text{fp} + \\beta^2 \\text{fn}} |
|
|
|
Where :math:`\\text{tp}` is the number of true positives, :math:`\\text{fp}` is the |
|
number of false positives, and :math:`\\text{fn}` is the number of false negatives. |
|
|
|
    Support beyond :term:`binary` targets is achieved by treating :term:`multiclass`
|
and :term:`multilabel` data as a collection of binary problems, one for each |
|
label. For the :term:`binary` case, setting `average='binary'` will return |
|
F-beta score for `pos_label`. If `average` is not `'binary'`, `pos_label` is |
|
    ignored and F-beta scores for both classes are computed, then averaged or both

    returned (when `average=None`). Similarly, for :term:`multiclass` and

    :term:`multilabel` targets, F-beta scores for all `labels` are either returned or

    averaged depending on the `average` parameter. Use `labels` to specify the set of

    labels to calculate F-beta scores for.
|
|
|
Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : 1d array-like, or label indicator array / sparse matrix |
|
Ground truth (correct) target values. |
|
|
|
y_pred : 1d array-like, or label indicator array / sparse matrix |
|
Estimated targets as returned by a classifier. |
|
|
|
beta : float |
|
Determines the weight of recall in the combined score. |
|
|
|
labels : array-like, default=None |
|
The set of labels to include when `average != 'binary'`, and their |
|
order if `average is None`. Labels present in the data can be |
|
excluded, for example in multiclass classification to exclude a "negative |
|
class". Labels not present in the data can be included and will be |
|
"assigned" 0 samples. For multilabel targets, labels are column indices. |
|
By default, all labels in `y_true` and `y_pred` are used in sorted order. |
|
|
|
.. versionchanged:: 0.17 |
|
Parameter `labels` improved for multiclass problem. |
|
|
|
pos_label : int, float, bool or str, default=1 |
|
The class to report if `average='binary'` and the data is binary, |
|
otherwise this parameter is ignored. |
|
For multiclass or multilabel targets, set `labels=[pos_label]` and |
|
`average != 'binary'` to report metrics for one label only. |
|
|
|
average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ |
|
default='binary' |
|
This parameter is required for multiclass/multilabel targets. |
|
If ``None``, the metrics for each class are returned. Otherwise, this |
|
determines the type of averaging performed on the data: |
|
|
|
``'binary'``: |
|
Only report results for the class specified by ``pos_label``. |
|
This is applicable only if targets (``y_{true,pred}``) are binary. |
|
``'micro'``: |
|
Calculate metrics globally by counting the total true positives, |
|
false negatives and false positives. |
|
``'macro'``: |
|
Calculate metrics for each label, and find their unweighted |
|
mean. This does not take label imbalance into account. |
|
``'weighted'``: |
|
Calculate metrics for each label, and find their average weighted |
|
by support (the number of true instances for each label). This |
|
alters 'macro' to account for label imbalance; it can result in an |
|
F-score that is not between precision and recall. |
|
``'samples'``: |
|
Calculate metrics for each instance, and find their average (only |
|
meaningful for multilabel classification where this differs from |
|
:func:`accuracy_score`). |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" |
|
Sets the value to return when there is a zero division, i.e. when all |
|
predictions and labels are negative. |
|
|
|
Notes: |
|
|
|
- If set to "warn", this acts like 0, but a warning is also raised. |
|
- If set to `np.nan`, such values will be excluded from the average. |
|
|
|
.. versionadded:: 1.3 |
|
`np.nan` option was added. |
|
|
|
Returns |
|
------- |
|
fbeta_score : float (if average is not None) or array of float, shape =\ |
|
[n_unique_labels] |
|
F-beta score of the positive class in binary classification or weighted |
|
average of the F-beta score of each class for the multiclass task. |
|
|
|
See Also |
|
-------- |
|
precision_recall_fscore_support : Compute the precision, recall, F-score, |
|
and support. |
|
multilabel_confusion_matrix : Compute a confusion matrix for each class or |
|
sample. |
|
|
|
Notes |
|
----- |
|
When ``true positive + false positive + false negative == 0``, f-score |
|
returns 0.0 and raises ``UndefinedMetricWarning``. This behavior can be |
|
modified by setting ``zero_division``. |
|
|
|
References |
|
---------- |
|
.. [1] R. Baeza-Yates and B. Ribeiro-Neto (2011). |
|
Modern Information Retrieval. Addison Wesley, pp. 327-328. |
|
|
|
.. [2] `Wikipedia entry for the F1-score |
|
<https://en.wikipedia.org/wiki/F1_score>`_. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.metrics import fbeta_score |
|
>>> y_true = [0, 1, 2, 0, 1, 2] |
|
>>> y_pred = [0, 2, 1, 0, 0, 1] |
|
>>> fbeta_score(y_true, y_pred, average='macro', beta=0.5) |
|
0.23... |
|
>>> fbeta_score(y_true, y_pred, average='micro', beta=0.5) |
|
0.33... |
|
>>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) |
|
0.23... |
|
>>> fbeta_score(y_true, y_pred, average=None, beta=0.5) |
|
array([0.71..., 0. , 0. ]) |
|
>>> y_pred_empty = [0, 0, 0, 0, 0, 0] |
|
>>> fbeta_score(y_true, y_pred_empty, |
|
... average="macro", zero_division=np.nan, beta=0.5) |
|
0.12... |
|
""" |
|
|
|
_, _, f, _ = precision_recall_fscore_support( |
|
y_true, |
|
y_pred, |
|
beta=beta, |
|
labels=labels, |
|
pos_label=pos_label, |
|
average=average, |
|
warn_for=("f-score",), |
|
sample_weight=sample_weight, |
|
zero_division=zero_division, |
|
) |
|
return f |
|
|
|
|
|
def _prf_divide( |
|
numerator, denominator, metric, modifier, average, warn_for, zero_division="warn" |
|
): |
|
"""Performs division and handles divide-by-zero. |
|
|
|
On zero-division, sets the corresponding result elements equal to |
|
    0, 1 or np.nan (according to ``zero_division``). Plus, if

    ``zero_division == "warn"`` and the metric is listed in ``warn_for``,

    an ``UndefinedMetricWarning`` is raised.
|
|
|
The metric, modifier and average arguments are used only for determining |
|
an appropriate warning. |
|
""" |
|
xp, _ = get_namespace(numerator, denominator) |
|
dtype_float = _find_matching_floating_dtype(numerator, denominator, xp=xp) |
|
mask = denominator == 0 |
|
denominator = xp.asarray(denominator, copy=True, dtype=dtype_float) |
|
denominator[mask] = 1 |
|
result = xp.asarray(numerator, dtype=dtype_float) / denominator |
|
|
|
if not xp.any(mask): |
|
return result |
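    # Replace the ill-defined entries with the value requested through
    # ``zero_division`` (0.0, 1.0 or np.nan).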
|
|
|
|
|
zero_division_value = _check_zero_division(zero_division) |
|
result[mask] = zero_division_value |
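    # Only warn when the user kept the default ``zero_division="warn"`` and
    # the current metric was requested in ``warn_for``.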
|
|
|
|
|
|
|
|
|
if zero_division != "warn" or metric not in warn_for: |
|
return result |
|
|
|
|
|
if metric in warn_for: |
|
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) |
|
|
|
return result |
|
|
|
|
|
def _warn_prf(average, modifier, msg_start, result_size): |
|
axis0, axis1 = "sample", "label" |
|
if average == "samples": |
|
axis0, axis1 = axis1, axis0 |
|
msg = ( |
|
"{0} ill-defined and being set to 0.0 {{0}} " |
|
"no {1} {2}s. Use `zero_division` parameter to control" |
|
" this behavior.".format(msg_start, modifier, axis0) |
|
) |
|
if result_size == 1: |
|
msg = msg.format("due to") |
|
else: |
|
msg = msg.format("in {0}s with".format(axis1)) |
|
warnings.warn(msg, UndefinedMetricWarning, stacklevel=2) |
|
|
|
|
|
def _check_set_wise_labels(y_true, y_pred, average, labels, pos_label): |
|
"""Validation associated with set-wise metrics. |
|
|
|
Returns identified labels. |
|
""" |
|
average_options = (None, "micro", "macro", "weighted", "samples") |
|
if average not in average_options and average != "binary": |
|
raise ValueError("average has to be one of " + str(average_options)) |
|
|
|
y_true, y_pred = attach_unique(y_true, y_pred) |
|
y_type, y_true, y_pred = _check_targets(y_true, y_pred) |
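    # Materialise the unique labels as a plain Python list so that membership
    # tests such as ``pos_label not in present_labels`` behave consistently
    # across array namespaces and dtypes.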
|
|
|
|
|
present_labels = _tolist(unique_labels(y_true, y_pred)) |
|
if average == "binary": |
|
if y_type == "binary": |
|
if pos_label not in present_labels: |
|
if len(present_labels) >= 2: |
|
raise ValueError( |
|
f"pos_label={pos_label} is not a valid label. It " |
|
f"should be one of {present_labels}" |
|
) |
|
labels = [pos_label] |
|
else: |
|
average_options = list(average_options) |
|
if y_type == "multiclass": |
|
average_options.remove("samples") |
|
raise ValueError( |
|
"Target is %s but average='binary'. Please " |
|
"choose another average setting, one of %r." % (y_type, average_options) |
|
) |
|
elif pos_label not in (None, 1): |
|
warnings.warn( |
|
"Note that pos_label (set to %r) is ignored when " |
|
"average != 'binary' (got %r). You may use " |
|
"labels=[pos_label] to specify a single positive class." |
|
% (pos_label, average), |
|
UserWarning, |
|
) |
|
return labels |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like", "sparse matrix"], |
|
"y_pred": ["array-like", "sparse matrix"], |
|
"beta": [Interval(Real, 0.0, None, closed="both")], |
|
"labels": ["array-like", None], |
|
"pos_label": [Real, str, "boolean", None], |
|
"average": [ |
|
StrOptions({"micro", "macro", "samples", "weighted", "binary"}), |
|
None, |
|
], |
|
"warn_for": [list, tuple, set], |
|
"sample_weight": ["array-like", None], |
|
"zero_division": [ |
|
Options(Real, {0.0, 1.0}), |
|
"nan", |
|
StrOptions({"warn"}), |
|
], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def precision_recall_fscore_support( |
|
y_true, |
|
y_pred, |
|
*, |
|
beta=1.0, |
|
labels=None, |
|
pos_label=1, |
|
average=None, |
|
warn_for=("precision", "recall", "f-score"), |
|
sample_weight=None, |
|
zero_division="warn", |
|
): |
|
"""Compute precision, recall, F-measure and support for each class. |
|
|
|
The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of |
|
true positives and ``fp`` the number of false positives. The precision is |
|
intuitively the ability of the classifier not to label a negative sample as |
|
positive. |
|
|
|
The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of |
|
true positives and ``fn`` the number of false negatives. The recall is |
|
intuitively the ability of the classifier to find all the positive samples. |
|
|
|
The F-beta score can be interpreted as a weighted harmonic mean of |
|
the precision and recall, where an F-beta score reaches its best |
|
value at 1 and worst score at 0. |
|
|
|
The F-beta score weights recall more than precision by a factor of |
|
``beta``. ``beta == 1.0`` means recall and precision are equally important. |
|
|
|
The support is the number of occurrences of each class in ``y_true``. |
|
|
|
    Support beyond :term:`binary` targets is achieved by treating :term:`multiclass`
|
and :term:`multilabel` data as a collection of binary problems, one for each |
|
label. For the :term:`binary` case, setting `average='binary'` will return |
|
metrics for `pos_label`. If `average` is not `'binary'`, `pos_label` is ignored |
|
and metrics for both classes are computed, then averaged or both returned (when |
|
`average=None`). Similarly, for :term:`multiclass` and :term:`multilabel` targets, |
|
metrics for all `labels` are either returned or averaged depending on the `average` |
|
    parameter. Use `labels` to specify the set of labels to calculate metrics for.
|
|
|
Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : 1d array-like, or label indicator array / sparse matrix |
|
Ground truth (correct) target values. |
|
|
|
y_pred : 1d array-like, or label indicator array / sparse matrix |
|
Estimated targets as returned by a classifier. |
|
|
|
beta : float, default=1.0 |
|
The strength of recall versus precision in the F-score. |
|
|
|
labels : array-like, default=None |
|
The set of labels to include when `average != 'binary'`, and their |
|
order if `average is None`. Labels present in the data can be |
|
excluded, for example in multiclass classification to exclude a "negative |
|
class". Labels not present in the data can be included and will be |
|
"assigned" 0 samples. For multilabel targets, labels are column indices. |
|
By default, all labels in `y_true` and `y_pred` are used in sorted order. |
|
|
|
.. versionchanged:: 0.17 |
|
Parameter `labels` improved for multiclass problem. |
|
|
|
pos_label : int, float, bool or str, default=1 |
|
The class to report if `average='binary'` and the data is binary, |
|
otherwise this parameter is ignored. |
|
For multiclass or multilabel targets, set `labels=[pos_label]` and |
|
`average != 'binary'` to report metrics for one label only. |
|
|
|
average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ |
|
default='binary' |
|
This parameter is required for multiclass/multilabel targets. |
|
If ``None``, the metrics for each class are returned. Otherwise, this |
|
determines the type of averaging performed on the data: |
|
|
|
``'binary'``: |
|
Only report results for the class specified by ``pos_label``. |
|
This is applicable only if targets (``y_{true,pred}``) are binary. |
|
``'micro'``: |
|
Calculate metrics globally by counting the total true positives, |
|
false negatives and false positives. |
|
``'macro'``: |
|
Calculate metrics for each label, and find their unweighted |
|
mean. This does not take label imbalance into account. |
|
``'weighted'``: |
|
Calculate metrics for each label, and find their average weighted |
|
by support (the number of true instances for each label). This |
|
alters 'macro' to account for label imbalance; it can result in an |
|
F-score that is not between precision and recall. |
|
``'samples'``: |
|
Calculate metrics for each instance, and find their average (only |
|
meaningful for multilabel classification where this differs from |
|
:func:`accuracy_score`). |
|
|
|
warn_for : list, tuple or set, for internal use |
|
This determines which warnings will be made in the case that this |
|
function is being used to return only one of its metrics. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" |
|
Sets the value to return when there is a zero division: |
|
|
|
- recall: when there are no positive labels |
|
- precision: when there are no positive predictions |
|
- f-score: both |
|
|
|
Notes: |
|
|
|
- If set to "warn", this acts like 0, but a warning is also raised. |
|
- If set to `np.nan`, such values will be excluded from the average. |
|
|
|
.. versionadded:: 1.3 |
|
`np.nan` option was added. |
|
|
|
Returns |
|
------- |
|
precision : float (if average is not None) or array of float, shape =\ |
|
[n_unique_labels] |
|
Precision score. |
|
|
|
recall : float (if average is not None) or array of float, shape =\ |
|
[n_unique_labels] |
|
Recall score. |
|
|
|
fbeta_score : float (if average is not None) or array of float, shape =\ |
|
[n_unique_labels] |
|
F-beta score. |
|
|
|
support : None (if average is not None) or array of int, shape =\ |
|
[n_unique_labels] |
|
The number of occurrences of each label in ``y_true``. |
|
|
|
Notes |
|
----- |
|
When ``true positive + false positive == 0``, precision is undefined. |
|
When ``true positive + false negative == 0``, recall is undefined. When |
|
``true positive + false negative + false positive == 0``, f-score is |
|
undefined. In such cases, by default the metric will be set to 0, and |
|
``UndefinedMetricWarning`` will be raised. This behavior can be modified |
|
with ``zero_division``. |
|
|
|
References |
|
---------- |
|
.. [1] `Wikipedia entry for the Precision and recall |
|
<https://en.wikipedia.org/wiki/Precision_and_recall>`_. |
|
|
|
.. [2] `Wikipedia entry for the F1-score |
|
<https://en.wikipedia.org/wiki/F1_score>`_. |
|
|
|
.. [3] `Discriminative Methods for Multi-labeled Classification Advances |
|
in Knowledge Discovery and Data Mining (2004), pp. 22-30 by Shantanu |
|
Godbole, Sunita Sarawagi |
|
<http://www.godbole.net/shantanu/pubs/multilabelsvm-pakdd04.pdf>`_. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.metrics import precision_recall_fscore_support |
|
>>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig']) |
|
>>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog']) |
|
>>> precision_recall_fscore_support(y_true, y_pred, average='macro') |
|
(0.22..., 0.33..., 0.26..., None) |
|
>>> precision_recall_fscore_support(y_true, y_pred, average='micro') |
|
(0.33..., 0.33..., 0.33..., None) |
|
>>> precision_recall_fscore_support(y_true, y_pred, average='weighted') |
|
(0.22..., 0.33..., 0.26..., None) |
|
|
|
It is possible to compute per-label precisions, recalls, F1-scores and |
|
supports instead of averaging: |
|
|
|
>>> precision_recall_fscore_support(y_true, y_pred, average=None, |
|
... labels=['pig', 'dog', 'cat']) |
|
(array([0. , 0. , 0.66...]), |
|
array([0., 0., 1.]), array([0. , 0. , 0.8]), |
|
array([2, 2, 2])) |
|
""" |
|
_check_zero_division(zero_division) |
|
labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label) |
|
|
|
|
|
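    # Gather per-label sufficient statistics from the per-label 2x2 confusion
    # matrices: true positives, predicted positives (tp + fp) and actual
    # positives (tp + fn).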
samplewise = average == "samples" |
|
MCM = multilabel_confusion_matrix( |
|
y_true, |
|
y_pred, |
|
sample_weight=sample_weight, |
|
labels=labels, |
|
samplewise=samplewise, |
|
) |
|
tp_sum = MCM[:, 1, 1] |
|
pred_sum = tp_sum + MCM[:, 0, 1] |
|
true_sum = tp_sum + MCM[:, 1, 0] |
|
|
|
xp, _ = get_namespace(y_true, y_pred) |
|
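    # For micro averaging, pool the counts over all labels so that a single
    # global precision/recall/F-score is computed.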
if average == "micro": |
|
tp_sum = xp.reshape(xp.sum(tp_sum), (1,)) |
|
pred_sum = xp.reshape(xp.sum(pred_sum), (1,)) |
|
true_sum = xp.reshape(xp.sum(true_sum), (1,)) |
|
|
|
|
|
beta2 = beta**2 |
|
|
|
|
|
|
|
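    # Divide, and on zero division set the affected scores according to
    # ``zero_division`` (warning, if requested, only for the metrics listed
    # in ``warn_for``).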
precision = _prf_divide( |
|
tp_sum, pred_sum, "precision", "predicted", average, warn_for, zero_division |
|
) |
|
recall = _prf_divide( |
|
tp_sum, true_sum, "recall", "true", average, warn_for, zero_division |
|
) |
|
|
|
if np.isposinf(beta): |
|
f_score = recall |
|
elif beta == 0: |
|
f_score = precision |
|
else: |
|
|
|
|
|
|
|
|
|
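        # Written in terms of confusion-matrix counts,
        #   F_beta = (1 + beta**2) * tp / ((1 + beta**2) * tp + beta**2 * fn + fp)
        # hence the denominator beta**2 * true_sum + pred_sum below.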
denom = beta2 * true_sum + pred_sum |
|
f_score = _prf_divide( |
|
(1 + beta2) * tp_sum, |
|
denom, |
|
"f-score", |
|
"true nor predicted", |
|
average, |
|
warn_for, |
|
zero_division, |
|
) |
|
|
|
|
|
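    # Average the per-label results: weight by support for 'weighted', by
    # sample_weight for 'samples', and uniformly otherwise.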
if average == "weighted": |
|
weights = true_sum |
|
elif average == "samples": |
|
weights = sample_weight |
|
else: |
|
weights = None |
|
|
|
if average is not None: |
|
assert average != "binary" or precision.shape[0] == 1 |
|
precision = float(_nanaverage(precision, weights=weights)) |
|
recall = float(_nanaverage(recall, weights=weights)) |
|
f_score = float(_nanaverage(f_score, weights=weights)) |
|
true_sum = None |
|
|
|
return precision, recall, f_score, true_sum |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like", "sparse matrix"], |
|
"y_pred": ["array-like", "sparse matrix"], |
|
"labels": ["array-like", None], |
|
"sample_weight": ["array-like", None], |
|
"raise_warning": ["boolean"], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def class_likelihood_ratios( |
|
y_true, |
|
y_pred, |
|
*, |
|
labels=None, |
|
sample_weight=None, |
|
raise_warning=True, |
|
): |
|
"""Compute binary classification positive and negative likelihood ratios. |
|
|
|
The positive likelihood ratio is `LR+ = sensitivity / (1 - specificity)` |
|
where the sensitivity or recall is the ratio `tp / (tp + fn)` and the |
|
specificity is `tn / (tn + fp)`. The negative likelihood ratio is `LR- = (1 |
|
- sensitivity) / specificity`. Here `tp` is the number of true positives, |
|
`fp` the number of false positives, `tn` is the number of true negatives and |
|
`fn` the number of false negatives. Both class likelihood ratios can be used |
|
to obtain post-test probabilities given a pre-test probability. |
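
    For instance, for a pre-test probability `p` (pre-test odds
    `O = p / (1 - p)`), the post-test odds are `LR+ * O` after a positive
    prediction and `LR- * O` after a negative one.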
|
|
|
`LR+` ranges from 1 to infinity. A `LR+` of 1 indicates that the probability |
|
of predicting the positive class is the same for samples belonging to either |
|
class; therefore, the test is useless. The greater `LR+` is, the more a |
|
positive prediction is likely to be a true positive when compared with the |
|
pre-test probability. A value of `LR+` lower than 1 is invalid as it would |
|
indicate that the odds of a sample being a true positive decrease with |
|
respect to the pre-test odds. |
|
|
|
`LR-` ranges from 0 to 1. The closer it is to 0, the lower the probability |
|
    that a given sample is a false negative. A `LR-` of 1 means the test is
|
useless because the odds of having the condition did not change after the |
|
test. A value of `LR-` greater than 1 invalidates the classifier as it |
|
indicates an increase in the odds of a sample belonging to the positive |
|
class after being classified as negative. This is the case when the |
|
classifier systematically predicts the opposite of the true label. |
|
|
|
A typical application in medicine is to identify the positive/negative class |
|
    with the presence/absence of a disease, respectively; the classifier being a
|
diagnostic test; the pre-test probability of an individual having the |
|
disease can be the prevalence of such disease (proportion of a particular |
|
population found to be affected by a medical condition); and the post-test |
|
probabilities would be the probability that the condition is truly present |
|
given a positive test result. |
|
|
|
Read more in the :ref:`User Guide <class_likelihood_ratios>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : 1d array-like, or label indicator array / sparse matrix |
|
Ground truth (correct) target values. |
|
|
|
y_pred : 1d array-like, or label indicator array / sparse matrix |
|
Estimated targets as returned by a classifier. |
|
|
|
labels : array-like, default=None |
|
List of labels to index the matrix. This may be used to select the |
|
positive and negative classes with the ordering `labels=[negative_class, |
|
positive_class]`. If `None` is given, those that appear at least once in |
|
`y_true` or `y_pred` are used in sorted order. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
raise_warning : bool, default=True |
|
Whether or not a case-specific warning message is raised when there is a |
|
        zero division. Even if the warning is not raised, the function will return
|
nan in such cases. |
|
|
|
Returns |
|
------- |
|
(positive_likelihood_ratio, negative_likelihood_ratio) : tuple |
|
        A tuple of two floats, the first containing the positive likelihood ratio

        and the second the negative likelihood ratio.
|
|
|
Warns |
|
----- |
|
When `false positive == 0`, the positive likelihood ratio is undefined. |
|
When `true negative == 0`, the negative likelihood ratio is undefined. |
|
When `true positive + false negative == 0` both ratios are undefined. |
|
In such cases, `UserWarning` will be raised if raise_warning=True. |
|
|
|
References |
|
---------- |
|
.. [1] `Wikipedia entry for the Likelihood ratios in diagnostic testing |
|
<https://en.wikipedia.org/wiki/Likelihood_ratios_in_diagnostic_testing>`_. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.metrics import class_likelihood_ratios |
|
>>> class_likelihood_ratios([0, 1, 0, 1, 0], [1, 1, 0, 0, 0]) |
|
(np.float64(1.5), np.float64(0.75)) |
|
>>> y_true = np.array(["non-cat", "cat", "non-cat", "cat", "non-cat"]) |
|
>>> y_pred = np.array(["cat", "cat", "non-cat", "non-cat", "non-cat"]) |
|
>>> class_likelihood_ratios(y_true, y_pred) |
|
(np.float64(1.33...), np.float64(0.66...)) |
|
>>> y_true = np.array(["non-zebra", "zebra", "non-zebra", "zebra", "non-zebra"]) |
|
>>> y_pred = np.array(["zebra", "zebra", "non-zebra", "non-zebra", "non-zebra"]) |
|
>>> class_likelihood_ratios(y_true, y_pred) |
|
(np.float64(1.5), np.float64(0.75)) |
|
|
|
To avoid ambiguities, use the notation `labels=[negative_class, |
|
positive_class]` |
|
|
|
>>> y_true = np.array(["non-cat", "cat", "non-cat", "cat", "non-cat"]) |
|
>>> y_pred = np.array(["cat", "cat", "non-cat", "non-cat", "non-cat"]) |
|
>>> class_likelihood_ratios(y_true, y_pred, labels=["non-cat", "cat"]) |
|
(np.float64(1.5), np.float64(0.75)) |
|
""" |
|
y_true, y_pred = attach_unique(y_true, y_pred) |
|
y_type, y_true, y_pred = _check_targets(y_true, y_pred) |
|
if y_type != "binary": |
|
raise ValueError( |
|
"class_likelihood_ratios only supports binary classification " |
|
f"problems, got targets of type: {y_type}" |
|
) |
|
|
|
cm = confusion_matrix( |
|
y_true, |
|
y_pred, |
|
sample_weight=sample_weight, |
|
labels=labels, |
|
) |
|
|
|
|
|
|
|
|
|
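    # A 1x1 confusion matrix means only one class was seen during testing;
    # both ratios are then undefined.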
if cm.shape == (1, 1): |
|
msg = "samples of only one class were seen during testing " |
|
if raise_warning: |
|
warnings.warn(msg, UserWarning, stacklevel=2) |
|
positive_likelihood_ratio = np.nan |
|
negative_likelihood_ratio = np.nan |
|
else: |
|
tn, fp, fn, tp = cm.ravel() |
|
support_pos = tp + fn |
|
support_neg = tn + fp |
|
pos_num = tp * support_neg |
|
pos_denom = fp * support_pos |
|
neg_num = fn * support_neg |
|
neg_denom = tn * support_pos |
|
|
|
|
|
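        # Guard each division below: warn (when raise_warning=True) and
        # return nan instead of dividing by zero.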
if support_pos == 0: |
|
msg = "no samples of the positive class were present in the testing set " |
|
if raise_warning: |
|
warnings.warn(msg, UserWarning, stacklevel=2) |
|
positive_likelihood_ratio = np.nan |
|
negative_likelihood_ratio = np.nan |
|
if fp == 0: |
|
if tp == 0: |
|
msg = "no samples predicted for the positive class" |
|
else: |
|
msg = "positive_likelihood_ratio ill-defined and being set to nan " |
|
if raise_warning: |
|
warnings.warn(msg, UserWarning, stacklevel=2) |
|
positive_likelihood_ratio = np.nan |
|
else: |
|
positive_likelihood_ratio = pos_num / pos_denom |
|
if tn == 0: |
|
msg = "negative_likelihood_ratio ill-defined and being set to nan " |
|
if raise_warning: |
|
warnings.warn(msg, UserWarning, stacklevel=2) |
|
negative_likelihood_ratio = np.nan |
|
else: |
|
negative_likelihood_ratio = neg_num / neg_denom |
|
|
|
return positive_likelihood_ratio, negative_likelihood_ratio |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like", "sparse matrix"], |
|
"y_pred": ["array-like", "sparse matrix"], |
|
"labels": ["array-like", None], |
|
"pos_label": [Real, str, "boolean", None], |
|
"average": [ |
|
StrOptions({"micro", "macro", "samples", "weighted", "binary"}), |
|
None, |
|
], |
|
"sample_weight": ["array-like", None], |
|
"zero_division": [ |
|
Options(Real, {0.0, 1.0}), |
|
"nan", |
|
StrOptions({"warn"}), |
|
], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def precision_score( |
|
y_true, |
|
y_pred, |
|
*, |
|
labels=None, |
|
pos_label=1, |
|
average="binary", |
|
sample_weight=None, |
|
zero_division="warn", |
|
): |
|
"""Compute the precision. |
|
|
|
The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of |
|
true positives and ``fp`` the number of false positives. The precision is |
|
intuitively the ability of the classifier not to label as positive a sample |
|
that is negative. |
|
|
|
The best value is 1 and the worst value is 0. |
|
|
|
    Support beyond :term:`binary` targets is achieved by treating :term:`multiclass`
|
and :term:`multilabel` data as a collection of binary problems, one for each |
|
label. For the :term:`binary` case, setting `average='binary'` will return |
|
precision for `pos_label`. If `average` is not `'binary'`, `pos_label` is ignored |
|
    and precision for both classes is computed, then averaged or both returned (when
|
`average=None`). Similarly, for :term:`multiclass` and :term:`multilabel` targets, |
|
    precision for all `labels` is either returned or averaged depending on the
|
    `average` parameter. Use `labels` to specify the set of labels to calculate precision
|
for. |
|
|
|
Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : 1d array-like, or label indicator array / sparse matrix |
|
Ground truth (correct) target values. |
|
|
|
y_pred : 1d array-like, or label indicator array / sparse matrix |
|
Estimated targets as returned by a classifier. |
|
|
|
labels : array-like, default=None |
|
The set of labels to include when `average != 'binary'`, and their |
|
order if `average is None`. Labels present in the data can be |
|
excluded, for example in multiclass classification to exclude a "negative |
|
class". Labels not present in the data can be included and will be |
|
"assigned" 0 samples. For multilabel targets, labels are column indices. |
|
By default, all labels in `y_true` and `y_pred` are used in sorted order. |
|
|
|
.. versionchanged:: 0.17 |
|
Parameter `labels` improved for multiclass problem. |
|
|
|
pos_label : int, float, bool or str, default=1 |
|
The class to report if `average='binary'` and the data is binary, |
|
otherwise this parameter is ignored. |
|
For multiclass or multilabel targets, set `labels=[pos_label]` and |
|
`average != 'binary'` to report metrics for one label only. |
|
|
|
average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ |
|
default='binary' |
|
This parameter is required for multiclass/multilabel targets. |
|
If ``None``, the metrics for each class are returned. Otherwise, this |
|
determines the type of averaging performed on the data: |
|
|
|
``'binary'``: |
|
Only report results for the class specified by ``pos_label``. |
|
This is applicable only if targets (``y_{true,pred}``) are binary. |
|
``'micro'``: |
|
Calculate metrics globally by counting the total true positives, |
|
false negatives and false positives. |
|
``'macro'``: |
|
Calculate metrics for each label, and find their unweighted |
|
mean. This does not take label imbalance into account. |
|
``'weighted'``: |
|
Calculate metrics for each label, and find their average weighted |
|
by support (the number of true instances for each label). This |
|
alters 'macro' to account for label imbalance; it can result in an |
|
F-score that is not between precision and recall. |
|
``'samples'``: |
|
Calculate metrics for each instance, and find their average (only |
|
meaningful for multilabel classification where this differs from |
|
:func:`accuracy_score`). |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" |
|
Sets the value to return when there is a zero division. |
|
|
|
Notes: |
|
|
|
- If set to "warn", this acts like 0, but a warning is also raised. |
|
- If set to `np.nan`, such values will be excluded from the average. |
|
|
|
.. versionadded:: 1.3 |
|
`np.nan` option was added. |
|
|
|
Returns |
|
------- |
|
precision : float (if average is not None) or array of float of shape \ |
|
(n_unique_labels,) |
|
Precision of the positive class in binary classification or weighted |
|
average of the precision of each class for the multiclass task. |
|
|
|
See Also |
|
-------- |
|
precision_recall_fscore_support : Compute precision, recall, F-measure and |
|
support for each class. |
|
recall_score : Compute the ratio ``tp / (tp + fn)`` where ``tp`` is the |
|
number of true positives and ``fn`` the number of false negatives. |
|
PrecisionRecallDisplay.from_estimator : Plot precision-recall curve given |
|
an estimator and some data. |
|
PrecisionRecallDisplay.from_predictions : Plot precision-recall curve given |
|
binary class predictions. |
|
multilabel_confusion_matrix : Compute a confusion matrix for each class or |
|
sample. |
|
|
|
Notes |
|
----- |
|
When ``true positive + false positive == 0``, precision returns 0 and |
|
raises ``UndefinedMetricWarning``. This behavior can be |
|
modified with ``zero_division``. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.metrics import precision_score |
|
>>> y_true = [0, 1, 2, 0, 1, 2] |
|
>>> y_pred = [0, 2, 1, 0, 0, 1] |
|
>>> precision_score(y_true, y_pred, average='macro') |
|
0.22... |
|
>>> precision_score(y_true, y_pred, average='micro') |
|
0.33... |
|
>>> precision_score(y_true, y_pred, average='weighted') |
|
0.22... |
|
>>> precision_score(y_true, y_pred, average=None) |
|
array([0.66..., 0. , 0. ]) |
|
>>> y_pred = [0, 0, 0, 0, 0, 0] |
|
>>> precision_score(y_true, y_pred, average=None) |
|
array([0.33..., 0. , 0. ]) |
|
>>> precision_score(y_true, y_pred, average=None, zero_division=1) |
|
array([0.33..., 1. , 1. ]) |
|
>>> precision_score(y_true, y_pred, average=None, zero_division=np.nan) |
|
array([0.33..., nan, nan]) |
|
|
|
>>> # multilabel classification |
|
>>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]] |
|
>>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]] |
|
>>> precision_score(y_true, y_pred, average=None) |
|
array([0.5, 1. , 1. ]) |
|
""" |
|
p, _, _, _ = precision_recall_fscore_support( |
|
y_true, |
|
y_pred, |
|
labels=labels, |
|
pos_label=pos_label, |
|
average=average, |
|
warn_for=("precision",), |
|
sample_weight=sample_weight, |
|
zero_division=zero_division, |
|
) |
|
return p |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like", "sparse matrix"], |
|
"y_pred": ["array-like", "sparse matrix"], |
|
"labels": ["array-like", None], |
|
"pos_label": [Real, str, "boolean", None], |
|
"average": [ |
|
StrOptions({"micro", "macro", "samples", "weighted", "binary"}), |
|
None, |
|
], |
|
"sample_weight": ["array-like", None], |
|
"zero_division": [ |
|
Options(Real, {0.0, 1.0}), |
|
"nan", |
|
StrOptions({"warn"}), |
|
], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def recall_score( |
|
y_true, |
|
y_pred, |
|
*, |
|
labels=None, |
|
pos_label=1, |
|
average="binary", |
|
sample_weight=None, |
|
zero_division="warn", |
|
): |
|
"""Compute the recall. |
|
|
|
The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of |
|
true positives and ``fn`` the number of false negatives. The recall is |
|
intuitively the ability of the classifier to find all the positive samples. |
|
|
|
The best value is 1 and the worst value is 0. |
|
|
|
    Support beyond :term:`binary` targets is achieved by treating :term:`multiclass`
|
and :term:`multilabel` data as a collection of binary problems, one for each |
|
label. For the :term:`binary` case, setting `average='binary'` will return |
|
recall for `pos_label`. If `average` is not `'binary'`, `pos_label` is ignored |
|
    and recall for both classes is computed, then averaged or both returned (when
|
`average=None`). Similarly, for :term:`multiclass` and :term:`multilabel` targets, |
|
    recall for all `labels` is either returned or averaged depending on the `average`
|
    parameter. Use `labels` to specify the set of labels to calculate recall for.
|
|
|
Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : 1d array-like, or label indicator array / sparse matrix |
|
Ground truth (correct) target values. |
|
|
|
y_pred : 1d array-like, or label indicator array / sparse matrix |
|
Estimated targets as returned by a classifier. |
|
|
|
labels : array-like, default=None |
|
The set of labels to include when `average != 'binary'`, and their |
|
order if `average is None`. Labels present in the data can be |
|
excluded, for example in multiclass classification to exclude a "negative |
|
class". Labels not present in the data can be included and will be |
|
"assigned" 0 samples. For multilabel targets, labels are column indices. |
|
By default, all labels in `y_true` and `y_pred` are used in sorted order. |
|
|
|
.. versionchanged:: 0.17 |
|
Parameter `labels` improved for multiclass problem. |
|
|
|
pos_label : int, float, bool or str, default=1 |
|
The class to report if `average='binary'` and the data is binary, |
|
otherwise this parameter is ignored. |
|
For multiclass or multilabel targets, set `labels=[pos_label]` and |
|
`average != 'binary'` to report metrics for one label only. |
|
|
|
average : {'micro', 'macro', 'samples', 'weighted', 'binary'} or None, \ |
|
default='binary' |
|
This parameter is required for multiclass/multilabel targets. |
|
If ``None``, the metrics for each class are returned. Otherwise, this |
|
determines the type of averaging performed on the data: |
|
|
|
``'binary'``: |
|
Only report results for the class specified by ``pos_label``. |
|
This is applicable only if targets (``y_{true,pred}``) are binary. |
|
``'micro'``: |
|
Calculate metrics globally by counting the total true positives, |
|
false negatives and false positives. |
|
``'macro'``: |
|
Calculate metrics for each label, and find their unweighted |
|
mean. This does not take label imbalance into account. |
|
``'weighted'``: |
|
Calculate metrics for each label, and find their average weighted |
|
by support (the number of true instances for each label). This |
|
alters 'macro' to account for label imbalance; it can result in an |
|
F-score that is not between precision and recall. Weighted recall |
|
is equal to accuracy. |
|
``'samples'``: |
|
Calculate metrics for each instance, and find their average (only |
|
meaningful for multilabel classification where this differs from |
|
:func:`accuracy_score`). |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" |
|
Sets the value to return when there is a zero division. |
|
|
|
Notes: |
|
|
|
- If set to "warn", this acts like 0, but a warning is also raised. |
|
- If set to `np.nan`, such values will be excluded from the average. |
|
|
|
.. versionadded:: 1.3 |
|
`np.nan` option was added. |
|
|
|
Returns |
|
------- |
|
recall : float (if average is not None) or array of float of shape \ |
|
(n_unique_labels,) |
|
Recall of the positive class in binary classification or weighted |
|
average of the recall of each class for the multiclass task. |
|
|
|
See Also |
|
-------- |
|
precision_recall_fscore_support : Compute precision, recall, F-measure and |
|
support for each class. |
|
precision_score : Compute the ratio ``tp / (tp + fp)`` where ``tp`` is the |
|
number of true positives and ``fp`` the number of false positives. |
|
balanced_accuracy_score : Compute balanced accuracy to deal with imbalanced |
|
datasets. |
|
multilabel_confusion_matrix : Compute a confusion matrix for each class or |
|
sample. |
|
PrecisionRecallDisplay.from_estimator : Plot precision-recall curve given |
|
an estimator and some data. |
|
PrecisionRecallDisplay.from_predictions : Plot precision-recall curve given |
|
binary class predictions. |
|
|
|
Notes |
|
----- |
|
When ``true positive + false negative == 0``, recall returns 0 and raises |
|
``UndefinedMetricWarning``. This behavior can be modified with |
|
``zero_division``. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.metrics import recall_score |
|
>>> y_true = [0, 1, 2, 0, 1, 2] |
|
>>> y_pred = [0, 2, 1, 0, 0, 1] |
|
>>> recall_score(y_true, y_pred, average='macro') |
|
0.33... |
|
>>> recall_score(y_true, y_pred, average='micro') |
|
0.33... |
|
>>> recall_score(y_true, y_pred, average='weighted') |
|
0.33... |
|
>>> recall_score(y_true, y_pred, average=None) |
|
array([1., 0., 0.]) |
|
>>> y_true = [0, 0, 0, 0, 0, 0] |
|
>>> recall_score(y_true, y_pred, average=None) |
|
array([0.5, 0. , 0. ]) |
|
>>> recall_score(y_true, y_pred, average=None, zero_division=1) |
|
array([0.5, 1. , 1. ]) |
|
>>> recall_score(y_true, y_pred, average=None, zero_division=np.nan) |
|
array([0.5, nan, nan]) |
|
|
|
>>> # multilabel classification |
|
>>> y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]] |
|
>>> y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]] |
|
>>> recall_score(y_true, y_pred, average=None) |
|
array([1. , 1. , 0.5]) |
|
""" |
|
_, r, _, _ = precision_recall_fscore_support( |
|
y_true, |
|
y_pred, |
|
labels=labels, |
|
pos_label=pos_label, |
|
average=average, |
|
warn_for=("recall",), |
|
sample_weight=sample_weight, |
|
zero_division=zero_division, |
|
) |
|
return r |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"y_pred": ["array-like"], |
|
"sample_weight": ["array-like", None], |
|
"adjusted": ["boolean"], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, adjusted=False): |
|
"""Compute the balanced accuracy. |
|
|
|
    The balanced accuracy in binary and multiclass classification problems is used to
|
deal with imbalanced datasets. It is defined as the average of recall |
|
obtained on each class. |
|
|
|
The best value is 1 and the worst value is 0 when ``adjusted=False``. |
|
|
|
Read more in the :ref:`User Guide <balanced_accuracy_score>`. |
|
|
|
.. versionadded:: 0.20 |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like of shape (n_samples,) |
|
Ground truth (correct) target values. |
|
|
|
y_pred : array-like of shape (n_samples,) |
|
Estimated targets as returned by a classifier. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
adjusted : bool, default=False |
|
When true, the result is adjusted for chance, so that random |
|
performance would score 0, while keeping perfect performance at a score |
|
of 1. |
|
|
|
Returns |
|
------- |
|
balanced_accuracy : float |
|
Balanced accuracy score. |
|
|
|
See Also |
|
-------- |
|
average_precision_score : Compute average precision (AP) from prediction |
|
scores. |
|
precision_score : Compute the precision score. |
|
recall_score : Compute the recall score. |
|
roc_auc_score : Compute Area Under the Receiver Operating Characteristic |
|
Curve (ROC AUC) from prediction scores. |
|
|
|
Notes |
|
----- |
|
Some literature promotes alternative definitions of balanced accuracy. Our |
|
definition is equivalent to :func:`accuracy_score` with class-balanced |
|
sample weights, and shares desirable properties with the binary case. |
|
See the :ref:`User Guide <balanced_accuracy_score>`. |
|
|
|
References |
|
---------- |
|
.. [1] Brodersen, K.H.; Ong, C.S.; Stephan, K.E.; Buhmann, J.M. (2010). |
|
The balanced accuracy and its posterior distribution. |
|
Proceedings of the 20th International Conference on Pattern |
|
Recognition, 3121-24. |
|
.. [2] John. D. Kelleher, Brian Mac Namee, Aoife D'Arcy, (2015). |
|
`Fundamentals of Machine Learning for Predictive Data Analytics: |
|
Algorithms, Worked Examples, and Case Studies |
|
<https://mitpress.mit.edu/books/fundamentals-machine-learning-predictive-data-analytics>`_. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.metrics import balanced_accuracy_score |
|
>>> y_true = [0, 1, 0, 0, 1, 0] |
|
>>> y_pred = [0, 1, 0, 0, 0, 1] |
|
>>> balanced_accuracy_score(y_true, y_pred) |
|
np.float64(0.625) |
|
""" |
|
C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight) |
|
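    # Per-class recall is the diagonal of the confusion matrix divided by the
    # row sums (per-class support); classes absent from y_true give nan and
    # are dropped from the average.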
with np.errstate(divide="ignore", invalid="ignore"): |
|
per_class = np.diag(C) / C.sum(axis=1) |
|
if np.any(np.isnan(per_class)): |
|
warnings.warn("y_pred contains classes not in y_true") |
|
per_class = per_class[~np.isnan(per_class)] |
|
score = np.mean(per_class) |
|
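    # Adjust for chance: rescale so that random (or constant) predictions
    # score 0 and perfect predictions score 1, i.e.
    # (score - 1 / n_classes) / (1 - 1 / n_classes).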
if adjusted: |
|
n_classes = len(per_class) |
|
chance = 1 / n_classes |
|
score -= chance |
|
score /= 1 - chance |
|
return score |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like", "sparse matrix"], |
|
"y_pred": ["array-like", "sparse matrix"], |
|
"labels": ["array-like", None], |
|
"target_names": ["array-like", None], |
|
"sample_weight": ["array-like", None], |
|
"digits": [Interval(Integral, 0, None, closed="left")], |
|
"output_dict": ["boolean"], |
|
"zero_division": [ |
|
Options(Real, {0.0, 1.0}), |
|
"nan", |
|
StrOptions({"warn"}), |
|
], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def classification_report( |
|
y_true, |
|
y_pred, |
|
*, |
|
labels=None, |
|
target_names=None, |
|
sample_weight=None, |
|
digits=2, |
|
output_dict=False, |
|
zero_division="warn", |
|
): |
|
"""Build a text report showing the main classification metrics. |
|
|
|
Read more in the :ref:`User Guide <classification_report>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : 1d array-like, or label indicator array / sparse matrix |
|
Ground truth (correct) target values. |
|
|
|
y_pred : 1d array-like, or label indicator array / sparse matrix |
|
Estimated targets as returned by a classifier. |
|
|
|
labels : array-like of shape (n_labels,), default=None |
|
Optional list of label indices to include in the report. |
|
|
|
target_names : array-like of shape (n_labels,), default=None |
|
Optional display names matching the labels (same order). |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
digits : int, default=2 |
|
Number of digits for formatting output floating point values. |
|
When ``output_dict`` is ``True``, this will be ignored and the |
|
returned values will not be rounded. |
|
|
|
output_dict : bool, default=False |
|
If True, return output as dict. |
|
|
|
.. versionadded:: 0.20 |
|
|
|
zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn" |
|
Sets the value to return when there is a zero division. If set to |
|
"warn", this acts as 0, but warnings are also raised. |
|
|
|
.. versionadded:: 1.3 |
|
`np.nan` option was added. |
|
|
|
Returns |
|
------- |
|
report : str or dict |
|
Text summary of the precision, recall, F1 score for each class. |
|
Dictionary returned if output_dict is True. Dictionary has the |
|
following structure:: |
|
|
|
{'label 1': {'precision':0.5, |
|
'recall':1.0, |
|
'f1-score':0.67, |
|
'support':1}, |
|
'label 2': { ... }, |
|
... |
|
} |
|
|
|
The reported averages include macro average (averaging the unweighted |
|
mean per label), weighted average (averaging the support-weighted mean |
|
per label), and sample average (only for multilabel classification). |
|
Micro average (averaging the total true positives, false negatives and |
|
false positives) is only shown for multi-label or multi-class |
|
with a subset of classes, because it corresponds to accuracy |
|
otherwise and would be the same for all metrics. |
|
See also :func:`precision_recall_fscore_support` for more details |
|
on averages. |
|
|
|
Note that in binary classification, recall of the positive class |
|
is also known as "sensitivity"; recall of the negative class is |
|
"specificity". |
|
|
|
See Also |
|
-------- |
|
precision_recall_fscore_support: Compute precision, recall, F-measure and |
|
support for each class. |
|
confusion_matrix: Compute confusion matrix to evaluate the accuracy of a |
|
classification. |
|
multilabel_confusion_matrix: Compute a confusion matrix for each class or sample. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.metrics import classification_report |
|
>>> y_true = [0, 1, 2, 2, 2] |
|
>>> y_pred = [0, 0, 2, 2, 1] |
|
>>> target_names = ['class 0', 'class 1', 'class 2'] |
|
>>> print(classification_report(y_true, y_pred, target_names=target_names)) |
|
precision recall f1-score support |
|
<BLANKLINE> |
|
class 0 0.50 1.00 0.67 1 |
|
class 1 0.00 0.00 0.00 1 |
|
class 2 1.00 0.67 0.80 3 |
|
<BLANKLINE> |
|
accuracy 0.60 5 |
|
macro avg 0.50 0.56 0.49 5 |
|
weighted avg 0.70 0.60 0.61 5 |
|
<BLANKLINE> |
|
>>> y_pred = [1, 1, 0] |
|
>>> y_true = [1, 1, 1] |
|
>>> print(classification_report(y_true, y_pred, labels=[1, 2, 3])) |
|
precision recall f1-score support |
|
<BLANKLINE> |
|
1 1.00 0.67 0.80 3 |
|
2 0.00 0.00 0.00 0 |
|
3 0.00 0.00 0.00 0 |
|
<BLANKLINE> |
|
micro avg 1.00 0.67 0.80 3 |
|
macro avg 0.33 0.22 0.27 3 |
|
weighted avg 1.00 0.67 0.80 3 |
|
<BLANKLINE> |
|
""" |
|
|
|
y_true, y_pred = attach_unique(y_true, y_pred) |
|
y_type, y_true, y_pred = _check_targets(y_true, y_pred) |
|
|
|
if labels is None: |
|
labels = unique_labels(y_true, y_pred) |
|
labels_given = False |
|
else: |
|
labels = np.asarray(labels) |
|
labels_given = True |
|
|
|
|
|
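    # Micro average is reported as "accuracy" only for binary/multiclass
    # targets when `labels` covers every class present in the data.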
micro_is_accuracy = (y_type == "multiclass" or y_type == "binary") and ( |
|
not labels_given or (set(labels) >= set(unique_labels(y_true, y_pred))) |
|
) |
|
|
|
if target_names is not None and len(labels) != len(target_names): |
|
if labels_given: |
|
warnings.warn( |
|
"labels size, {0}, does not match size of target_names, {1}".format( |
|
len(labels), len(target_names) |
|
) |
|
) |
|
else: |
|
raise ValueError( |
|
"Number of classes, {0}, does not match size of " |
|
"target_names, {1}. Try specifying the labels " |
|
"parameter".format(len(labels), len(target_names)) |
|
) |
|
if target_names is None: |
|
target_names = ["%s" % l for l in labels] |
|
|
|
headers = ["precision", "recall", "f1-score", "support"] |
|
|
|
p, r, f1, s = precision_recall_fscore_support( |
|
y_true, |
|
y_pred, |
|
labels=labels, |
|
average=None, |
|
sample_weight=sample_weight, |
|
zero_division=zero_division, |
|
) |
|
rows = zip(target_names, p, r, f1, s) |
|
|
|
if y_type.startswith("multilabel"): |
|
average_options = ("micro", "macro", "weighted", "samples") |
|
else: |
|
average_options = ("micro", "macro", "weighted") |
|
|
|
if output_dict: |
|
report_dict = {label[0]: label[1:] for label in rows} |
|
for label, scores in report_dict.items(): |
|
report_dict[label] = dict(zip(headers, [float(i) for i in scores])) |
|
else: |
|
longest_last_line_heading = "weighted avg" |
|
name_width = max(len(cn) for cn in target_names) |
|
width = max(name_width, len(longest_last_line_heading), digits) |
|
head_fmt = "{:>{width}s} " + " {:>9}" * len(headers) |
|
report = head_fmt.format("", *headers, width=width) |
|
report += "\n\n" |
|
row_fmt = "{:>{width}s} " + " {:>9.{digits}f}" * 3 + " {:>9}\n" |
|
for row in rows: |
|
report += row_fmt.format(*row, width=width, digits=digits) |
|
report += "\n" |
|
|
|
|
|
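    # Compute each requested average (micro/macro/weighted, plus samples for
    # multilabel targets) and append it to the report.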
for average in average_options: |
|
if average.startswith("micro") and micro_is_accuracy: |
|
line_heading = "accuracy" |
|
else: |
|
line_heading = average + " avg" |
|
|
|
|
|
avg_p, avg_r, avg_f1, _ = precision_recall_fscore_support( |
|
y_true, |
|
y_pred, |
|
labels=labels, |
|
average=average, |
|
sample_weight=sample_weight, |
|
zero_division=zero_division, |
|
) |
|
avg = [avg_p, avg_r, avg_f1, np.sum(s)] |
|
|
|
if output_dict: |
|
report_dict[line_heading] = dict(zip(headers, [float(i) for i in avg])) |
|
else: |
|
if line_heading == "accuracy": |
|
row_fmt_accuracy = ( |
|
"{:>{width}s} " |
|
+ " {:>9.{digits}}" * 2 |
|
+ " {:>9.{digits}f}" |
|
+ " {:>9}\n" |
|
) |
|
report += row_fmt_accuracy.format( |
|
line_heading, "", "", *avg[2:], width=width, digits=digits |
|
) |
|
else: |
|
report += row_fmt.format(line_heading, *avg, width=width, digits=digits) |
|
|
|
if output_dict: |
|
if "accuracy" in report_dict.keys(): |
|
report_dict["accuracy"] = report_dict["accuracy"]["precision"] |
|
return report_dict |
|
else: |
|
return report |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like", "sparse matrix"], |
|
"y_pred": ["array-like", "sparse matrix"], |
|
"sample_weight": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def hamming_loss(y_true, y_pred, *, sample_weight=None): |
|
"""Compute the average Hamming loss. |
|
|
|
The Hamming loss is the fraction of labels that are incorrectly predicted. |
|
|
|
Read more in the :ref:`User Guide <hamming_loss>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : 1d array-like, or label indicator array / sparse matrix |
|
Ground truth (correct) labels. |
|
|
|
y_pred : 1d array-like, or label indicator array / sparse matrix |
|
Predicted labels, as returned by a classifier. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
.. versionadded:: 0.18 |
|
|
|
Returns |
|
------- |
|
loss : float or int |
|
        Return the average Hamming loss between the elements of ``y_true`` and
|
``y_pred``. |
|
|
|
See Also |
|
-------- |
|
accuracy_score : Compute the accuracy score. By default, the function will |
|
        return the number of correct predictions divided by the total number
|
of predictions. |
|
jaccard_score : Compute the Jaccard similarity coefficient score. |
|
zero_one_loss : Compute the Zero-one classification loss. By default, the |
|
        function will return the fraction of imperfectly predicted subsets.
|
|
|
Notes |
|
----- |
|
In multiclass classification, the Hamming loss corresponds to the Hamming |
|
distance between ``y_true`` and ``y_pred`` which is equivalent to the |
|
    subset ``zero_one_loss`` function, when the `normalize` parameter is set to
|
True. |
|
|
|
In multilabel classification, the Hamming loss is different from the |
|
subset zero-one loss. The zero-one loss considers the entire set of labels |
|
for a given sample incorrect if it does not entirely match the true set of |
|
labels. Hamming loss is more forgiving in that it penalizes only the |
|
individual labels. |
|
|
|
    The Hamming loss is upper-bounded by the subset zero-one loss, when the
|
`normalize` parameter is set to True. It is always between 0 and 1, |
|
lower being better. |
|
|
|
References |
|
---------- |
|
.. [1] Grigorios Tsoumakas, Ioannis Katakis. Multi-Label Classification: |
|
An Overview. International Journal of Data Warehousing & Mining, |
|
3(3), 1-13, July-September 2007. |
|
|
|
.. [2] `Wikipedia entry on the Hamming distance |
|
<https://en.wikipedia.org/wiki/Hamming_distance>`_. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.metrics import hamming_loss |
|
>>> y_pred = [1, 2, 3, 4] |
|
>>> y_true = [2, 2, 3, 4] |
|
>>> hamming_loss(y_true, y_pred) |
|
0.25 |
|
|
|
In the multilabel case with binary label indicators: |
|
|
|
>>> import numpy as np |
|
>>> hamming_loss(np.array([[0, 1], [1, 1]]), np.zeros((2, 2))) |
|
0.75 |
|
""" |
|
y_true, y_pred = attach_unique(y_true, y_pred) |
|
y_type, y_true, y_pred = _check_targets(y_true, y_pred) |
|
check_consistent_length(y_true, y_pred, sample_weight) |
|
|
|
if sample_weight is None: |
|
weight_average = 1.0 |
|
else: |
|
weight_average = np.mean(sample_weight) |
|
|
|
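    # For multilabel targets, the Hamming loss is the (weighted) fraction of
    # individual label assignments that differ between y_true and y_pred.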
if y_type.startswith("multilabel"): |
|
n_differences = count_nonzero(y_true - y_pred, sample_weight=sample_weight) |
|
return n_differences / (y_true.shape[0] * y_true.shape[1] * weight_average) |
|
|
|
elif y_type in ["binary", "multiclass"]: |
|
return float(_average(y_true != y_pred, weights=sample_weight, normalize=True)) |
|
else: |
|
raise ValueError("{0} is not supported".format(y_type)) |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"y_pred": ["array-like"], |
|
"normalize": ["boolean"], |
|
"sample_weight": ["array-like", None], |
|
"labels": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def log_loss(y_true, y_pred, *, normalize=True, sample_weight=None, labels=None): |
|
r"""Log loss, aka logistic loss or cross-entropy loss. |
|
|
|
This is the loss function used in (multinomial) logistic regression |
|
and extensions of it such as neural networks, defined as the negative |
|
log-likelihood of a logistic model that returns ``y_pred`` probabilities |
|
for its training data ``y_true``. |
|
The log loss is only defined for two or more labels. |
|
For a single sample with true label :math:`y \in \{0,1\}` and |
|
a probability estimate :math:`p = \operatorname{Pr}(y = 1)`, the log |
|
loss is: |
|
|
|
.. math:: |
|
L_{\log}(y, p) = -(y \log (p) + (1 - y) \log (1 - p)) |
|
|
|
Read more in the :ref:`User Guide <log_loss>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like or label indicator matrix |
|
Ground truth (correct) labels for n_samples samples. |
|
|
|
y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,) |
|
Predicted probabilities, as returned by a classifier's |
|
predict_proba method. If ``y_pred.shape = (n_samples,)`` |
|
the probabilities provided are assumed to be that of the |
|
positive class. The labels in ``y_pred`` are assumed to be |
|
ordered alphabetically, as done by |
|
:class:`~sklearn.preprocessing.LabelBinarizer`. |
|
|
|
`y_pred` values are clipped to `[eps, 1-eps]` where `eps` is the machine |
|
precision for `y_pred`'s dtype. |
|
|
|
normalize : bool, default=True |
|
If true, return the mean loss per sample. |
|
Otherwise, return the sum of the per-sample losses. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
labels : array-like, default=None |
|
If not provided, labels will be inferred from y_true. If ``labels`` |
|
is ``None`` and ``y_pred`` has shape (n_samples,) the labels are |
|
assumed to be binary and are inferred from ``y_true``. |
|
|
|
.. versionadded:: 0.18 |
|
|
|
Returns |
|
------- |
|
loss : float |
|
Log loss, aka logistic loss or cross-entropy loss. |
|
|
|
Notes |
|
----- |
|
The logarithm used is the natural logarithm (base-e). |
|
|
|
References |
|
---------- |
|
C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer, |
|
p. 209. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.metrics import log_loss |
|
>>> log_loss(["spam", "ham", "ham", "spam"], |
|
... [[.1, .9], [.9, .1], [.8, .2], [.35, .65]]) |
|
0.21616... |
|
""" |
|
y_pred = check_array( |
|
y_pred, ensure_2d=False, dtype=[np.float64, np.float32, np.float16] |
|
) |
|
|
|
check_consistent_length(y_pred, y_true, sample_weight) |
|
lb = LabelBinarizer() |
|
|
|
if labels is not None: |
|
lb.fit(labels) |
|
else: |
|
lb.fit(y_true) |
|
|
|
if len(lb.classes_) == 1: |
|
if labels is None: |
|
raise ValueError( |
|
"y_true contains only one label ({0}). Please " |
|
"provide the true labels explicitly through the " |
|
"labels argument.".format(lb.classes_[0]) |
|
) |
|
else: |
|
raise ValueError( |
|
"The labels array needs to contain at least two " |
|
"labels for log_loss, " |
|
"got {0}.".format(lb.classes_) |
|
) |
|
|
|
transformed_labels = lb.transform(y_true) |
|
|
|
if transformed_labels.shape[1] == 1: |
|
transformed_labels = np.append( |
|
1 - transformed_labels, transformed_labels, axis=1 |
|
) |
|
|
|
|
|
|
|
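    # A 1d y_pred is interpreted as the probability of the positive class of
    # a binary problem; expand it to the two-column (1 - p, p) layout.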
if y_pred.ndim == 1: |
|
y_pred = y_pred[:, np.newaxis] |
|
if y_pred.shape[1] == 1: |
|
y_pred = np.append(1 - y_pred, y_pred, axis=1) |
|
|
|
eps = np.finfo(y_pred.dtype).eps |
|
|
|
|
|
y_pred_sum = y_pred.sum(axis=1) |
|
if not np.allclose(y_pred_sum, 1, rtol=np.sqrt(eps)): |
|
warnings.warn( |
|
"The y_pred values do not sum to one. Make sure to pass probabilities.", |
|
UserWarning, |
|
) |
|
|
|
|
|
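    # Clip probabilities to [eps, 1 - eps] so that log(0) cannot occur below.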
y_pred = np.clip(y_pred, eps, 1 - eps) |
|
|
|
|
|
transformed_labels = check_array(transformed_labels) |
|
if len(lb.classes_) != y_pred.shape[1]: |
|
if labels is None: |
|
raise ValueError( |
|
"y_true and y_pred contain different number of " |
|
"classes {0}, {1}. Please provide the true " |
|
"labels explicitly through the labels argument. " |
|
"Classes found in " |
|
"y_true: {2}".format( |
|
transformed_labels.shape[1], y_pred.shape[1], lb.classes_ |
|
) |
|
) |
|
else: |
|
raise ValueError( |
|
"The number of classes in labels is different " |
|
"from that in y_pred. Classes found in " |
|
"labels: {0}".format(lb.classes_) |
|
) |
|
|
|
loss = -xlogy(transformed_labels, y_pred).sum(axis=1) |
|
|
|
return float(_average(loss, weights=sample_weight, normalize=normalize)) |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"pred_decision": ["array-like"], |
|
"labels": ["array-like", None], |
|
"sample_weight": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def hinge_loss(y_true, pred_decision, *, labels=None, sample_weight=None): |
|
"""Average hinge loss (non-regularized). |
|
|
|
    In the binary case, assuming labels in y_true are encoded with +1 and -1,
|
when a prediction mistake is made, ``margin = y_true * pred_decision`` is |
|
always negative (since the signs disagree), implying ``1 - margin`` is |
|
always greater than 1. The cumulated hinge loss is therefore an upper |
|
bound of the number of mistakes made by the classifier. |
|
|
|
    In the multiclass case, the function expects that either all the labels are
|
included in y_true or an optional labels argument is provided which |
|
    contains all the labels. The multiclass margin is calculated according
|
to Crammer-Singer's method. As in the binary case, the cumulated hinge loss |
|
is an upper bound of the number of mistakes made by the classifier. |
|
|
|
Read more in the :ref:`User Guide <hinge_loss>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like of shape (n_samples,) |
|
True target, consisting of integers of two values. The positive label |
|
must be greater than the negative label. |
|
|
|
pred_decision : array-like of shape (n_samples,) or (n_samples, n_classes) |
|
Predicted decisions, as output by decision_function (floats). |
|
|
|
labels : array-like, default=None |
|
Contains all the labels for the problem. Used in multiclass hinge loss. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
Returns |
|
------- |
|
loss : float |
|
Average hinge loss. |
|
|
|
References |
|
---------- |
|
.. [1] `Wikipedia entry on the Hinge loss |
|
<https://en.wikipedia.org/wiki/Hinge_loss>`_. |
|
|
|
.. [2] Koby Crammer, Yoram Singer. On the Algorithmic |
|
Implementation of Multiclass Kernel-based Vector |
|
Machines. Journal of Machine Learning Research 2, |
|
(2001), 265-292. |
|
|
|
.. [3] `L1 AND L2 Regularization for Multiclass Hinge Loss Models |
|
by Robert C. Moore, John DeNero |
|
<https://storage.googleapis.com/pub-tools-public-publication-data/pdf/37362.pdf>`_. |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn import svm |
|
>>> from sklearn.metrics import hinge_loss |
|
>>> X = [[0], [1]] |
|
>>> y = [-1, 1] |
|
>>> est = svm.LinearSVC(random_state=0) |
|
>>> est.fit(X, y) |
|
LinearSVC(random_state=0) |
|
>>> pred_decision = est.decision_function([[-2], [3], [0.5]]) |
|
>>> pred_decision |
|
array([-2.18..., 2.36..., 0.09...]) |
|
>>> hinge_loss([-1, 1, 1], pred_decision) |
|
np.float64(0.30...) |
|
|
|
In the multiclass case: |
|
|
|
>>> import numpy as np |
|
>>> X = np.array([[0], [1], [2], [3]]) |
|
>>> Y = np.array([0, 1, 2, 3]) |
|
>>> labels = np.array([0, 1, 2, 3]) |
|
>>> est = svm.LinearSVC() |
|
>>> est.fit(X, Y) |
|
LinearSVC() |
|
>>> pred_decision = est.decision_function([[-1], [2], [3]]) |
|
>>> y_true = [0, 2, 3] |
|
>>> hinge_loss(y_true, pred_decision, labels=labels) |
|
np.float64(0.56...) |
|
""" |
|
check_consistent_length(y_true, pred_decision, sample_weight) |
|
pred_decision = check_array(pred_decision, ensure_2d=False) |
|
y_true = column_or_1d(y_true) |
|
y_true_unique = np.unique(labels if labels is not None else y_true) |
|
|
|
if y_true_unique.size > 2: |
|
if pred_decision.ndim <= 1: |
|
            raise ValueError(

                "The shape of pred_decision cannot be 1d array "
|
"with a multiclass target. pred_decision shape " |
|
"must be (n_samples, n_classes), that is " |
|
f"({y_true.shape[0]}, {y_true_unique.size})." |
|
f" Got: {pred_decision.shape}" |
|
) |
|
|
|
|
|
if y_true_unique.size != pred_decision.shape[1]: |
|
if labels is None: |
|
raise ValueError( |
|
"Please include all labels in y_true " |
|
"or pass labels as third argument" |
|
) |
|
else: |
|
raise ValueError( |
|
"The shape of pred_decision is not " |
|
"consistent with the number of classes. " |
|
"With a multiclass target, pred_decision " |
|
"shape must be " |
|
"(n_samples, n_classes), that is " |
|
f"({y_true.shape[0]}, {y_true_unique.size}). " |
|
f"Got: {pred_decision.shape}" |
|
) |
|
if labels is None: |
|
labels = y_true_unique |
|
le = LabelEncoder() |
|
le.fit(labels) |
|
y_true = le.transform(y_true) |
|
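        # Crammer-Singer margin: decision value of the true class minus the
        # largest decision value among the remaining classes.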
mask = np.ones_like(pred_decision, dtype=bool) |
|
mask[np.arange(y_true.shape[0]), y_true] = False |
|
margin = pred_decision[~mask] |
|
margin -= np.max(pred_decision[mask].reshape(y_true.shape[0], -1), axis=1) |
|
|
|
else: |
|
|
|
|
|
|
|
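        # Binary case: encode y_true as {-1, +1} so that the margin is simply
        # y_true * pred_decision.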
pred_decision = column_or_1d(pred_decision) |
|
pred_decision = np.ravel(pred_decision) |
|
|
|
lbin = LabelBinarizer(neg_label=-1) |
|
y_true = lbin.fit_transform(y_true)[:, 0] |
|
|
|
try: |
|
margin = y_true * pred_decision |
|
except TypeError: |
|
raise TypeError("pred_decision should be an array of floats.") |
|
|
|
losses = 1 - margin |
|
|
|
np.clip(losses, 0, None, out=losses) |
|
return np.average(losses, weights=sample_weight) |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"y_proba": ["array-like", Hidden(None)], |
|
"sample_weight": ["array-like", None], |
|
"pos_label": [Real, str, "boolean", None], |
|
"y_prob": ["array-like", Hidden(StrOptions({"deprecated"}))], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def brier_score_loss( |
|
y_true, y_proba=None, *, sample_weight=None, pos_label=None, y_prob="deprecated" |
|
): |
|
"""Compute the Brier score loss. |
|
|
|
The smaller the Brier score loss, the better, hence the naming with "loss". |
|
The Brier score measures the mean squared difference between the predicted |
|
probability and the actual outcome. The Brier score always |
|
takes on a value between zero and one, since this is the largest |
|
possible difference between a predicted probability (which must be |
|
between zero and one) and the actual outcome (which can take on values |
|
of only 0 and 1). It can be decomposed as the sum of refinement loss and |
|
calibration loss. |
|
|
|
The Brier score is appropriate for binary and categorical outcomes that |
|
can be structured as true or false, but is inappropriate for ordinal |
|
variables which can take on three or more values (this is because the |
|
Brier score assumes that all possible outcomes are equivalently |
|
"distant" from one another). Which label is considered to be the positive |
|
label is controlled via the parameter `pos_label`, which defaults to |
|
the greater label unless `y_true` is all 0 or all -1, in which case |
|
`pos_label` defaults to 1. |
|
|
|
Read more in the :ref:`User Guide <brier_score_loss>`. |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like of shape (n_samples,) |
|
True targets. |
|
|
|
y_proba : array-like of shape (n_samples,) |
|
Probabilities of the positive class. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
pos_label : int, float, bool or str, default=None |
|
Label of the positive class. `pos_label` will be inferred in the |
|
following manner: |
|
|
|
* if `y_true` in {-1, 1} or {0, 1}, `pos_label` defaults to 1; |
|
        * else if `y_true` contains strings, an error will be raised and
|
`pos_label` should be explicitly specified; |
|
* otherwise, `pos_label` defaults to the greater label, |
|
i.e. `np.unique(y_true)[-1]`. |
|
|
|
y_prob : array-like of shape (n_samples,) |
|
Probabilities of the positive class. |
|
|
|
.. deprecated:: 1.5 |
|
`y_prob` is deprecated and will be removed in 1.7. Use |
|
`y_proba` instead. |
|
|
|
Returns |
|
------- |
|
score : float |
|
Brier score loss. |
|
|
|
References |
|
---------- |
|
.. [1] `Wikipedia entry for the Brier score |
|
<https://en.wikipedia.org/wiki/Brier_score>`_. |
|
|
|
Examples |
|
-------- |
|
>>> import numpy as np |
|
>>> from sklearn.metrics import brier_score_loss |
|
>>> y_true = np.array([0, 1, 1, 0]) |
|
>>> y_true_categorical = np.array(["spam", "ham", "ham", "spam"]) |
|
>>> y_prob = np.array([0.1, 0.9, 0.8, 0.3]) |
|
>>> brier_score_loss(y_true, y_prob) |
|
np.float64(0.037...) |
|
>>> brier_score_loss(y_true, 1-y_prob, pos_label=0) |
|
np.float64(0.037...) |
|
>>> brier_score_loss(y_true_categorical, y_prob, pos_label="ham") |
|
np.float64(0.037...) |
|
>>> brier_score_loss(y_true, np.array(y_prob) > 0.5) |
|
np.float64(0.0) |
|
""" |
|
|
|
|
|
|
|
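    # Handle the deprecated `y_prob` alias: passing both names is an error,
    # and when `y_proba` is not given fall back to `y_prob` with a
    # FutureWarning.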
if y_proba is not None and not isinstance(y_prob, str): |
|
raise ValueError( |
|
"`y_prob` and `y_proba` cannot be both specified. Please use `y_proba` only" |
|
" as `y_prob` is deprecated in v1.5 and will be removed in v1.7." |
|
) |
|
if y_proba is None: |
|
warnings.warn( |
|
            (

                "y_prob was deprecated in version 1.5 and will be removed in 1.7. "
|
"Please use ``y_proba`` instead." |
|
), |
|
FutureWarning, |
|
) |
|
y_proba = y_prob |
|
|
|
y_true = column_or_1d(y_true) |
|
y_proba = column_or_1d(y_proba) |
|
assert_all_finite(y_true) |
|
assert_all_finite(y_proba) |
|
check_consistent_length(y_true, y_proba, sample_weight) |
|
|
|
y_type = type_of_target(y_true, input_name="y_true") |
|
if y_type != "binary": |
|
raise ValueError( |
|
"Only binary classification is supported. The type of the target " |
|
f"is {y_type}." |
|
) |
|
|
|
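    # Predicted probabilities must lie within [0, 1].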
if y_proba.max() > 1: |
|
raise ValueError("y_proba contains values greater than 1.") |
|
if y_proba.min() < 0: |
|
raise ValueError("y_proba contains values less than 0.") |
|
|
|
try: |
|
pos_label = _check_pos_label_consistency(pos_label, y_true) |
|
except ValueError: |
|
classes = np.unique(y_true) |
|
if classes.dtype.kind not in ("O", "U", "S"): |
|
|
|
|
|
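            # Classes are numeric (or boolean) rather than strings: for backward
            # compatibility, fall back to the greater label as the positive class.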
pos_label = classes[-1] |
|
else: |
|
raise |
|
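    # Encode y_true as 1 for the positive class and 0 otherwise, then compute the
    # (weighted) mean squared difference with the predicted probabilities.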
y_true = np.array(y_true == pos_label, int) |
|
return np.average((y_true - y_proba) ** 2, weights=sample_weight) |
|
|
|
|
|
@validate_params( |
|
{ |
|
"y_true": ["array-like"], |
|
"y_pred": ["array-like"], |
|
"sample_weight": ["array-like", None], |
|
"labels": ["array-like", None], |
|
}, |
|
prefer_skip_nested_validation=True, |
|
) |
|
def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None): |
|
""" |
|
:math:`D^2` score function, fraction of log loss explained. |
|
|
|
Best possible score is 1.0 and it can be negative (because the model can be |
|
arbitrarily worse). A model that always predicts the per-class proportions |
|
of `y_true`, disregarding the input features, gets a D^2 score of 0.0. |
|
|
|
Read more in the :ref:`User Guide <d2_score_classification>`. |
|
|
|
.. versionadded:: 1.5 |
|
|
|
Parameters |
|
---------- |
|
y_true : array-like or label indicator matrix |
|
        The actual labels for the n_samples samples.
|
|
|
y_pred : array-like of shape (n_samples, n_classes) or (n_samples,) |
|
Predicted probabilities, as returned by a classifier's |
|
predict_proba method. If ``y_pred.shape = (n_samples,)`` |
|
the probabilities provided are assumed to be that of the |
|
positive class. The labels in ``y_pred`` are assumed to be |
|
ordered alphabetically, as done by |
|
:class:`~sklearn.preprocessing.LabelBinarizer`. |
|
|
|
sample_weight : array-like of shape (n_samples,), default=None |
|
Sample weights. |
|
|
|
labels : array-like, default=None |
|
If not provided, labels will be inferred from y_true. If ``labels`` |
|
is ``None`` and ``y_pred`` has shape (n_samples,) the labels are |
|
assumed to be binary and are inferred from ``y_true``. |
|
|
|
Returns |
|
------- |
|
    d2 : float
|
The D^2 score. |
|
|
|
Notes |
|
----- |
|
This is not a symmetric function. |
|
|
|
    Like R^2, the D^2 score may be negative (it need not actually be the square of
|
a quantity D). |
|
|
|
This metric is not well-defined for a single sample and will return a NaN |
|
value if n_samples is less than two. |
|
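    Examples
    --------
    A minimal illustrative example; the results are wrapped in ``float`` so the
    printed values do not depend on the exact scalar type returned, and the
    first output is truncated:

    >>> import numpy as np
    >>> from sklearn.metrics import d2_log_loss_score
    >>> y_true = np.array([0, 1, 1, 0])
    >>> y_pred = np.array([0.1, 0.9, 0.8, 0.3])
    >>> float(d2_log_loss_score(y_true, y_pred))
    0.714...
    >>> # A model that always predicts the observed class proportions
    >>> # (here 0.5 for each class) scores exactly 0.0.
    >>> float(d2_log_loss_score(y_true, np.full_like(y_pred, 0.5)))
    0.0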
""" |
|
y_pred = check_array(y_pred, ensure_2d=False, dtype="numeric") |
|
check_consistent_length(y_pred, y_true, sample_weight) |
|
if _num_samples(y_pred) < 2: |
|
        msg = "D^2 score is not well-defined with fewer than two samples."
|
warnings.warn(msg, UndefinedMetricWarning) |
|
return float("nan") |
|
|
|
|
|
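    # Log loss of the fitted model, summed over samples rather than averaged
    # (hence ``normalize=False``) so that it is on the same unnormalized scale
    # as the null model below.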
numerator = log_loss( |
|
y_true=y_true, |
|
y_pred=y_pred, |
|
normalize=False, |
|
sample_weight=sample_weight, |
|
labels=labels, |
|
) |
|
|
|
|
|
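    # Null model: for every sample, predict the (weighted) class proportions
    # observed in y_true.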
weights = _check_sample_weight(sample_weight, y_true) |
|
|
|
_, y_value_indices = np.unique(y_true, return_inverse=True) |
|
counts = np.bincount(y_value_indices, weights=weights) |
|
y_prob = counts / weights.sum() |
|
y_pred_null = np.tile(y_prob, (len(y_true), 1)) |
|
|
|
|
|
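    # Log loss of the null model, on the same unnormalized scale as the numerator.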
denominator = log_loss( |
|
y_true=y_true, |
|
y_pred=y_pred_null, |
|
normalize=False, |
|
sample_weight=sample_weight, |
|
labels=labels, |
|
) |
|
|
|
return 1 - (numerator / denominator) |
|
|