"""Ranking metrics averaged over a set of queries: precision at k, mean
average precision (MAP) and normalized discounted cumulative gain (NDCG)
at k."""

import warnings

import numpy as np

__all__ = [
    'mean_average_precision',
    'ndcg_at',
    'precision_at',
]


def _require_positive_k(k):
    """Helper function to avoid copy/pasted code for validating K"""
    if k <= 0:
        raise ValueError("ranking position k should be positive")


def _mean_ranking_metric(predictions, labels, metric):
    """Apply `metric` to each (prediction, label) pair and return the mean
    score over all queries. Shared helper for precision_at,
    mean_average_precision and ndcg_at.
    """
    return np.mean([
        metric(np.asarray(prd), np.asarray(labels[i]))
        for i, prd in enumerate(predictions)
    ])


def _warn_for_empty_labels():
    """Helper for missing ground truth sets"""
    warnings.warn("Empty ground truth set! Check input data")
    return 0.


def precision_at(predictions, labels, k=10, assume_unique=True):
    """Compute the precision at K.

    Compute the average precision of all the queries, truncated at ranking
    position k. If the ranking algorithm returns only n results for a query
    (with n < k), the precision is still computed as
    #(relevant items retrieved) / k. The same formula applies when the size
    of the ground truth set is less than k.

    If a query has an empty ground truth set, zero is used as its precision
    and a warning is issued.

    Parameters
    ----------
    predictions : array-like, shape=(n_queries,)
        The prediction arrays, one per query: the items that were predicted,
        in descending order of relevance.

    labels : array-like, shape=(n_queries,)
        The label arrays, one per query (the positively-rated items).

    k : int, optional (default=10)
        The rank at which to measure the precision.

    assume_unique : bool, optional (default=True)
        Whether to assume the items in the labels and predictions are each
        unique. That is, the same item is not predicted multiple times or
        rated multiple times.

    Examples
    --------
    >>> # predictions for 3 users
    >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5],
    ...          [4, 1, 5, 6, 2, 7, 3, 8, 9, 10],
    ...          [1, 2, 3, 4, 5]]
    >>> # labels for the 3 users
    >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []]
    >>> precision_at(preds, labels, 1)
    0.33333333333333331
    >>> precision_at(preds, labels, 5)
    0.26666666666666666
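
    Worked example: at k=5 the first user's top five predictions
    [1, 6, 2, 7, 8] contain two relevant items (1 and 2), giving
    2 / 5 = 0.4; the second user also scores 0.4, and the third user's
    empty label set contributes 0, so the mean is 0.8 / 3 = 0.2667, as
    shown above.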
    >>> precision_at(preds, labels, 15)
    0.17777777777777778
    """
    _require_positive_k(k)

    def _inner_pk(pred, lab):
        if lab.shape[0] > 0:
            n = min(pred.shape[0], k)
            cnt = np.in1d(pred[:n], lab, assume_unique=assume_unique).sum()
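            # Divide by k (not by n): queries that return fewer than k
            # predictions are penalized, per the docstring.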
            return float(cnt) / k
        else:
            return _warn_for_empty_labels()

    return _mean_ranking_metric(predictions, labels, _inner_pk)


def mean_average_precision(predictions, labels, assume_unique=True):
    """Compute the mean average precision on predictions and labels.

    Returns the mean average precision (MAP) of all the queries. If a query
    has an empty ground truth set, its average precision is zero and a
    warning is issued.

    Parameters
    ----------
    predictions : array-like, shape=(n_queries,)
        The prediction arrays, one per query: the items that were predicted,
        in descending order of relevance.

    labels : array-like, shape=(n_queries,)
        The label arrays, one per query (the positively-rated items).

    assume_unique : bool, optional (default=True)
        Whether to assume the items in the labels and predictions are each
        unique. That is, the same item is not predicted multiple times or
        rated multiple times.

    Examples
    --------
    >>> # predictions for 3 users
    >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5],
    ...          [4, 1, 5, 6, 2, 7, 3, 8, 9, 10],
    ...          [1, 2, 3, 4, 5]]
    >>> # labels for the 3 users
    >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []]
    >>> mean_average_precision(preds, labels)
    0.35502645502645497
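
    Worked example: the first user's relevant items appear at ranks 1, 3, 6,
    9 and 10, so that user's average precision is
    (1/1 + 2/3 + 3/6 + 4/9 + 5/10) / 5, about 0.622; the second user's is
    (1/2 + 2/5 + 3/7) / 3, about 0.443; the third user contributes 0, giving
    a mean of about 0.355.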
    """
    def _inner_map(pred, lab):
        if lab.shape[0]:
            n = pred.shape[0]
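            # Vectorized average precision: for each rank i (1-based) at
            # which a predicted item appears in the label set, accumulate
            # (number of hits so far) / i, then normalize by the number of
            # ground truth labels.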
            arange = np.arange(n, dtype=np.float32) + 1.
            present = np.in1d(pred[:n], lab, assume_unique=assume_unique)
            prec_sum = np.ones(present.sum()).cumsum()
            denom = arange[present]
            return (prec_sum / denom).sum() / lab.shape[0]
        else:
            return _warn_for_empty_labels()

    return _mean_ranking_metric(predictions, labels, _inner_map)


def ndcg_at(predictions, labels, k=10, assume_unique=True):
    """Compute the normalized discounted cumulative gain at K.

    Compute the average NDCG value of all the queries, truncated at ranking
    position k. The discounted cumulative gain at position k is computed as::

        DCG@k = sum_{i=1}^{k} (2^{rel_i} - 1) / log2(i + 1)

    where rel_i is the relevance of the item at rank i, and the NDCG is
    obtained by dividing the DCG by the ideal (maximum attainable) DCG on
    the ground truth set. In the current implementation, the relevance
    value is binary.

    If a query has an empty ground truth set, zero is used as its NDCG and a
    warning is issued.

    Parameters
    ----------
    predictions : array-like, shape=(n_queries,)
        The prediction arrays, one per query: the items that were predicted,
        in descending order of relevance.

    labels : array-like, shape=(n_queries,)
        The label arrays, one per query (the positively-rated items).

    k : int, optional (default=10)
        The rank at which to measure the NDCG.

    assume_unique : bool, optional (default=True)
        Whether to assume the items in the labels and predictions are each
        unique. That is, the same item is not predicted multiple times or
        rated multiple times.

    Examples
    --------
    >>> # predictions for 3 users
    >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5],
    ...          [4, 1, 5, 6, 2, 7, 3, 8, 9, 10],
    ...          [1, 2, 3, 4, 5]]
    >>> # labels for the 3 users
    >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []]
    >>> ndcg_at(preds, labels, 3)
    0.3333333432674408
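
    Worked example: at k=3 the first user's DCG is 1 + 1/2 = 1.5 (hits at
    ranks 1 and 3) against an ideal DCG of 1 + 1/log2(3) + 1/2, for roughly
    0.704; the second user's NDCG is (1/log2(3)) divided by the same ideal,
    roughly 0.296; the third user contributes 0, so the mean is about 0.333
    (the extra trailing digits above come from float32 accumulation).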
    >>> ndcg_at(preds, labels, 10)
    0.48791273434956867

    References
    ----------
    .. [1] K. Jarvelin and J. Kekalainen, "IR evaluation methods for
           retrieving highly relevant documents."
    """
    _require_positive_k(k)

    def _inner_ndcg(pred, lab):
        if lab.shape[0]:
            if not assume_unique:
                lab = np.unique(lab)

            n_lab = lab.shape[0]
            n_pred = pred.shape[0]
            n = min(max(n_pred, n_lab), k)
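            # Consider at most k ranking positions, but never more than the
            # longer of the prediction list and the label set.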
            arange = np.arange(n, dtype=np.float32)
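            # With binary relevance, a hit at 0-based position i contributes
            # a gain of 1 / log2(i + 2); keep only positions that can
            # actually hold a prediction.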
            arange = arange[:n_pred]
            denom = np.log2(arange + 2.)
            gains = 1. / denom
            dcg_mask = np.in1d(pred[:n], lab, assume_unique=assume_unique)
            dcg = gains[dcg_mask].sum()
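            # Ideal DCG: the best possible ranking puts a relevant item at
            # each of the first n_lab positions.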
            max_dcg = gains[arange < n_lab].sum()
            return dcg / max_dcg
        else:
            return _warn_for_empty_labels()

    return _mean_ranking_metric(predictions, labels, _inner_ndcg)
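

if __name__ == '__main__':
    # Minimal smoke test reusing the toy data from the doctests above; the
    # third user's empty label set triggers the empty-ground-truth warning.
    preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5],
             [4, 1, 5, 6, 2, 7, 3, 8, 9, 10],
             [1, 2, 3, 4, 5]]
    labels = [[1, 2, 3, 4, 5], [1, 2, 3], []]

    print(precision_at(preds, labels, k=5))       # ~0.267
    print(mean_average_precision(preds, labels))  # ~0.355
    print(ndcg_at(preds, labels, k=10))           # ~0.488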