"""Ranking metrics averaged over a set of queries: precision at k, mean
average precision (MAP) and normalized discounted cumulative gain (NDCG)
at k."""

import warnings

import numpy as np

__all__ = [
    'mean_average_precision',
    'ndcg_at',
    'precision_at',
]


def _require_positive_k(k):
    """Helper function to avoid copy/pasted code for validating K"""
    if k <= 0:
        raise ValueError("ranking position k should be positive")


def _mean_ranking_metric(predictions, labels, metric):
    """Apply `metric` to each (prediction, label) pair and return the mean
    score over all queries. Shared helper for precision_at,
    mean_average_precision and ndcg_at.
    """
    return np.mean([
        metric(np.asarray(prd), np.asarray(labels[i]))
        for i, prd in enumerate(predictions)
    ])


def _warn_for_empty_labels():
    """Helper for missing ground truth sets"""
    warnings.warn("Empty ground truth set! Check input data")
    return 0.


def precision_at(predictions, labels, k=10, assume_unique=True):
    """Compute the precision at K.

    Compute the average precision of all the queries, truncated at ranking
    position k. If the ranking algorithm returns only n results for a query
    (with n < k), the precision is still computed as
    #(relevant items retrieved) / k. The same formula applies when the size
    of the ground truth set is less than k.

    If a query has an empty ground truth set, zero is used as its precision
    and a warning is issued.

    Parameters
    ----------
    predictions : array-like, shape=(n_queries,)
        The prediction arrays, one per query: the items that were predicted,
        in descending order of relevance.

    labels : array-like, shape=(n_queries,)
        The label arrays, one per query (the positively-rated items).

    k : int, optional (default=10)
        The rank at which to measure the precision.

    assume_unique : bool, optional (default=True)
        Whether to assume the items in the labels and predictions are each
        unique. That is, the same item is not predicted multiple times or
        rated multiple times.

    Examples
    --------
    >>> # predictions for 3 users
    >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5],
    ...          [4, 1, 5, 6, 2, 7, 3, 8, 9, 10],
    ...          [1, 2, 3, 4, 5]]
    >>> # labels for the 3 users
    >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []]
    >>> precision_at(preds, labels, 1)
    0.33333333333333331
    >>> precision_at(preds, labels, 5)
    0.26666666666666666
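
    Worked example: at k=5 the first user's top five predictions
    [1, 6, 2, 7, 8] contain two relevant items (1 and 2), giving
    2 / 5 = 0.4; the second user also scores 0.4, and the third user's
    empty label set contributes 0, so the mean is 0.8 / 3 = 0.2667, as
    shown above.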
    >>> precision_at(preds, labels, 15)
    0.17777777777777778
    """
    _require_positive_k(k)

    def _inner_pk(pred, lab):
        if lab.shape[0] > 0:
            n = min(pred.shape[0], k)
            cnt = np.in1d(pred[:n], lab, assume_unique=assume_unique).sum()
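            # Divide by k (not by n): queries that return fewer than k
            # predictions are penalized, per the docstring.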
            return float(cnt) / k
        else:
            return _warn_for_empty_labels()

    return _mean_ranking_metric(predictions, labels, _inner_pk)


def mean_average_precision(predictions, labels, assume_unique=True):
    """Compute the mean average precision on predictions and labels.

    Returns the mean average precision (MAP) of all the queries. If a query
    has an empty ground truth set, its average precision is zero and a
    warning is issued.

    Parameters
    ----------
    predictions : array-like, shape=(n_queries,)
        The prediction arrays, one per query: the items that were predicted,
        in descending order of relevance.

    labels : array-like, shape=(n_queries,)
        The label arrays, one per query (the positively-rated items).

    assume_unique : bool, optional (default=True)
        Whether to assume the items in the labels and predictions are each
        unique. That is, the same item is not predicted multiple times or
        rated multiple times.

    Examples
    --------
    >>> # predictions for 3 users
    >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5],
    ...          [4, 1, 5, 6, 2, 7, 3, 8, 9, 10],
    ...          [1, 2, 3, 4, 5]]
    >>> # labels for the 3 users
    >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []]
    >>> mean_average_precision(preds, labels)
    0.35502645502645497
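
    Worked example: the first user's relevant items appear at ranks 1, 3, 6,
    9 and 10, so that user's average precision is
    (1/1 + 2/3 + 3/6 + 4/9 + 5/10) / 5, about 0.622; the second user's is
    (1/2 + 2/5 + 3/7) / 3, about 0.443; the third user contributes 0, giving
    a mean of about 0.355.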
    """
    def _inner_map(pred, lab):
        if lab.shape[0]:
            n = pred.shape[0]
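            # Vectorized average precision: for each rank i (1-based) at
            # which a predicted item appears in the label set, accumulate
            # (number of hits so far) / i, then normalize by the number of
            # ground truth labels.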
            arange = np.arange(n, dtype=np.float32) + 1.
            present = np.in1d(pred[:n], lab, assume_unique=assume_unique)
            prec_sum = np.ones(present.sum()).cumsum()
            denom = arange[present]
            return (prec_sum / denom).sum() / lab.shape[0]
        else:
            return _warn_for_empty_labels()

    return _mean_ranking_metric(predictions, labels, _inner_map)


def ndcg_at(predictions, labels, k=10, assume_unique=True):
    """Compute the normalized discounted cumulative gain at K.

    Compute the average NDCG value of all the queries, truncated at ranking
    position k. The discounted cumulative gain at position k is computed as::

        DCG@k = sum_{i=1}^{k} (2^{rel_i} - 1) / log2(i + 1)

    where rel_i is the relevance of the item at rank i, and the NDCG is
    obtained by dividing the DCG by the ideal (maximum attainable) DCG on
    the ground truth set. In the current implementation, the relevance
    value is binary.

    If a query has an empty ground truth set, zero is used as its NDCG and a
    warning is issued.

    Parameters
    ----------
    predictions : array-like, shape=(n_queries,)
        The prediction arrays, one per query: the items that were predicted,
        in descending order of relevance.

    labels : array-like, shape=(n_queries,)
        The label arrays, one per query (the positively-rated items).

    k : int, optional (default=10)
        The rank at which to measure the NDCG.

    assume_unique : bool, optional (default=True)
        Whether to assume the items in the labels and predictions are each
        unique. That is, the same item is not predicted multiple times or
        rated multiple times.

    Examples
    --------
    >>> # predictions for 3 users
    >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5],
    ...          [4, 1, 5, 6, 2, 7, 3, 8, 9, 10],
    ...          [1, 2, 3, 4, 5]]
    >>> # labels for the 3 users
    >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []]
    >>> ndcg_at(preds, labels, 3)
    0.3333333432674408
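
    Worked example: at k=3 the first user's DCG is 1 + 1/2 = 1.5 (hits at
    ranks 1 and 3) against an ideal DCG of 1 + 1/log2(3) + 1/2, for roughly
    0.704; the second user's NDCG is (1/log2(3)) divided by the same ideal,
    roughly 0.296; the third user contributes 0, so the mean is about 0.333
    (the extra trailing digits above come from float32 accumulation).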
    >>> ndcg_at(preds, labels, 10)
    0.48791273434956867

    References
    ----------
    .. [1] K. Jarvelin and J. Kekalainen, "IR evaluation methods for
           retrieving highly relevant documents."
    """
    _require_positive_k(k)

    def _inner_ndcg(pred, lab):
        if lab.shape[0]:
            if not assume_unique:
                lab = np.unique(lab)

            n_lab = lab.shape[0]
            n_pred = pred.shape[0]
            n = min(max(n_pred, n_lab), k)
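            # Consider at most k ranking positions, but never more than the
            # longer of the prediction list and the label set.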
            arange = np.arange(n, dtype=np.float32)
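            # With binary relevance, a hit at 0-based position i contributes
            # a gain of 1 / log2(i + 2); keep only positions that can
            # actually hold a prediction.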
            arange = arange[:n_pred]
            denom = np.log2(arange + 2.)
            gains = 1. / denom
            dcg_mask = np.in1d(pred[:n], lab, assume_unique=assume_unique)
            dcg = gains[dcg_mask].sum()
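            # Ideal DCG: the best possible ranking puts a relevant item at
            # each of the first n_lab positions.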
            max_dcg = gains[arange < n_lab].sum()
            return dcg / max_dcg
        else:
            return _warn_for_empty_labels()

    return _mean_ranking_metric(predictions, labels, _inner_ndcg)
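

if __name__ == '__main__':
    # Minimal smoke test reusing the toy data from the doctests above; the
    # third user's empty label set triggers the empty-ground-truth warning.
    preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5],
             [4, 1, 5, 6, 2, 7, 3, 8, 9, 10],
             [1, 2, 3, 4, 5]]
    labels = [[1, 2, 3, 4, 5], [1, 2, 3], []]

    print(precision_at(preds, labels, k=5))       # ~0.267
    print(mean_average_precision(preds, labels))  # ~0.355
    print(ndcg_at(preds, labels, k=10))           # ~0.488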