|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Mean average precision metric""" |
|
|
|
import evaluate |
|
import datasets |
|
import json |
|
from ranx import Qrels, Run |
|
from ranx import evaluate as ran_evaluate |
|
|
|
|
|
# BibTeX entry for ranx, the library this metric delegates the computation to.
# The backslash in the DOI is written as `\\` so that the literal does not
# contain the invalid escape sequence `\_`; the resulting text is unchanged.
_CITATION = """\
@inproceedings{ranx,
  author = {Elias Bassani},
  title = {ranx: {A} Blazing-Fast Python Library for Ranking Evaluation and Comparison},
  booktitle = {{ECIR} {(2)}},
  series = {Lecture Notes in Computer Science},
  volume = {13186},
  pages = {259--264},
  publisher = {Springer},
  year = {2022},
  doi = {10.1007/978-3-030-99739-7\\_30}
}
"""
|
|
|
# Short human-readable summary of the metric, passed to `MetricInfo` and
# prepended to the metric class docstring.
_DESCRIPTION = """\
This is the mean average precision (map) metric for retrieval systems.
It is the average of the precision scores computed after each relevant document is retrieved. For details, refer to the [ranx documentation](https://amenra.github.io/ranx/metrics/#mean-average-precision).
"""
|
|
|
|
|
# Usage documentation for `compute`. Inputs are JSON *strings* (see the
# `features` declared in `_info` and the `json.loads` calls in `_compute`),
# and the result dictionary is keyed by "map".
_KWARGS_DESCRIPTION = """
Args:
    predictions: list of JSON strings, one per query. Each string encodes a
        dictionary of the form `{query_id: {doc_id: score}}` holding the
        document relevancy scores produced by the model for that query.
    references: list of JSON strings, one per query. Each string encodes a
        dictionary of the form `{query_id: {doc_id: relevance}}` holding the
        relevance labels of the relevant documents for that query.
    k: `int`, optional, defaults to None. When given, map@k is computed
        instead of map.
Returns:
    map (`float`): mean average precision score. Minimum possible value is 0. Maximum possible value is 1.0
Examples:

    >>> my_new_module = evaluate.load("map")
    >>> references = [json.dumps({"q_1": {"d_1": 1, "d_2": 2}}),
    ...               json.dumps({"q_2": {"d_2": 1, "d_3": 2, "d_5": 3}})]
    >>> predictions = [json.dumps({"q_1": {"d_1": 0.8, "d_2": 0.9}}),
    ...                json.dumps({"q_2": {"d_2": 0.9, "d_1": 0.8, "d_5": 0.7, "d_3": 0.3}})]
    >>> results = my_new_module.compute(references=references, predictions=predictions)
    >>> print(round(results["map"], 2))
    0.9
"""
|
|
|
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class map(evaluate.Metric):  # noqa: A001 - shadows the builtin; name kept so the public interface is unchanged
    """Mean average precision (map / map@k) metric backed by ranx."""

    def _info(self):
        """Return the `MetricInfo` describing this metric and its input schema."""
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Both inputs are plain strings; `_compute` decodes them as JSON.
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            reference_urls=["https://amenra.github.io/ranx/"],
        )

    @staticmethod
    def _merge_json_objects(payloads, argument):
        """Decode each JSON string in *payloads* and merge the objects into one dict.

        Args:
            payloads: iterable of JSON strings, each expected to encode an object.
            argument: name of the originating argument, used in error messages.

        Returns:
            A single dictionary containing the keys of every decoded object.
            When the same key occurs more than once, the last occurrence wins.

        Raises:
            TypeError: if a payload decodes to something other than a JSON object.
            json.JSONDecodeError: if a payload is not valid JSON.
        """
        merged = {}
        for payload in payloads:
            decoded = json.loads(payload)
            if not isinstance(decoded, dict):
                raise TypeError(
                    f"Each item of `{argument}` must be a JSON object, "
                    f"got {type(decoded).__name__}."
                )
            # Update in place instead of rebuilding the dict with `|` on every
            # iteration, which copies all previously merged entries each time.
            merged.update(decoded)
        return merged

    def _compute(self, predictions, references, k=None):
        """Compute mean average precision over all queries.

        Args:
            predictions: iterable of JSON strings; each is decoded and merged
                into the dictionary handed to `ranx.Run`.
            references: iterable of JSON strings; each is decoded and merged
                into the dictionary handed to `ranx.Qrels`.
            k: optional cutoff. When None the "map" metric is requested from
                ranx, otherwise "map@{k}".

        Returns:
            A dictionary with the single key "map" holding the score returned
            by ranx.
        """
        run = Run(self._merge_json_objects(predictions, "predictions"))
        qrels = Qrels(self._merge_json_objects(references, "references"))

        metric = "map" if k is None else f"map@{k}"
        map_score = ran_evaluate(qrels, run, metric)
        return {
            "map": map_score,
        }