|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Mean average precision metric""" |
|
|
|
import evaluate |
|
import datasets |
|
import json |
|
from ranx import Qrels, Run |
|
from ranx import evaluate as ran_evaluate |
|
|
|
|
|
# BibTeX citation for the ranx library used by this metric.
# The backslash in the DOI is written as "\\_" so the string still contains a
# literal backslash followed by an underscore without relying on the invalid
# escape sequence "\_".
_CITATION = """\
@inproceedings{ranx,
author = {Elias Bassani},
title = {ranx: {A} Blazing-Fast Python Library for Ranking Evaluation and Comparison},
booktitle = {{ECIR} {(2)}},
series = {Lecture Notes in Computer Science},
volume = {13186},
pages = {259--264},
publisher = {Springer},
year = {2022},
doi = {10.1007/978-3-030-99739-7\\_30}
}
"""
|
|
|
# Summary of the metric; used as the `description` of the metric info and
# prepended to the metric class docstring.
_DESCRIPTION = """\
This is the mean average precision (map) metric for retrieval systems.
It is the average of the precision scores computed after each relevant document is retrieved. You can refer to [here](https://amenra.github.io/ranx/metrics/#mean-average-precision)
"""
|
|
|
|
|
# Usage documentation for `compute`. Both inputs are declared as string
# features and decoded with `json.loads`, so they are described as JSON strings.
_KWARGS_DESCRIPTION = """
Args:
    predictions: list of JSON strings. Each string decodes to a dictionary that maps a query id
        to a dictionary of document ids and the relevance scores produced by the model for that query.
        The decoded dictionaries are merged, so every query id should appear only once.
    references: list of JSON strings. Each string decodes to a dictionary that maps a query id
        to a dictionary of relevant document ids and their integer relevance judgments.
        The decoded dictionaries are merged, so every query id should appear only once.
Returns:
    map (`float`): mean average precision score. Minimum possible value is 0. Maximum possible value is 1.0
Examples:

    >>> import json
    >>> my_new_module = evaluate.load("map")
    >>> references = [
    ...     json.dumps({"q_1": {"d_1": 1, "d_2": 1}}),
    ...     json.dumps({"q_2": {"d_2": 1, "d_3": 1, "d_5": 1}}),
    ... ]
    >>> predictions = [
    ...     json.dumps({"q_1": {"d_1": 0.9, "d_2": 0.8}}),
    ...     json.dumps({"q_2": {"d_2": 0.9, "d_1": 0.8, "d_5": 0.7, "d_3": 0.3}}),
    ... ]
    >>> results = my_new_module.compute(references=references, predictions=predictions)
    >>> print(round(results["map"], 4))
    0.9028
"""
|
|
|
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class map(evaluate.Metric):
    """Mean average precision (MAP) metric computed with the ranx library.

    NOTE: the class name shadows the builtin ``map``; it is kept unchanged
    because it is the public name of this metric module.
    """

    def _info(self):
        """Return the metric metadata: documentation strings and input schema."""
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Both inputs are plain strings; `_compute` decodes them as JSON.
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            reference_urls=["https://amenra.github.io/ranx/"],
        )

    def _compute(self, predictions, references):
        """Compute the mean average precision over all queries.

        Args:
            predictions: iterable of JSON strings; each one is decoded with
                ``json.loads`` and merged into a single dictionary that is
                handed to ``ranx.Run``.
            references: iterable of JSON strings; each one is decoded with
                ``json.loads`` and merged into a single dictionary that is
                handed to ``ranx.Qrels``.

        Returns:
            A dictionary with the single key ``"map"`` holding the score
            returned by ``ranx.evaluate``.

        Raises:
            json.JSONDecodeError: if an element is not valid JSON.
        """
        # Merge in place: later entries override earlier ones on key clashes,
        # exactly as the `|` operator did, but without re-copying the
        # accumulated dictionary on every iteration.
        preds = {}
        for pred in predictions:
            preds.update(json.loads(pred))

        refs = {}
        for ref in references:
            refs.update(json.loads(ref))

        run = Run(preds)
        qrels = Qrels(refs)
        map_score = ran_evaluate(qrels, run, "map")
        return {
            "map": map_score,
        }