# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TODO: Add a description here.""" | |

from typing import List, Literal, Optional, Tuple

import datasets
import evaluate
import numpy as np

from seametrics.detection import PrecisionRecallF1Support
from seametrics.detection.utils import payload_to_det_metric
from seametrics.payload import Payload

_CITATION = """\
@InProceedings{coco:2014,
  title     = {Microsoft {COCO:} Common Objects in Context},
  author    = {Tsung{-}Yi Lin and
               Michael Maire and
               Serge J. Belongie and
               James Hays and
               Pietro Perona and
               Deva Ramanan and
               Piotr Dollar and
               C. Lawrence Zitnick},
  booktitle = {Computer Vision - {ECCV} 2014 - 13th European Conference, Zurich,
               Switzerland, September 6-12, 2014, Proceedings, Part {V}},
  series    = {Lecture Notes in Computer Science},
  volume    = {8693},
  pages     = {740--755},
  publisher = {Springer},
  year      = {2014}
}
"""

_DESCRIPTION = """\
This evaluation metric is designed to provide object detection metrics at
different object size levels. It is based on a modified version of the commonly
used COCO evaluation metrics.
"""

_KWARGS_DESCRIPTION = """
Calculates object detection metrics given predicted and ground truth bounding boxes
for a sequence of images.
Args:
    predictions: list of predictions for each image. Each prediction should
        be a dict containing the following
        - 'boxes': list of bounding boxes, xywh in absolute pixel values
        - 'labels': list of labels for each bounding box
        - 'scores': list of scores for each bounding box
    references: list of ground truth annotations for each image. Each reference should
        be a dict containing the following
        - 'boxes': list of bounding boxes, xywh in absolute pixel values
        - 'labels': list of labels for each bounding box
        - 'area': list of areas for each bounding box
Returns:
    dict containing dicts for each specified area range with following items:
        'range': specified area range as [min_px_area, max_px_area]
        'iouThr': min. IOU-threshold of a prediction with a ground truth box
            to be considered a correct prediction
        'maxDets': maximum number of detections
        'tp': number of true positive (correct) predictions
        'fp': number of false positive (incorrect) predictions
        'fn': number of false negative (missed) predictions
        'duplicates': number of duplicate predictions
        'precision': best possible score = 1, worst possible score = 0
            large if few false positive predictions
            formula: tp/(tp+fp)
        'recall': best possible score = 1, worst possible score = 0
            large if few missed predictions
            formula: tp/(tp+fn)
        'f1': best possible score = 1, worst possible score = 0
            trades off precision and recall
            formula: 2*(precision*recall)/(precision+recall)
        'support': number of ground truth bounding boxes considered in the evaluation
        'fpi': number of images with no ground truth but false positive predictions
        'nImgs': number of images considered in evaluation
Examples:
    >>> import evaluate
    >>> from seametrics.payload.processor import PayloadProcessor
    >>> payload = PayloadProcessor(...).payload
    >>> module = evaluate.load("SEA-AI/det-metrics", ...)
    >>> module.add_batch(payload)
    >>> result = module.compute()
    >>> print(result)
    {'all': {
        'range': [0, 10000000000.0],
        'iouThr': '0.00',
        'maxDets': 100,
        'tp': 1,
        'fp': 3,
        'fn': 1,
        'duplicates': 0,
        'precision': 0.25,
        'recall': 0.5,
        'f1': 0.3333333333333333,
        'support': 2,
        'fpi': 0,
        'nImgs': 2
        }
    }
"""


class DetectionMetric(evaluate.Metric):
    def __init__(
        self,
        area_ranges_tuples: List[Tuple[str, List[int]]] = [("all", [0, 1e5**2])],
        iou_threshold: List[float] = [1e-10],
        class_agnostic: bool = True,
        bbox_format: str = "xywh",
        iou_type: Literal["bbox", "segm"] = "bbox",
        payload: Optional[Payload] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # save parameters for later
        self.payload = payload
        self.model_names = payload.models if payload else ["custom"]
        self.iou_thresholds = (
            iou_threshold if isinstance(iou_threshold, list) else [iou_threshold]
        )
        self.area_ranges = [v for _, v in area_ranges_tuples]
        self.area_ranges_labels = [k for k, _ in area_ranges_tuples]
        self.class_agnostic = class_agnostic
        self.iou_type = iou_type
        self.box_format = bbox_format

        # initialize coco_metrics
        self.coco_metric = PrecisionRecallF1Support(
            iou_thresholds=self.iou_thresholds,
            area_ranges=self.area_ranges,
            area_ranges_labels=self.area_ranges_labels,
            class_agnostic=self.class_agnostic,
            iou_type=self.iou_type,
            box_format=self.box_format,
        )

        # initialize evaluation metric
        self._init_evaluation_metric()

    def _info(self):
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": [
                        datasets.Features(
                            {
                                "boxes": datasets.Sequence(
                                    datasets.Sequence(datasets.Value("float"))
                                ),
                                "labels": datasets.Sequence(datasets.Value("int64")),
                                "scores": datasets.Sequence(datasets.Value("float")),
                            }
                        )
                    ],
                    "references": [
                        datasets.Features(
                            {
                                "boxes": datasets.Sequence(
                                    datasets.Sequence(datasets.Value("float"))
                                ),
                                "labels": datasets.Sequence(datasets.Value("int64")),
                                "area": datasets.Sequence(datasets.Value("float")),
                            }
                        )
                    ],
                }
            ),
            # Additional links to the codebase or references
            codebase_urls=[
                "https://github.com/SEA-AI/seametrics/tree/main",
                "https://lightning.ai/docs/torchmetrics/stable/detection/mean_average_precision.html",
            ],
        )

    def add(self, *, prediction, reference, **kwargs):
        """Adds a batch of predictions and references to the metric"""
        # in case the inputs are lists, convert them to numpy arrays
        prediction = self._preprocess(prediction)
        reference = self._preprocess(reference)
        self.coco_metric.update(prediction, reference)

    def _init_evaluation_metric(self, **kwargs):
        """
        Initializes the evaluation metric by generating sample data, preprocessing predictions and references,
        and then adding the processed data to the metric using the super class method with additional keyword arguments.

        Parameters:
            **kwargs: Additional keyword arguments for the super class method.

        Returns:
            None
        """
        predictions, references = self._generate_sample_data()
        predictions = self._preprocess(predictions)
        references = self._preprocess(references)

        # does not impact the metric, but is required for the interface x_x
        super(evaluate.Metric, self).add(
            prediction=self._postprocess(predictions),
            references=self._postprocess(references),
            **kwargs,
        )

    def add_batch(self, payload: Payload, model_name: Optional[str] = None):
        """Takes as input a payload and adds the batch to the metric"""
        self._add_payload(payload, model_name)

    def _compute(self, *, predictions, references, **kwargs):
        """Called within the evaluate.Metric.compute() method"""
        results = {}
        for model_name in self.model_names:
            print(f"\n##### {model_name} #####")
            # add payload if available (otherwise predictions and references
            # must be added with the add function)
            if self.payload:
                self._add_payload(self.payload, model_name)

            results[model_name] = self.coco_metric.compute()

            # reset coco_metrics for next model
            self.coco_metric = PrecisionRecallF1Support(
                iou_thresholds=self.iou_thresholds,
                area_ranges=self.area_ranges,
                area_ranges_labels=self.area_ranges_labels,
                class_agnostic=self.class_agnostic,
                iou_type=self.iou_type,
                box_format=self.box_format,
            )
        return results

    def _add_payload(self, payload: Payload, model_name: Optional[str] = None):
        """Converts the payload to the format expected by the metric"""
        predictions, references = payload_to_det_metric(payload, model_name)
        self.add(prediction=predictions, reference=references)
        return self

    def _preprocess(self, list_of_dicts):
        """Converts the lists to numpy arrays for type checking"""
        return [self._lists_to_np(d) for d in list_of_dicts]

    def _postprocess(self, list_of_dicts):
        """Converts the numpy arrays to lists for type checking"""
        return [self._np_to_lists(d) for d in list_of_dicts]

    def _np_to_lists(self, d):
        """datasets does not support numpy arrays for type checking"""
        for k, v in d.items():
            if isinstance(v, dict):
                self._np_to_lists(v)
            elif isinstance(v, np.ndarray):
                d[k] = v.tolist()
        return d

    def _lists_to_np(self, d):
        """datasets does not support numpy arrays for type checking"""
        for k, v in d.items():
            if isinstance(v, dict):
                self._lists_to_np(v)
            elif isinstance(v, list):
                d[k] = np.array(v)
        return d

    def generate_confidence_curves(
        self, results, confidence_config={"T": 0, "R": 0, "K": 0, "A": 0, "M": 0}
    ):
        """
        Generate confidence curves based on results and confidence configuration.

        Parameters:
            results (dict): Results of the evaluation for different models.
            confidence_config (dict): Configuration for confidence values.
                Defaults to {"T": 0, "R": 0, "K": 0, "A": 0, "M": 0}.
                T: [1e-10] iou threshold
                R: recall threshold (not used)
                K: class index (class-agnostic mAP, so only 0)
                A: 0=all, 1=small, 2=medium, 3=large, ... (depending on area ranges)
                M: [100] maxDets default in precision_recall_f1_support

        Returns:
            fig (plotly.graph_objects.Figure): The plotly figure showing the confidence curves.
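
        Example (illustrative sketch; assumes `module` was loaded via
        evaluate.load("SEA-AI/det-metrics") and `results = module.compute()`):
            >>> fig = module.generate_confidence_curves(results)
            >>> fig.show()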
""" | |
import plotly.graph_objects as go | |
from seametrics.detection.utils import get_confidence_metric_vals | |
# Create traces | |
fig = go.Figure() | |
metrics = ["precision", "recall", "f1"] | |
for model_name in self.model_names: | |
print(f"##### {model_name} #####") | |
plot_data = get_confidence_metric_vals( | |
cocoeval=results[model_name]["eval"], | |
T=confidence_config["T"], | |
R=confidence_config["R"], | |
K=confidence_config["K"], | |
A=confidence_config["A"], | |
M=confidence_config["M"], | |
) | |
for metric in metrics: | |
fig.add_trace( | |
go.Scatter( | |
x=plot_data["conf"], | |
y=plot_data[metric], | |
mode="lines", | |
name=f"{model_name} {metric}", | |
line=dict(dash=None if metric == "f1" else "dash"), | |
) | |
) | |
fig.update_layout( | |
title="Metric vs Confidence", | |
hovermode="x unified", | |
xaxis_title="Confidence", | |
yaxis_title="Metric value", | |
) | |
return fig | |

    def wandb(
        self,
        results,
        wandb_runs: Optional[list] = None,
        wandb_section: Optional[str] = None,
        wandb_project: str = "detection_metrics",
    ):
        """
        Logs metrics to Weights and Biases (wandb) for tracking and visualization.

        This function logs the provided metrics to Weights and Biases (wandb), a platform for tracking machine learning experiments.
        Each key in the `results` dictionary represents a separate run and the corresponding value contains the metrics for that run.
        If a W&B run list is provided, the results of the runs will be added to the passed W&B runs. Otherwise new W&B runs will be created.
        If a W&B section is provided, the metrics will be logged in this section drop-down. Otherwise no extra W&B section is created
        and the metrics are logged directly.
        The function logs in to wandb using an API key obtained from the secret 'WANDB_API_KEY', initializes a run for
        each key in `results` and logs the metrics.

        Args:
            results (dict): A dictionary where each key is a unique identifier for a run and each value is another dictionary
                containing the metrics to log. Example:
                {
                    "run1": {"metrics": {"accuracy": 0.9, "loss": 0.1}},
                    "run2": {"metrics": {"accuracy": 0.85, "loss": 0.15}}
                }
            wandb_runs (list, optional): A list containing W&B runs where the results should be added
                (e.g. the first item in results will be added to the first run in wandb_runs, etc.)
            wandb_section (str, optional): A string to specify the W&B section (drop-down) under which the metrics are logged.
            wandb_project (str, optional): The name of the wandb project to which the runs will be logged. Defaults to 'detection_metrics'.

        Environment Variables:
            WANDB_API_KEY: The API key for authenticating with wandb.

        Imports:
            os: To retrieve environment variables.
            wandb: To interact with the Weights and Biases platform.
            datetime: To generate a timestamp for run names.
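
        Example (illustrative; assumes `results` has the structure shown above
        and WANDB_API_KEY is set in the environment):
            >>> module.wandb(results, wandb_section="val", wandb_project="detection_metrics")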
""" | |
import os | |
import wandb | |
import datetime | |
current_datetime = datetime.datetime.now() | |
formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S") | |
wandb.login(key=os.getenv('WANDB_API_KEY')) | |
if not wandb_runs is None: | |
assert len(wandb_runs) == len(results), "runs and results must have the same length" | |
for i, k in enumerate(results.keys()): | |
if wandb_runs is None: | |
run = wandb.init(project=wandb_project, name=f"{k}-{formatted_datetime}") | |
else: | |
run = wandb_runs[i] | |
run.log({f"{wandb_section}/{m}" : v for m, v in results[k]['metrics'].items()} if wandb_section is not None else results[k]['metrics']) | |
if wandb_runs is None: | |
run.finish() | |

    def _generate_sample_data(self):
        """
        Generates dummy sample data for predictions and references used for initialization.

        Returns:
            Tuple[List[Dict[str, List[Union[float, int]]]], List[Dict[str, List[Union[float, int]]]]]:
                - predictions (List[Dict[str, List[Union[float, int]]]]): A list of dictionaries representing the predictions. Each dictionary contains the following keys:
                    - boxes (List[List[float]]): A list of bounding boxes in the format [x, y, w, h].
                    - labels (List[int]): A list of labels.
                    - scores (List[float]): A list of scores.
                - references (List[Dict[str, List[Union[float, int]]]]): A list of dictionaries representing the references. Each dictionary contains the following keys:
                    - boxes (List[List[float]]): A list of bounding boxes in the format [x, y, w, h].
                    - labels (List[int]): A list of labels.
                    - area (List[float]): A list of areas.
        """
        predictions = [
            {"boxes": [[1.0, 2.0, 3.0, 4.0]], "labels": [0], "scores": [1.0]}
        ]
        references = [{"boxes": [[1.0, 2.0, 3.0, 4.0]], "labels": [0], "area": [1.0]}]
        return predictions, references
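

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the metric's public interface; the
    # usual entry point is evaluate.load("SEA-AI/det-metrics")). It feeds the
    # dummy sample data through the public add() + compute() path.
    metric = DetectionMetric()
    predictions, references = metric._generate_sample_data()
    metric.add(prediction=predictions, reference=references)
    print(metric.compute())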