import importlib
from typing import List, Optional, Union

import datasets
import evaluate

from .FairEvalUtils import *

from seqeval.metrics.v1 import check_consistent_length
from seqeval.scheme import Entities, Token, auto_detect

_CITATION = """\
@inproceedings{ortmann2022,
    title = {Fine-Grained Error Analysis and Fair Evaluation of Labeled Spans},
    author = {Katrin Ortmann},
    url = {https://aclanthology.org/2022.lrec-1.150},
    year = {2022},
    date = {2022-06-21},
    booktitle = {Proceedings of the Language Resources and Evaluation Conference (LREC)},
    pages = {1400-1407},
    publisher = {European Language Resources Association},
    address = {Marseille, France},
    pubstate = {published},
    type = {inproceedings}
}
"""

_DESCRIPTION = """\
New evaluation method that reflects true annotation quality more accurately by ensuring that every error is counted
only once, which avoids the penalty that traditional evaluation imposes on close-to-target annotations.
In addition to the traditional categories of true positives (TP), false positives (FP), and false negatives (FN),
the new method takes into account the more fine-grained error types suggested by Manning: labeling errors (LE),
boundary errors (BE), and labeling-boundary errors (LBE). Additionally, the system distinguishes different types
of boundary errors: BES (the system's annotation is smaller than the target span), BEL (the system's annotation is
larger than the target span) and BEO (the system span overlaps with the target span).
"""

_KWARGS_DESCRIPTION = """
Counts the number of redefined traditional errors (FP, FN), newly defined errors (BE, LE, LBE) and fine-grained
boundary errors (BES, BEL, BEO). Then computes the fair Precision, Recall and F1-Score.
For the computation of the metrics from the error counts please refer to: https://aclanthology.org/2022.lrec-1.150.pdf

Args:
    predictions: list of predicted sentences. Each predicted sentence
        should be a list of IOB-formatted labels corresponding to each sentence token.
        Predicted sentences must have the same number of tokens as the references'.
    references: list of reference sentences, one for each prediction. Each reference sentence
        should be a list of IOB-formatted labels corresponding to each sentence token.
    suffix: True if the IOB tag is appended as a suffix (e.g. 'PER-B') rather than used as a prefix (e.g. 'B-PER').
        Defaults to False.
    scheme: tagging scheme to use, one of [IOB1, IOB2, IOE1, IOE2, IOBES, BILOU]. Auto-detected when omitted.
    mode: 'fair' (default) for the fair evaluation, 'traditional' for the classical TP/FP/FN evaluation.
    error_format: 'count' (default) to report absolute error counts, 'proportion' to report them as fractions
        of the total number of errors.

Returns:
    A dictionary with one entry per entity type, each holding that type's precision, recall, f1 and error counts,
    plus the following overall keys:
        overall_precision: fair (or traditional) Precision
        overall_recall: fair (or traditional) Recall
        overall_f1: fair (or traditional) F1-Score
        TP: count of True Positives
        FP: count of False Positives
        FN: count of False Negatives
        LE: count of Labeling Errors (fair mode only)
        BE: count of Boundary Errors (fair mode only)
        LBE: count of Labeling-and-Boundary Errors (fair mode only)
    The fine-grained boundary error subtypes BES (the prediction is smaller than the reference), BEL (the
    prediction is larger than the reference) and BEO (the prediction overlaps with the reference) are described
    in the paper but are not part of the returned dictionary.

Examples:
    >>> faireval = evaluate.load("illorca/fairevaluation")
    >>> pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']]
    >>> ref = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']]
    >>> results = faireval.compute(predictions=pred, references=ref)
    >>> print(results['overall_precision'], results['overall_recall'], results['overall_f1'])
    0.6666666666666666 0.6666666666666666 0.6666666666666666
    >>> print(results['TP'], results['FP'], results['FN'], results['LE'], results['BE'], results['LBE'])
    1 0 0 0 1 0
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class FairEvaluation(evaluate.Metric):
    """Counts the number of redefined traditional errors (FP, FN), newly defined errors (BE, LE, LBE) and
    fine-grained boundary errors (BES, BEL, BEO). Then computes the fair Precision, Recall and F1-Score."""

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                "predictions": datasets.Sequence(datasets.Value("string", id="label"), id="sequence"),
                "references": datasets.Sequence(datasets.Value("string", id="label"), id="sequence"),
            }),
            homepage="https://huggingface.co/spaces/illorca/fairevaluation",
            codebase_urls=["https://github.com/rubcompling/FairEval#acknowledgement"],
            reference_urls=["https://aclanthology.org/2022.lrec-1.150.pdf"],
        )

    def _compute(
            self,
            predictions,
            references,
            suffix: bool = False,
            scheme: Optional[str] = None,
            mode: Optional[str] = 'fair',
            error_format: Optional[str] = 'count',
            sample_weight: Optional[List[int]] = None,
            zero_division: Union[str, int] = "warn",
    ):
        """Returns the error counts and fair scores."""
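
        # Resolve a scheme given by name (e.g. 'IOB2') to the corresponding seqeval scheme class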
        if scheme is not None:
            try:
                scheme_module = importlib.import_module("seqeval.scheme")
                scheme = getattr(scheme_module, scheme)
            except AttributeError:
                raise ValueError(f"Scheme should be one of [IOB1, IOB2, IOE1, IOE2, IOBES, BILOU], got {scheme}")

        y_true = references
        y_pred = predictions
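
        # Make sure predictions and references have consistent lengths (same number of sentences and tokens)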
        check_consistent_length(y_true, y_pred)
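
        # Fall back to automatic tagging-scheme detection when no valid scheme class was given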
        if scheme is None or not issubclass(scheme, Token):
            scheme = auto_detect(y_true, suffix)
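
        # Extract the labeled entity spans from the tag sequences with seqeval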
        true_spans = Entities(y_true, scheme, suffix).entities
        pred_spans = Entities(y_pred, scheme, suffix).entities
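
        # Convert the seqeval entities into the span format used by the FairEval comparison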
        true_spans = seq_to_fair(true_spans)
        pred_spans = seq_to_fair(pred_spans)
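
        # Initialise the running error counts with an empty comparison, then add up the counts of every sentence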
        total_errors = compare_spans([], [])

        for i in range(len(true_spans)):
            sentence_errors = compare_spans(true_spans[i], pred_spans[i])
            total_errors = add_dict(total_errors, sentence_errors)
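
        # Compute per-label and overall scores from the accumulated counts; the 'conf' entry is not returned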
        results = calculate_results(total_errors)
        del results['conf']
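
        # Prepare the output dictionary and the error totals used when error_format='proportion'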
        output = {}
        total_trad_errors = results['overall']['traditional']['FP'] + results['overall']['traditional']['FN']
        total_fair_errors = results['overall']['fair']['FP'] + results['overall']['fair']['FN'] + \
                            results['overall']['fair']['LE'] + results['overall']['fair']['BE'] + \
                            results['overall']['fair']['LBE']

        assert mode in ['traditional', 'fair'], 'mode must be \'traditional\' or \'fair\''
        assert error_format in ['count', 'proportion'], 'error_format must be \'count\' or \'proportion\''
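
        # Per-label scores and error counts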
        if mode == 'traditional':
            for k, v in results['per_label'][mode].items():
                if error_format == 'count':
                    output[k] = {'precision': v['Prec'], 'recall': v['Rec'], 'f1': v['F1'], 'TP': v['TP'],
                                 'FP': v['FP'], 'FN': v['FN']}
                elif error_format == 'proportion':
                    output[k] = {'precision': v['Prec'], 'recall': v['Rec'], 'f1': v['F1'], 'TP': v['TP'],
                                 'FP': v['FP'] / total_trad_errors, 'FN': v['FN'] / total_trad_errors}
        elif mode == 'fair':
            for k, v in results['per_label'][mode].items():
                if error_format == 'count':
                    output[k] = {'precision': v['Prec'], 'recall': v['Rec'], 'f1': v['F1'], 'TP': v['TP'],
                                 'FP': v['FP'], 'FN': v['FN'], 'LE': v['LE'], 'BE': v['BE'], 'LBE': v['LBE']}
                elif error_format == 'proportion':
                    output[k] = {'precision': v['Prec'], 'recall': v['Rec'], 'f1': v['F1'], 'TP': v['TP'],
                                 'FP': v['FP'] / total_fair_errors, 'FN': v['FN'] / total_fair_errors,
                                 'LE': v['LE'] / total_fair_errors, 'BE': v['BE'] / total_fair_errors,
                                 'LBE': v['LBE'] / total_fair_errors}
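
        # Overall scores and error counts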
        output['overall_precision'] = results['overall'][mode]['Prec']
        output['overall_recall'] = results['overall'][mode]['Rec']
        output['overall_f1'] = results['overall'][mode]['F1']

        if mode == 'traditional':
            output['TP'] = results['overall'][mode]['TP']
            output['FP'] = results['overall'][mode]['FP']
            output['FN'] = results['overall'][mode]['FN']
            if error_format == 'proportion':
                output['FP'] = output['FP'] / total_trad_errors
                output['FN'] = output['FN'] / total_trad_errors
        elif mode == 'fair':
            output['TP'] = results['overall'][mode]['TP']
            output['FP'] = results['overall'][mode]['FP']
            output['FN'] = results['overall'][mode]['FN']
            output['LE'] = results['overall'][mode]['LE']
            output['BE'] = results['overall'][mode]['BE']
            output['LBE'] = results['overall'][mode]['LBE']
            if error_format == 'proportion':
                output['FP'] = output['FP'] / total_fair_errors
                output['FN'] = output['FN'] / total_fair_errors
                output['LE'] = output['LE'] / total_fair_errors
                output['BE'] = output['BE'] / total_fair_errors
                output['LBE'] = output['LBE'] / total_fair_errors

        return output


def seq_to_fair(seq_sentences):
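    """Convert per-sentence seqeval Entities into FairEval spans of the form
    [tag, start, end_inclusive, {covered token indices}]."""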
    out = []
    for seq_sentence in seq_sentences:
        sentence = []
        for entity in seq_sentence:
            # a seqeval Entity prints as "(sent_id, tag, start, end)"; strip the punctuation and split the fields
            span = str(entity).replace('(', '').replace(')', '').replace(' ', '').split(',')
            # drop the sentence id and keep [tag, start, end]
            span = span[1:]
            # make the end index inclusive (seqeval's end index is exclusive) and cast the boundaries to int
            span[-1] = int(span[-1]) - 1
            span[1] = int(span[1])
            # attach the set of covered token indices used for the span comparison
            span.append({i for i in range(span[1], span[2] + 1)})
            sentence.append(span)
        out.append(sentence)
    return out