Spaces:
Sleeping
Sleeping
# Copyright 2020 The HuggingFace Evaluate Authors. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
""" seqeval metric. """ | |
import importlib | |
from typing import List, Optional, Union | |
import datasets | |
from seqeval_with_fbetal.metrics import accuracy_score, classification_report | |
import evaluate | |
# BibTeX entries for the chunking-evaluation paper and the seqeval library.
_CITATION = """\
@inproceedings{ramshaw-marcus-1995-text,
    title = "Text Chunking using Transformation-Based Learning",
    author = "Ramshaw, Lance and
      Marcus, Mitch",
    booktitle = "Third Workshop on Very Large Corpora",
    year = "1995",
    url = "https://www.aclweb.org/anthology/W95-0107",
}
@misc{seqeval,
  title={{seqeval}: A Python framework for sequence labeling evaluation},
  url={https://github.com/chakki-works/seqeval},
  note={Software available from https://github.com/chakki-works/seqeval},
  author={Hiroki Nakayama},
  year={2018},
}
"""
# Human-readable summary of the metric, shown on the metric card.
_DESCRIPTION = """\
seqeval is a Python framework for sequence labeling evaluation.
seqeval can evaluate the performance of chunking tasks such as named-entity recognition, part-of-speech tagging, semantic role labeling and so on.

This is well-tested by using the Perl script conlleval, which can be used for
measuring the performance of a system that has processed the CoNLL-2000 shared task data.

seqeval supports the following formats:
IOB1
IOB2
IOE1
IOE2
IOBES
BILOU

See the [README.md] file at https://github.com/chakki-works/seqeval for more information.
"""
# Usage documentation for `compute()`: arguments, returned keys and an example.
_KWARGS_DESCRIPTION = """
Produces labelling scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
    references: List of List of reference labels (Ground truth (correct) target values)
    beta: Weight of recall relative to precision in the F-beta score. default: 1.0
    suffix: True if the IOB prefix is after type, False otherwise. default: False
    scheme: Specify target tagging scheme. Should be one of ["IOB1", "IOB2", "IOE1", "IOE2", "IOBES", "BILOU"].
        default: None
    mode: Whether to count correct entity labels with incorrect I/B tags as true positives or not.
        If you want to only count exact matches, pass mode="strict". default: None.
    sample_weight: Array-like of shape (n_samples,), weights for individual samples. default: None
    zero_division: Which value to substitute as a metric value when encountering zero division. Should be one of 0, 1,
        "warn". "warn" acts as 0, but the warning is raised.

Returns:
    dict. Summary of the scores for overall and per type
        Overall:
            'overall_precision': precision,
            'overall_recall': recall,
            'overall_f1': F1 score, also known as balanced F-score or F-measure,
            'overall_f<beta>': F-score with weight beta (for example 'overall_f0.5' when beta=0.5),
            'overall_accuracy': accuracy
        Per type (one entry per entity type, keyed by the type name):
            'precision': precision,
            'recall': recall,
            'f1': F1 score, also known as balanced F-score or F-measure,
            'f<beta>': F-score with weight beta (for example 'f0.5' when beta=0.5),
            'number': number of reference entities of this type (support)

Examples:

    >>> predictions = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
    >>> references = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
    >>> seqeval = evaluate.load("seqeval")
    >>> results = seqeval.compute(predictions=predictions, references=references, beta=0.5)
    >>> print(list(results.keys()))
    ['MISC', 'PER', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_f0.5', 'overall_accuracy']
    >>> print(results["overall_f1"])
    0.5
    >>> print(results["overall_f0.5"])
    0.5
    >>> print(results["PER"]["f1"])
    1.0
"""
class Seqeval(evaluate.Metric):
    """seqeval sequence-labelling metric extended with an F-beta score.

    Wraps ``classification_report`` and ``accuracy_score`` and flattens the
    report into one entry per entity type plus ``overall_*`` entries.
    """

    def _info(self):
        """Return the metric card: description, citation and input features."""
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            homepage="https://github.com/chakki-works/seqeval",
            inputs_description=_KWARGS_DESCRIPTION,
            # Both inputs are sequences of string tags, one sequence per sample.
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("string", id="label"), id="sequence"),
                    "references": datasets.Sequence(datasets.Value("string", id="label"), id="sequence"),
                }
            ),
            codebase_urls=["https://github.com/chakki-works/seqeval"],
            reference_urls=["https://github.com/chakki-works/seqeval"],
        )

    @staticmethod
    def _fbeta(entry, beta):
        """Return the F-beta score for one classification-report entry.

        Args:
            entry: Report dict holding "precision", "recall" and "f1-score".
            beta: Weight of recall relative to precision.

        Returns:
            The report's own F1 when ``beta`` equals 1, otherwise
            ``(1 + beta^2) * P * R / (beta^2 * P + R)``, or 0 when the
            denominator is zero.
        """
        if beta == 1:
            # F-beta with beta == 1 is F1 by definition; reuse the library's
            # value so zero_division handling stays identical.
            return entry["f1-score"]
        beta2 = beta ** 2
        precision = entry["precision"]
        recall = entry["recall"]
        denom = beta2 * precision + recall
        if denom == 0:
            # The numerator is zero as well, so any non-zero divisor yields 0.
            denom = 1
        return (1 + beta2) * precision * recall / denom

    def _compute(
        self,
        predictions,
        references,
        beta: float = 1.0,
        suffix: bool = False,
        scheme: Optional[str] = None,
        mode: Optional[str] = None,
        sample_weight: Optional[List[int]] = None,
        zero_division: Union[str, int] = "warn",
    ):
        """Score predicted tag sequences against reference tag sequences.

        Args:
            predictions: Predicted tag sequences (list of lists of str).
            references: Reference tag sequences (list of lists of str).
            beta: Weight of recall relative to precision in the F-beta score.
            suffix: Forwarded to ``classification_report``.
            scheme: Name of a tagging-scheme class in ``seqeval.scheme``
                (e.g. "IOB2"), or None.
            mode: Forwarded to ``classification_report``.
            sample_weight: Forwarded to ``classification_report``.
            zero_division: Forwarded to ``classification_report``.

        Returns:
            dict with one entry per entity type (keys "precision", "recall",
            "f1", ``f"f{beta}"`` and "number") plus "overall_precision",
            "overall_recall", "overall_f1", ``f"overall_f{beta}"`` and
            "overall_accuracy".

        Raises:
            ValueError: If ``scheme`` does not name an attribute of
                ``seqeval.scheme``.
        """
        if scheme is not None:
            # NOTE(review): the scheme class is resolved from `seqeval.scheme`
            # while scoring uses `seqeval_with_fbetal` - confirm the two
            # packages share scheme classes.
            scheme_module = importlib.import_module("seqeval.scheme")
            try:
                scheme = getattr(scheme_module, scheme)
            except AttributeError as err:
                raise ValueError(
                    f"Scheme should be one of [IOB1, IOB2, IOE1, IOE2, IOBES, BILOU], got {scheme}"
                ) from err

        report = classification_report(
            y_true=references,
            y_pred=predictions,
            suffix=suffix,
            output_dict=True,
            scheme=scheme,
            mode=mode,
            sample_weight=sample_weight,
            zero_division=zero_division,
        )
        # Only the micro average is reported as the overall score.
        report.pop("macro avg")
        report.pop("weighted avg")
        overall_score = report.pop("micro avg")

        # For an integer beta of 1 this key equals "f1"; both hold the same value.
        fbeta_key = f"f{beta}"
        scores = {
            type_name: {
                "precision": score["precision"],
                "recall": score["recall"],
                "f1": score["f1-score"],
                fbeta_key: self._fbeta(score, beta),
                "number": score["support"],
            }
            for type_name, score in report.items()
        }
        scores["overall_precision"] = overall_score["precision"]
        scores["overall_recall"] = overall_score["recall"]
        scores["overall_f1"] = overall_score["f1-score"]
        scores[f"overall_{fbeta_key}"] = self._fbeta(overall_score, beta)
        scores["overall_accuracy"] = accuracy_score(y_true=references, y_pred=predictions)
        return scores