Spaces:
Sleeping
Sleeping
File size: 6,931 Bytes
94ed10f 113b180 94ed10f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" seqeval metric. """
import importlib
from typing import List, Optional, Union
import datasets
from seqeval.metrics import accuracy_score, classification_report
import evaluate
_CITATION = """\
@inproceedings{ramshaw-marcus-1995-text,
title = "Text Chunking using Transformation-Based Learning",
author = "Ramshaw, Lance and
Marcus, Mitch",
booktitle = "Third Workshop on Very Large Corpora",
year = "1995",
url = "https://www.aclweb.org/anthology/W95-0107",
}
@misc{seqeval,
title={{seqeval}: A Python framework for sequence labeling evaluation},
url={https://github.com/chakki-works/seqeval},
note={Software available from https://github.com/chakki-works/seqeval},
author={Hiroki Nakayama},
year={2018},
}
"""
_DESCRIPTION = """\
seqeval is a Python framework for sequence labeling evaluation.
seqeval can evaluate the performance of chunking tasks such as named-entity recognition, part-of-speech tagging, semantic role labeling and so on.
This is well-tested by using the Perl script conlleval, which can be used for
measuring the performance of a system that has processed the CoNLL-2000 shared task data.
seqeval supports following formats:
IOB1
IOB2
IOE1
IOE2
IOBES
See the [README.md] file at https://github.com/chakki-works/seqeval for more information.
"""
_KWARGS_DESCRIPTION = """
Produces labelling scores along with its sufficient statistics
from a source against one or more references.
Args:
predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
references: List of List of reference labels (Ground truth (correct) target values)
beta: Weight for the F-score
suffix: True if the IOB prefix is after type, False otherwise. default: False
scheme: Specify target tagging scheme. Should be one of ["IOB1", "IOB2", "IOE1", "IOE2", "IOBES", "BILOU"].
default: None
mode: Whether to count correct entity labels with incorrect I/B tags as true positives or not.
If you want to only count exact matches, pass mode="strict". default: None.
sample_weight: Array-like of shape (n_samples,), weights for individual samples. default: None
zero_division: Which value to substitute as a metric value when encountering zero division. Should be on of 0, 1,
"warn". "warn" acts as 0, but the warning is raised.
Returns:
'scores': dict. Summary of the scores for overall and per type
Overall:
'accuracy': accuracy,
'precision': precision,
'recall': recall,
'f1': F1 score, also known as balanced F-score or F-measure,
'fbeta': F-score with weight beta
Per type:
'precision': precision,
'recall': recall,
'f1': F1 score, also known as balanced F-score or F-measure,
'fbeta': F-score with weight beta
Examples:
>>> predictions = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> references = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
>>> seqeval = evaluate.load("seqeval")
>>> results = seqeval.compute(predictions=predictions, references=references, beta=1.0)
>>> print(list(results.keys()))
['MISC', 'PER', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy']
>>> print(results["overall_f1"])
0.5
>>> print(results["PER"]["f1"])
1.0
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Seqeval(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
homepage="https://github.com/chakki-works/seqeval",
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Sequence(datasets.Value("string", id="label"), id="sequence"),
"references": datasets.Sequence(datasets.Value("string", id="label"), id="sequence"),
}
),
codebase_urls=["https://github.com/chakki-works/seqeval"],
reference_urls=["https://github.com/chakki-works/seqeval"],
)
def _compute(
self,
predictions,
references,
beta: float = 1.0,
suffix: bool = False,
scheme: Optional[str] = None,
mode: Optional[str] = None,
sample_weight: Optional[List[int]] = None,
zero_division: Union[str, int] = "warn",
):
if scheme is not None:
try:
scheme_module = importlib.import_module("seqeval.scheme")
scheme = getattr(scheme_module, scheme)
except AttributeError:
raise ValueError(f"Scheme should be one of [IOB1, IOB2, IOE1, IOE2, IOBES, BILOU], got {scheme}")
report = classification_report(
y_true=references,
y_pred=predictions,
suffix=suffix,
output_dict=True,
scheme=scheme,
mode=mode,
sample_weight=sample_weight,
zero_division=zero_division,
)
report.pop("macro avg")
report.pop("weighted avg")
if beta != 1.0:
beta2 = beta ** 2
for k, v in report.items():
denom = beta2 * v["precision"] + v["recall"]
if denom == 0:
denom += 1
v[f"f{beta}-score"] = (1 + beta2) * v["precision"] * v["recall"] / denom
overall_score = report.pop("micro avg")
scores = {
type_name: {
"precision": score["precision"],
"recall": score["recall"],
"f1": score["f1-score"],
f"f{beta}": score[f"f{beta}-score"],
"number": score["support"],
}
for type_name, score in report.items()
}
scores["overall_precision"] = overall_score["precision"]
scores["overall_recall"] = overall_score["recall"]
scores["overall_f1"] = overall_score["f1-score"]
scores[f"overall_f{beta}"] = overall_score[f"f{beta}-score"]
scores["overall_accuracy"] = accuracy_score(y_true=references, y_pred=predictions)
return scores
|