Spaces:
Running
Running
# Copyright 2020 The HuggingFace Evaluate Authors. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
""" CoVal metric. """ | |
import coval # From: git+https://github.com/ns-moosavi/coval.git noqa: F401 | |
import datasets | |
from coval.conll import reader, util | |
from coval.eval import evaluator | |
import evaluate | |
# Module-level logger, obtained through the logging helper exposed by the `evaluate` package.
logger = evaluate.logging.get_logger(__name__)
_CITATION = """\ | |
@InProceedings{moosavi2019minimum, | |
author = { Nafise Sadat Moosavi, Leo Born, Massimo Poesio and Michael Strube}, | |
title = {Using Automatically Extracted Minimum Spans to Disentangle Coreference Evaluation from Boundary Detection}, | |
year = {2019}, | |
booktitle = {Proceedings of the 57th Annual Meeting of | |
the Association for Computational Linguistics (Volume 1: Long Papers)}, | |
publisher = {Association for Computational Linguistics}, | |
address = {Florence, Italy}, | |
} | |
@inproceedings{10.3115/1072399.1072405, | |
author = {Vilain, Marc and Burger, John and Aberdeen, John and Connolly, Dennis and Hirschman, Lynette}, | |
title = {A Model-Theoretic Coreference Scoring Scheme}, | |
year = {1995}, | |
isbn = {1558604022}, | |
publisher = {Association for Computational Linguistics}, | |
address = {USA}, | |
url = {https://doi.org/10.3115/1072399.1072405}, | |
doi = {10.3115/1072399.1072405}, | |
booktitle = {Proceedings of the 6th Conference on Message Understanding}, | |
pages = {45–52}, | |
numpages = {8}, | |
location = {Columbia, Maryland}, | |
series = {MUC6 ’95} | |
} | |
@INPROCEEDINGS{Bagga98algorithmsfor, | |
author = {Amit Bagga and Breck Baldwin}, | |
title = {Algorithms for Scoring Coreference Chains}, | |
booktitle = {In The First International Conference on Language Resources and Evaluation Workshop on Linguistics Coreference}, | |
year = {1998}, | |
pages = {563--566} | |
} | |
@INPROCEEDINGS{Luo05oncoreference, | |
author = {Xiaoqiang Luo}, | |
title = {On coreference resolution performance metrics}, | |
booktitle = {In Proc. of HLT/EMNLP}, | |
year = {2005}, | |
pages = {25--32}, | |
publisher = {URL} | |
} | |
@inproceedings{moosavi-strube-2016-coreference, | |
title = "Which Coreference Evaluation Metric Do You Trust? A Proposal for a Link-based Entity Aware Metric", | |
author = "Moosavi, Nafise Sadat and | |
Strube, Michael", | |
booktitle = "Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", | |
month = aug, | |
year = "2016", | |
address = "Berlin, Germany", | |
publisher = "Association for Computational Linguistics", | |
url = "https://www.aclweb.org/anthology/P16-1060", | |
doi = "10.18653/v1/P16-1060", | |
pages = "632--642", | |
} | |
""" | |
_DESCRIPTION = """\ | |
CoVal is a coreference evaluation tool for the CoNLL and ARRAU datasets which
implements the common evaluation metrics including MUC [Vilain et al, 1995],
B-cubed [Bagga and Baldwin, 1998], CEAFe [Luo et al., 2005], | |
LEA [Moosavi and Strube, 2016] and the averaged CoNLL score | |
(the average of the F1 values of MUC, B-cubed and CEAFe) | |
[Denis and Baldridge, 2009a; Pradhan et al., 2011]. | |
This wrapper of CoVal currently only works with the CoNLL line format:
The CoNLL format has one word per line with all the annotations for this word in columns separated by spaces:
Column Type Description | |
1 Document ID This is a variation on the document filename | |
2 Part number Some files are divided into multiple parts numbered as 000, 001, 002, ... etc. | |
3 Word number | |
4 Word itself This is the token as segmented/tokenized in the Treebank. Initially the *_skel file contain the placeholder [WORD] which gets replaced by the actual token from the Treebank which is part of the OntoNotes release. | |
5 Part-of-Speech | |
6 Parse bit This is the bracketed structure broken before the first open parenthesis in the parse, and the word/part-of-speech leaf replaced with a *. The full parse can be created by substituting the asterisk with the "([pos] [word])" string (or leaf) and concatenating the items in the rows of that column.
7 Predicate lemma The predicate lemma is mentioned for the rows for which we have semantic role information. All other rows are marked with a "-" | |
8 Predicate Frameset ID This is the PropBank frameset ID of the predicate in Column 7. | |
9 Word sense This is the word sense of the word in Column 3. | |
10 Speaker/Author This is the speaker or author name where available. Mostly in Broadcast Conversation and Web Log data. | |
11 Named Entities This column identifies the spans representing various named entities.
12:N Predicate Arguments There is one column each of predicate argument structure information for the predicate mentioned in Column 7. | |
N Coreference Coreference chain information encoded in a parenthesis structure. | |
More information on the format can be found here (section "*_conll File Format"): http://www.conll.cemantix.org/2012/data.html
Details on the evaluation on CoNLL can be found here: https://github.com/ns-moosavi/coval/blob/master/conll/README.md | |
CoVal code was written by @ns-moosavi. | |
Some parts are borrowed from https://github.com/clarkkev/deep-coref/blob/master/evaluation.py | |
The test suite is taken from https://github.com/conll/reference-coreference-scorers/ | |
Mention evaluation and the test suite are added by @andreasvc. | |
Parsing CoNLL files is developed by Leo Born. | |
""" | |
_KWARGS_DESCRIPTION = """ | |
Calculates coreference evaluation metrics. | |
Args: | |
predictions: list of sentences. Each sentence is a list of word predictions to score in the CoNLL format.
Each prediction is a word with its annotations as a string made of columns joined with spaces.
Only columns 4, 5, 6 and the last column are used (word, POS, Parse and coreference annotation)
See the details on the format in the description of the metric.
references: list of sentences. Each sentence is a list of word references to score in the CoNLL format.
Each reference is a word with its annotations as a string made of columns joined with spaces.
Only columns 4, 5, 6 and the last column are used (word, POS, Parse and coreference annotation)
See the details on the format in the description of the metric.
keep_singletons: After extracting all mentions of key or system files, | |
mentions whose corresponding coreference chain is of size one, | |
are considered as singletons. The default evaluation mode will include | |
singletons in evaluations if they are included in the key or the system files. | |
By setting 'keep_singletons=False', all singletons in the key and system files | |
will be excluded from the evaluation. | |
NP_only: Most of the recent coreference resolvers only resolve NP mentions and | |
leave out the resolution of VPs. By setting the 'NP_only' option, the scorer will only evaluate the resolution of NPs. | |
min_span: By setting 'min_span', the scorer reports the results based on automatically detected minimum spans. | |
Minimum spans are determined using the MINA algorithm. | |
Returns: | |
'mentions': mentions | |
'muc': MUC metric [Vilain et al, 1995] | |
'bcub': B-cubed [Bagga and Baldwin, 1998] | |
'ceafe': CEAFe [Luo et al., 2005] | |
'lea': LEA [Moosavi and Strube, 2016] | |
'conll_score': averaged CoNLL score (the average of the F1 values of MUC, B-cubed and CEAFe) | |
Examples: | |
>>> coval = evaluate.load('coval') | |
>>> words = ['bc/cctv/00/cctv_0005 0 0 Thank VBP (TOP(S(VP* thank 01 1 Xu_li * (V*) * -', | |
... 'bc/cctv/00/cctv_0005 0 1 you PRP (NP*) - - - Xu_li * (ARG1*) (ARG0*) (116)', | |
... 'bc/cctv/00/cctv_0005 0 2 everyone NN (NP*) - - - Xu_li * (ARGM-DIS*) * (116)', | |
... 'bc/cctv/00/cctv_0005 0 3 for IN (PP* - - - Xu_li * (ARG2* * -', | |
... 'bc/cctv/00/cctv_0005 0 4 watching VBG (S(VP*)))) watch 01 1 Xu_li * *) (V*) -', | |
... 'bc/cctv/00/cctv_0005 0 5 . . *)) - - - Xu_li * * * -'] | |
>>> references = [words] | |
>>> predictions = [words] | |
>>> results = coval.compute(predictions=predictions, references=references) | |
>>> print(results) # doctest:+ELLIPSIS | |
{'mentions/recall': 1.0,[...] 'conll_score': 100.0} | |
""" | |
def get_coref_infos(
    key_lines, sys_lines, NP_only=False, remove_nested=False, keep_singletons=True, min_span=False, doc="dummy_doc"
):
    """Build the coreference structures the CoVal evaluator consumes for one document.

    Args:
        key_lines: reference (key) annotation lines, passed to ``reader.get_doc_mentions``.
        sys_lines: predicted (system) annotation lines, passed to ``reader.get_doc_mentions``.
        NP_only: forwarded to ``reader.set_annotated_parse_trees``; when truthy the parse
            trees are attached to the clusters.
        remove_nested: when truthy, ``reader.remove_nested_coref_mentions`` is applied to both
            the key and the system clusters and the resulting counts are logged.
        keep_singletons: forwarded to the reader helpers; when falsy the number of removed
            singletons is logged.
        min_span: forwarded to ``reader.set_annotated_parse_trees``; when truthy the parse
            trees are attached to the clusters.
        doc: identifier under which the single document is stored in the result.

    Returns:
        A dict with one entry, ``doc``, mapping to the tuple
        ``(key_clusters, sys_clusters, key_mention_sys_cluster, sys_mention_key_cluster)``.
    """
    needs_parse_trees = NP_only or min_span

    # Key (reference) side.
    key_clusters, key_singletons_num = reader.get_doc_mentions(doc, key_lines, keep_singletons)
    if needs_parse_trees:
        key_clusters = reader.set_annotated_parse_trees(key_clusters, key_lines, NP_only, min_span)

    # System (prediction) side.
    sys_clusters, sys_singletons_num = reader.get_doc_mentions(doc, sys_lines, keep_singletons)
    if needs_parse_trees:
        # NOTE(review): the system clusters are annotated with the *key* lines, as in the
        # original code - presumably because only the key carries parse columns; confirm.
        sys_clusters = reader.set_annotated_parse_trees(sys_clusters, key_lines, NP_only, min_span)

    key_nested_coref_num = sys_nested_coref_num = 0
    key_removed_nested_clusters = sys_removed_nested_clusters = 0
    if remove_nested:
        key_nested_coref_num, key_removed_nested_clusters = reader.remove_nested_coref_mentions(
            key_clusters, keep_singletons
        )
        sys_nested_coref_num, sys_removed_nested_clusters = reader.remove_nested_coref_mentions(
            sys_clusters, keep_singletons
        )

    # Cross assignments: each side's mentions mapped onto the other side's clusters.
    sys_mention_key_cluster = reader.get_mention_assignments(sys_clusters, key_clusters)
    key_mention_sys_cluster = reader.get_mention_assignments(key_clusters, sys_clusters)

    doc_coref_infos = {
        doc: (key_clusters, sys_clusters, key_mention_sys_cluster, sys_mention_key_cluster),
    }

    if remove_nested:
        logger.info(
            "Number of removed nested coreferring mentions in the key "
            f"annotation: {key_nested_coref_num}; and system annotation: {sys_nested_coref_num}"
        )
        logger.info(
            "Number of resulting singleton clusters in the key "
            f"annotation: {key_removed_nested_clusters}; and system annotation: {sys_removed_nested_clusters}"
        )

    if not keep_singletons:
        logger.info(
            f"{key_singletons_num:d} and {sys_singletons_num:d} singletons are removed from the key and system "
            "files, respectively"
        )

    return doc_coref_infos
def compute_score(key_lines, sys_lines, metrics, NP_only, remove_nested, keep_singletons, min_span):
    """Evaluate every requested coreference metric and collect the scores in a flat dict.

    Args:
        key_lines: reference (key) annotation lines.
        sys_lines: predicted (system) annotation lines.
        metrics: iterable of ``(name, metric)`` pairs; each ``metric`` is handed to
            ``evaluator.evaluate_documents``.
        NP_only, remove_nested, keep_singletons, min_span: forwarded unchanged to
            ``get_coref_infos``.

    Returns:
        A dict with ``"<name>/recall"``, ``"<name>/precision"`` and ``"<name>/f1"`` for each
        metric. When all three of ``muc``, ``bcub`` and ``ceafe`` were evaluated it also
        holds ``"conll_score"``: the mean of their F1 values multiplied by 100.
    """
    doc_coref_infos = get_coref_infos(key_lines, sys_lines, NP_only, remove_nested, keep_singletons, min_span)

    output_scores = {}
    conll = 0
    conll_subparts_num = 0

    for name, metric in metrics:
        recall, precision, f1 = evaluator.evaluate_documents(doc_coref_infos, metric, beta=1)
        if name in ("muc", "bcub", "ceafe"):
            conll += f1
            conll_subparts_num += 1
        output_scores.update({f"{name}/recall": recall, f"{name}/precision": precision, f"{name}/f1": f1})

        # A logger call takes ONE message plus %-format arguments. The previous call passed
        # the pre-formatted pieces as extra positional arguments (print-style), which makes
        # `logging` fail with "not all arguments converted" when the record is emitted.
        logger.info(
            "%s Recall: %.2f  Precision: %.2f  F1: %.2f",
            name.ljust(10),
            recall * 100,
            precision * 100,
            f1 * 100,
        )

    # The CoNLL score is only defined when all three of its components were evaluated.
    if conll_subparts_num == 3:
        conll = (conll / 3) * 100
        logger.info(f"CoNLL score: {conll:.2f}")
        output_scores.update({"conll_score": conll})

    return output_scores
def check_gold_parse_annotation(key_lines):
    """Tell whether the key lines carry a parse annotation in their sixth column.

    Lines starting with ``#`` and lines with six or fewer whitespace-separated columns are
    skipped. The first remaining line decides the answer on its own: the result is ``True``
    when its sixth column is anything other than ``"-"``, ``False`` otherwise.

    Args:
        key_lines: iterable of annotation lines (strings).

    Returns:
        ``True`` if the deciding line has a parse column different from ``"-"``; ``False``
        if it is ``"-"`` or if no line qualifies.
    """
    for raw_line in key_lines:
        if raw_line.startswith("#"):
            continue
        columns = raw_line.split()
        if len(columns) > 6:
            # Only the first qualifying line is inspected.
            return columns[5] != "-"
    return False
class Coval(evaluate.EvaluationModule):
    """Evaluation module that scores coreference predictions with the CoVal metrics."""

    def _info(self):
        """Describe the module: documentation strings, input features and reference URLs."""
        input_features = datasets.Features(
            {
                "predictions": datasets.Sequence(datasets.Value("string")),
                "references": datasets.Sequence(datasets.Value("string")),
            }
        )
        return evaluate.EvaluationModuleInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=input_features,
            codebase_urls=["https://github.com/ns-moosavi/coval"],
            reference_urls=[
                "https://github.com/ns-moosavi/coval",
                "https://www.aclweb.org/anthology/P16-1060",
                "http://www.conll.cemantix.org/2012/data.html",
            ],
        )

    def _compute(
        self, predictions, references, keep_singletons=True, NP_only=False, min_span=False, remove_nested=False
    ):
        """Score ``predictions`` against ``references`` with every CoVal metric.

        Args:
            predictions: system annotation, passed to ``compute_score`` as ``sys_lines``.
            references: key annotation, passed to ``compute_score`` as ``key_lines``.
            keep_singletons, NP_only, min_span, remove_nested: forwarded to ``compute_score``.

        Returns:
            The score dict produced by ``compute_score``.

        Raises:
            NotImplementedError: if ``min_span`` is set and the gold-parse check on the
                references reports no gold parse annotation.
        """
        allmetrics = [
            ("mentions", evaluator.mentions),
            ("muc", evaluator.muc),
            ("bcub", evaluator.b_cubed),
            ("ceafe", evaluator.ceafe),
            ("lea", evaluator.lea),
        ]

        # NOTE(review): this calls coval's `util.check_gold_parse_annotation`, while this file
        # also defines a line-based `check_gold_parse_annotation` that is never used. Confirm
        # which one is meant to receive the in-memory `references`.
        if min_span and not util.check_gold_parse_annotation(references):
            raise NotImplementedError("References should have gold parse annotation to use 'min_span'.")

        return compute_score(
            key_lines=references,
            sys_lines=predictions,
            metrics=allmetrics,
            NP_only=NP_only,
            remove_nested=remove_nested,
            keep_singletons=keep_singletons,
            min_span=min_span,
        )