# Natural Language Toolkit: ChrF score
#
# Copyright (C) 2001-2023 NLTK Project
# Authors: Maja Popovic
# Contributors: Liling Tan, Aleš Tamchyna (Memsource)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
""" ChrF score implementation """
import re
from collections import Counter, defaultdict
from nltk.util import ngrams


def sentence_chrf(
reference, hypothesis, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
):
"""
    Calculates the sentence level CHRF (Character n-gram F-score) described in
    - Maja Popovic. 2015. CHRF: Character n-gram F-score for Automatic MT Evaluation.
      In Proceedings of the 10th Workshop on Machine Translation.
      https://www.statmt.org/wmt15/pdf/WMT49.pdf
    - Maja Popovic. 2016. CHRF Deconstructed: β Parameters and n-gram Weights.
      In Proceedings of the 1st Conference on Machine Translation.
      https://www.statmt.org/wmt16/pdf/W16-2341.pdf

    This implementation of CHRF only supports a single reference at the moment.

    For details not reported in the paper, consult Maja Popovic's original
    implementation: https://github.com/m-popovic/chrF

    The code should output results equivalent to running CHRF++ with the
    following options: -nw 0 -b 3

    An example from the original BLEU paper
    https://www.aclweb.org/anthology/P02-1040.pdf

    >>> ref1 = str('It is a guide to action that ensures that the military '
... 'will forever heed Party commands').split()
>>> hyp1 = str('It is a guide to action which ensures that the military '
... 'always obeys the commands of the party').split()
>>> hyp2 = str('It is to insure the troops forever hearing the activity '
... 'guidebook that party direct').split()
>>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
0.6349...
>>> sentence_chrf(ref1, hyp2) # doctest: +ELLIPSIS
    0.3330...

    The infamous "the the the ... " example

    >>> ref = 'the cat is on the mat'.split()
>>> hyp = 'the the the the the the the'.split()
>>> sentence_chrf(ref, hyp) # doctest: +ELLIPSIS
    0.1468...

    An example to show that this function also accepts plain strings (str)
    as input, instead of lists of tokens (list(str)).

    >>> ref1 = str('It is a guide to action that ensures that the military '
... 'will forever heed Party commands')
>>> hyp1 = str('It is a guide to action which ensures that the military '
... 'always obeys the commands of the party')
>>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
0.6349...
>>> type(ref1) == type(hyp1) == str
True
>>> sentence_chrf(ref1.split(), hyp1.split()) # doctest: +ELLIPSIS
    0.6349...

    To skip the unigrams and only use 2- to 3-grams:

    >>> sentence_chrf(ref1, hyp1, min_len=2, max_len=3) # doctest: +ELLIPSIS
    0.6617...

    :param reference: reference sentence
    :type reference: list(str) / str
:param hypothesis: a hypothesis sentence
:type hypothesis: list(str) / str
:param min_len: The minimum order of n-gram this function should extract.
:type min_len: int
:param max_len: The maximum order of n-gram this function should extract.
:type max_len: int
:param beta: the parameter to assign more importance to recall over precision
:type beta: float
:param ignore_whitespace: ignore whitespace characters in scoring
:type ignore_whitespace: bool
:return: the sentence level CHRF score.
:rtype: float
"""
return corpus_chrf(
[reference],
[hypothesis],
min_len,
max_len,
beta=beta,
ignore_whitespace=ignore_whitespace,
    )


def _preprocess(sent, ignore_whitespace):
    if not isinstance(sent, str):
# turn list of tokens into a string
sent = " ".join(sent)
if ignore_whitespace:
sent = re.sub(r"\s+", "", sent)
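        # e.g. "a cat" -> "acat": all whitespace is removed, so character
        # n-grams never contain spaces and may span word boundaries.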
    return sent


def chrf_precision_recall_fscore_support(
reference, hypothesis, n, beta=3.0, epsilon=1e-16
):
"""
    This function computes the precision, recall and fscore from the n-gram
    overlaps. It also returns the `support`, which is the true positive count.

    Because the input type is underspecified, the function is agnostic as to
    how the n-grams are formed and simply takes whichever elements the input
    sequence contains; they can be either tokens or characters.

    :param reference: The reference sentence.
:type reference: list
:param hypothesis: The hypothesis sentence.
:type hypothesis: list
:param n: Extract up to the n-th order ngrams
:type n: int
:param beta: The parameter to assign more importance to recall over precision.
:type beta: float
:param epsilon: The fallback value if the hypothesis or reference is empty.
:type epsilon: float
    :return: Returns the precision, recall, f-score and support (true positives).
:rtype: tuple(float)
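
    With ``epsilon`` as the fallback value when a division is undefined, the
    returned scores follow::

        prec = tp / (tp + fp)
        rec = tp / (tp + fn)
        fscore = (1 + beta**2) * prec * rec / (beta**2 * prec + rec)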
"""
ref_ngrams = Counter(ngrams(reference, n))
hyp_ngrams = Counter(ngrams(hypothesis, n))
# calculate the number of ngram matches
overlap_ngrams = ref_ngrams & hyp_ngrams
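    # ``&`` on two Counters keeps the minimum count for each key, so matches
    # are clipped to the number of times each n-gram appears in the reference.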
tp = sum(overlap_ngrams.values()) # True positives.
tpfp = sum(hyp_ngrams.values()) # True positives + False positives.
tpfn = sum(ref_ngrams.values()) # True positives + False negatives.
try:
prec = tp / tpfp # precision
rec = tp / tpfn # recall
factor = beta**2
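        # F_beta is the weighted harmonic mean of precision and recall;
        # beta > 1 (default 3.0) weights recall more heavily than precision.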
fscore = (1 + factor) * (prec * rec) / (factor * prec + rec)
except ZeroDivisionError:
prec = rec = fscore = epsilon
    return prec, rec, fscore, tp


def corpus_chrf(
references, hypotheses, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
):
"""
    Calculates the corpus level CHRF (Character n-gram F-score). It is the
    macro-averaged value of the sentence/segment level CHRF scores.

    This implementation of CHRF only supports a single reference at the moment.

    >>> ref1 = str('It is a guide to action that ensures that the military '
... 'will forever heed Party commands').split()
>>> ref2 = str('It is the guiding principle which guarantees the military '
... 'forces always being under the command of the Party').split()
>>>
>>> hyp1 = str('It is a guide to action which ensures that the military '
... 'always obeys the commands of the party').split()
>>> hyp2 = str('It is to insure the troops forever hearing the activity '
... 'guidebook that party direct')
>>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS
    0.3910...

    :param references: a corpus of reference sentences, aligned with the hypotheses
:type references: list(list(str))
:param hypotheses: a list of hypothesis sentences
:type hypotheses: list(list(str))
:param min_len: The minimum order of n-gram this function should extract.
:type min_len: int
:param max_len: The maximum order of n-gram this function should extract.
:type max_len: int
:param beta: the parameter to assign more importance to recall over precision
:type beta: float
:param ignore_whitespace: ignore whitespace characters in scoring
:type ignore_whitespace: bool
    :return: the corpus level CHRF score.
:rtype: float
"""
assert len(references) == len(
hypotheses
), "The number of hypotheses and their references should be the same"
num_sents = len(hypotheses)
# Keep f-scores for each n-gram order separate
    ngram_fscores = defaultdict(list)
# Iterate through each hypothesis and their corresponding references.
for reference, hypothesis in zip(references, hypotheses):
# preprocess both reference and hypothesis
reference = _preprocess(reference, ignore_whitespace)
hypothesis = _preprocess(hypothesis, ignore_whitespace)
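        # After preprocessing, both sides are plain strings, so ``ngrams()``
        # below iterates over characters and yields character n-grams.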
# Calculate f-scores for each sentence and for each n-gram order
# separately.
for n in range(min_len, max_len + 1):
# Compute the precision, recall, fscore and support.
prec, rec, fscore, tp = chrf_precision_recall_fscore_support(
reference, hypothesis, n, beta=beta
)
ngram_fscores[n].append(fscore)
# how many n-gram sizes
num_ngram_sizes = len(ngram_fscores)
# sum of f-scores over all sentences for each n-gram order
    total_scores = [sum(fscores) for fscores in ngram_fscores.values()]
# macro-average over n-gram orders and over all sentences
return (sum(total_scores) / num_ngram_sizes) / num_sents
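

# A minimal usage sketch: calling the two public entry points above on a
# single reference/hypothesis pair.  The docstring doctests remain the
# authoritative expected outputs.
if __name__ == "__main__":
    _reference = (
        "It is a guide to action that ensures that the military "
        "will forever heed Party commands"
    )
    _hypothesis = (
        "It is a guide to action which ensures that the military "
        "always obeys the commands of the party"
    )
    print("sentence chrF:", sentence_chrf(_reference, _hypothesis))
    print("corpus chrF:", corpus_chrf([_reference], [_hypothesis]))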