""" | |
Tests for BLEU translation evaluation metric | |
""" | |
import io | |
import unittest | |
from nltk.data import find | |
from nltk.translate.bleu_score import ( | |
SmoothingFunction, | |
brevity_penalty, | |
closest_ref_length, | |
corpus_bleu, | |
modified_precision, | |
sentence_bleu, | |
) | |


class TestBLEU(unittest.TestCase):
    def test_modified_precision(self):
        """
        Examples from the original BLEU paper:
        https://www.aclweb.org/anthology/P02-1040.pdf
        """
        # Example 1: the "the*" example.
        # Reference sentences.
        ref1 = "the cat is on the mat".split()
        ref2 = "there is a cat on the mat".split()
        # Hypothesis sentence(s).
        hyp1 = "the the the the the the the".split()
        references = [ref1, ref2]
        # Testing modified unigram precision.
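        # "the" occurs 7 times in hyp1 but at most 2 times in any single
        # reference (ref1), so its count is clipped to 2: precision = 2/7.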
        hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
        assert round(hyp1_unigram_precision, 4) == 0.2857
        # With assertAlmostEqual at 4 decimal places.
        self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4)
        # Testing modified bigram precision.
        assert float(modified_precision(references, hyp1, n=2)) == 0.0

        # Example 2: the "of the" example.
        # Reference sentences.
        ref1 = (
            "It is a guide to action that ensures that the military "
            "will forever heed Party commands"
        ).split()
        ref2 = (
            "It is the guiding principle which guarantees the military "
            "forces always being under the command of the Party"
        ).split()
        ref3 = (
            "It is the practical guide for the army always to heed "
            "the directions of the party"
        ).split()
        # Hypothesis sentence(s).
        hyp1 = "of the".split()
        references = [ref1, ref2, ref3]
        # Testing modified unigram precision.
        assert float(modified_precision(references, hyp1, n=1)) == 1.0
        # Testing modified bigram precision.
        assert float(modified_precision(references, hyp1, n=2)) == 1.0
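        # Both unigrams and the lone bigram "of the" occur in ref2, giving
        # 2/2 and 1/1; such short, high-precision hypotheses are exactly what
        # the brevity penalty is designed to punish.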

        # Example 3: Proper MT outputs.
        hyp1 = (
            "It is a guide to action which ensures that the military "
            "always obeys the commands of the party"
        ).split()
        hyp2 = (
            "It is to insure the troops forever hearing the activity "
            "guidebook that party direct"
        ).split()
        references = [ref1, ref2, ref3]
        # Unigram precision.
        hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1))
        hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1))
        # Test unigram precision with assertAlmostEqual at 4 decimal places.
        self.assertAlmostEqual(hyp1_unigram_precision, 0.94444444, places=4)
        self.assertAlmostEqual(hyp2_unigram_precision, 0.57142857, places=4)
        # Test unigram precision with rounding.
        assert round(hyp1_unigram_precision, 4) == 0.9444
        assert round(hyp2_unigram_precision, 4) == 0.5714
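        # These are the paper's modified unigram precisions: 17/18 and 8/14.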

        # Bigram precision.
        hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2))
        hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2))
        # Test bigram precision with assertAlmostEqual at 4 decimal places.
        self.assertAlmostEqual(hyp1_bigram_precision, 0.58823529, places=4)
        self.assertAlmostEqual(hyp2_bigram_precision, 0.07692307, places=4)
        # Test bigram precision with rounding.
        assert round(hyp1_bigram_precision, 4) == 0.5882
        assert round(hyp2_bigram_precision, 4) == 0.0769
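        # Likewise the paper's modified bigram precisions: 10/17 and 1/13.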

    def test_brevity_penalty(self):
        # Test case from the brevity_penalty_closest function in mteval-v13a.pl.
        # Same test cases as in the doctest in nltk.translate.bleu_score.
        references = [["a"] * 11, ["a"] * 8]
        hypothesis = ["a"] * 7
        hyp_len = len(hypothesis)
        closest_ref_len = closest_ref_length(references, hyp_len)
        self.assertAlmostEqual(
            brevity_penalty(closest_ref_len, hyp_len), 0.8669, places=4
        )
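        # BP = exp(1 - r/c) when the hypothesis is shorter than the closest
        # reference: here r = 8 and c = 7, so exp(1 - 8/7) ~= 0.8669.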
        references = [["a"] * 11, ["a"] * 8, ["a"] * 6, ["a"] * 7]
        hypothesis = ["a"] * 7
        hyp_len = len(hypothesis)
        closest_ref_len = closest_ref_length(references, hyp_len)
        assert brevity_penalty(closest_ref_len, hyp_len) == 1.0
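        # A reference of exactly length 7 exists, so r == c and BP == 1.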

    def test_zero_matches(self):
        # Test case where there are 0 matches.
        references = ["The candidate has no alignment to any of the references".split()]
        hypothesis = "John loves Mary".split()
        # Test BLEU up to nth-order n-grams, where n is len(hypothesis).
        for n in range(1, len(hypothesis) + 1):
            weights = (1.0 / n,) * n  # Uniform weights.
            assert sentence_bleu(references, hypothesis, weights) == 0

    def test_full_matches(self):
        # Test case where there are 100% matches.
        references = ["John loves Mary".split()]
        hypothesis = "John loves Mary".split()
        # Test BLEU up to nth-order n-grams, where n is len(hypothesis).
        for n in range(1, len(hypothesis) + 1):
            weights = (1.0 / n,) * n  # Uniform weights.
            assert sentence_bleu(references, hypothesis, weights) == 1.0
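        # Identical sentences: every modified precision is 1 and the brevity
        # penalty is 1, so BLEU is exactly 1.0 at every order.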

    def test_partial_matches_hypothesis_longer_than_reference(self):
        references = ["John loves Mary".split()]
        hypothesis = "John loves Mary who loves Mike".split()
        # Since no 4-gram matches were found, the result should be zero:
        # exp(sum_i w_i * log(p_i)) = 0 because p_4 = 0 and log(0) -> -inf.
        self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
        # Checks that the warning has been raised because len(reference) < 4.
        try:
            self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
        except AttributeError:
            pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.


# @unittest.skip("Skipping fringe cases for BLEU.")
class TestBLEUFringeCases(unittest.TestCase):
    def test_case_where_n_is_bigger_than_hypothesis_length(self):
        # Test BLEU to nth-order n-grams, where n > len(hypothesis).
        references = ["John loves Mary ?".split()]
        hypothesis = "John loves Mary".split()
        n = len(hypothesis) + 1
        weights = (1.0 / n,) * n  # Uniform weights.
        # Since no n-gram matches were found, the result should be zero:
        # exp(sum_i w_i * log(p_i)) = 0 because p_n = 0 and log(0) -> -inf.
        self.assertAlmostEqual(
            sentence_bleu(references, hypothesis, weights), 0.0, places=4
        )
        # Checks that the warning has been raised because len(hypothesis) < 4.
        try:
            self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
        except AttributeError:
            pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.

        # Test case where n > len(hypothesis) and also n > len(reference),
        # including the special case where reference == hypothesis.
        references = ["John loves Mary".split()]
        hypothesis = "John loves Mary".split()
        # Even for an exact match, a 3-token sentence has no 4-grams, so
        # p_4 = 0 and the unsmoothed score is still zero:
        # exp(sum_i w_i * log(p_i)) = 0 because log(0) -> -inf.
        self.assertAlmostEqual(
            sentence_bleu(references, hypothesis, weights), 0.0, places=4
        )

    def test_empty_hypothesis(self):
        # Test case where the hypothesis is empty.
        references = ["The candidate has no alignment to any of the references".split()]
        hypothesis = []
        assert sentence_bleu(references, hypothesis) == 0

    def test_length_one_hypothesis(self):
        # Test case where the hypothesis has length 1, with smoothing method 4.
        references = ["The candidate has no alignment to any of the references".split()]
        hypothesis = ["Foo"]
        method4 = SmoothingFunction().method4
        try:
            sentence_bleu(references, hypothesis, smoothing_function=method4)
        except ValueError:
            pass  # Some versions of method4 raise ValueError on length-1 hypotheses.

    def test_empty_references(self):
        # Test case where the reference is empty.
        references = [[]]
        hypothesis = "John loves Mary".split()
        assert sentence_bleu(references, hypothesis) == 0

    def test_empty_references_and_hypothesis(self):
        # Test case where both the references and the hypothesis are empty.
        references = [[]]
        hypothesis = []
        assert sentence_bleu(references, hypothesis) == 0

    def test_reference_or_hypothesis_shorter_than_fourgrams(self):
        # Test case where the length of reference or hypothesis
        # is shorter than 4.
        references = ["let it go".split()]
        hypothesis = "let go it".split()
        # Checks that the score for this hypothesis/reference pair is 0.0:
        # exp(sum_i w_i * log(p_i)) = 0 because p_4 = 0 and log(0) -> -inf.
        self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4)
        # Checks that the warning has been raised.
        try:
            self.assertWarns(UserWarning, sentence_bleu, references, hypothesis)
        except AttributeError:
            pass  # unittest.TestCase.assertWarns is only supported in Python >= 3.2.


class TestBLEUvsMteval13a(unittest.TestCase):
    def test_corpus_bleu(self):
        ref_file = find("models/wmt15_eval/ref.ru")
        hyp_file = find("models/wmt15_eval/google.ru")
        mteval_output_file = find("models/wmt15_eval/mteval-13a.output")
        # Reads the BLEU scores from the `mteval-13a.output` file.
        # The order of the list corresponds to the order of the n-grams.
        with open(mteval_output_file) as mteval_fin:
            # The numbers are on the second-to-last line of the file.
            # The first and last items on that line are labels, not scores,
            # hence the [1:-1] slice. Materialize the map into a list so it
            # can be iterated over more than once below.
            mteval_bleu_scores = list(
                map(float, mteval_fin.readlines()[-2].split()[1:-1])
            )
        with open(ref_file, encoding="utf8") as ref_fin:
            with open(hyp_file, encoding="utf8") as hyp_fin:
                # Whitespace-tokenize each line.
                # Note: split() with no arguments also strips the newline.
                hypothesis = list(map(lambda x: x.split(), hyp_fin))
                # Note that corpus_bleu expects a list of lists of references.
                references = list(map(lambda x: [x.split()], ref_fin))
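                # Structure: hypothesis[i] is a token list, and references[i]
                # is a list holding one reference token list for sentence i.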
                # Without smoothing.
                for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(
                        references, hypothesis, weights=(1.0 / i,) * i
                    )
                    # Check that the BLEU score difference is less than 0.005.
                    # Note: This is an approximate comparison; as much as
                    # +/- 0.01 BLEU might be "statistically significant",
                    # the actual translation quality might not be.
                    assert abs(mteval_bleu - nltk_bleu) < 0.005
                # With the same smoothing method used in mteval-v13a.pl.
                chencherry = SmoothingFunction()
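                # method3 implements NIST geometric sequence smoothing, the
                # scheme used by mteval-v13a.pl (Chen & Cherry, 2014).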
                for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                    nltk_bleu = corpus_bleu(
                        references,
                        hypothesis,
                        weights=(1.0 / i,) * i,
                        smoothing_function=chencherry.method3,
                    )
                    assert abs(mteval_bleu - nltk_bleu) < 0.005


class TestBLEUWithBadSentence(unittest.TestCase):
    def test_corpus_bleu_with_bad_sentence(self):
        hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R"
        ref = (
            "Their tasks include changing a pump on the faulty stokehold . "
            "Likewise , two species that are very similar in morphology "
            "were distinguished using genetics ."
        )
        references = [[ref.split()]]
        hypotheses = [hyp.split()]
        try:  # Check that a warning is raised: the 2-gram overlap count is 0.
            with self.assertWarns(UserWarning):
                # Verify that the BLEU score collapses to 0.0 when the
                # 2-gram (and higher) overlap counts are 0.
                self.assertAlmostEqual(
                    corpus_bleu(references, hypotheses), 0.0, places=4
                )
        except AttributeError:  # assertWarns is only supported in Python >= 3.2.
            self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.0, places=4)


class TestBLEUWithMultipleWeights(unittest.TestCase):
    def test_corpus_bleu_with_multiple_weights(self):
        hyp1 = (
            "It is a guide to action which ensures that the military "
            "always obeys the commands of the party"
        ).split()
        ref1a = (
            "It is a guide to action that ensures that the military "
            "will forever heed Party commands"
        ).split()
        ref1b = (
            "It is the guiding principle which guarantees the military "
            "forces always being under the command of the Party"
        ).split()
        ref1c = (
            "It is the practical guide for the army always to heed "
            "the directions of the party"
        ).split()
        hyp2 = "he read the book because he was interested in world history".split()
        ref2a = "he was interested in world history because he read the book".split()
        weight_1 = (1, 0, 0, 0)
        weight_2 = (0.25, 0.25, 0.25, 0.25)
        weight_3 = (0, 0, 0, 0, 1)
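        # Passing a list of weight tuples makes corpus_bleu return one score
        # per tuple; weight_3 has five components, so it scores 5-grams only.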
        bleu_scores = corpus_bleu(
            list_of_references=[[ref1a, ref1b, ref1c], [ref2a]],
            hypotheses=[hyp1, hyp2],
            weights=[weight_1, weight_2, weight_3],
        )
        assert bleu_scores[0] == corpus_bleu(
            [[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_1
        )
        assert bleu_scores[1] == corpus_bleu(
            [[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_2
        )
        assert bleu_scores[2] == corpus_bleu(
            [[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_3
        )