# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""ROUGE metric implementation.

This is a modified and slightly extended version of
https://github.com/miso-belica/sumy/blob/dev/sumy/evaluation/rouge.py.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import itertools

import numpy as np

# pylint: disable=C0103
def _get_ngrams(n, text): | |
"""Calcualtes n-grams. | |
Args: | |
n: which n-grams to calculate | |
text: An array of tokens | |
Returns: | |
A set of n-grams | |
""" | |
ngram_set = {} | |
text_length = len(text) | |
max_index_ngram_start = text_length - n | |
for i in range(max_index_ngram_start + 1): | |
k = " ".join(text[i : i + n]) | |
if k not in ngram_set: | |
ngram_set[k] = 0 | |
ngram_set[k] += 1 | |
return ngram_set | |
def _get_su(dist, text): | |
"""Calcualtes skip-grams and unigram | |
Args: | |
n: which n-grams to calculate | |
text: An array of tokens | |
Returns: | |
A set of n-grams | |
""" | |
su_set = {} | |
text_length = len(text) | |
for i in range(text_length): | |
k = text[i] | |
if k not in su_set: | |
su_set[k] = 0 | |
su_set[k] += 1 | |
for j in range(i + 1, text_length): | |
if j - i - 1 > dist: | |
break | |
k = text[i] + " " + text[j] | |
if k not in su_set: | |
su_set[k] = 0 | |
su_set[k] += 1 | |
return su_set | |
def _split_into_words(sentences): | |
"""Splits multiple sentences into words and flattens the result""" | |
return list(itertools.chain(*[_.split(" ") for _ in sentences])) | |
def _get_word_ngrams(n, sentences):
    """Calculates word n-grams for multiple sentences."""
    assert sentences
    assert n > 0
    # Flatten the sentences to one token stream, then count its n-grams.
    return _get_ngrams(n, _split_into_words(sentences))
def _get_word_su(dist, sentences):
    """Calculates word skip-dist-grams for multiple sentences."""
    assert sentences
    assert dist > 0
    # Flatten the sentences to one token stream, then count skip-grams.
    return _get_su(dist, _split_into_words(sentences))
def _len_lcs(x, y):
    """
    Returns the length of the Longest Common Subsequence between sequences x
    and y.
    Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
    Args:
        x: sequence of words
        y: sequence of words
    Returns
        integer: Length of LCS between x and y
    """
    # The bottom-right DP cell holds the LCS length of the full sequences.
    return _lcs(x, y)[len(x), len(y)]
def _lcs(x, y): | |
""" | |
Computes the length of the longest common subsequence (lcs) between two | |
strings. The implementation below uses a DP programming algorithm and runs | |
in O(nm) time where n = len(x) and m = len(y). | |
Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence | |
Args: | |
x: collection of words | |
y: collection of words | |
Returns: | |
Table of dictionary of coord and len lcs | |
""" | |
n, m = len(x), len(y) | |
table = dict() | |
for i in range(n + 1): | |
for j in range(m + 1): | |
if i == 0 or j == 0: | |
table[i, j] = 0 | |
elif x[i - 1] == y[j - 1]: | |
table[i, j] = table[i - 1, j - 1] + 1 | |
else: | |
table[i, j] = max(table[i - 1, j], table[i, j - 1]) | |
return table | |
def _recon_lcs(x, y):
    """
    Returns the Longest Subsequence between x and y.
    Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
    Args:
        x: sequence of words
        y: sequence of words
    Returns:
        sequence: LCS of x and y
    """
    table = _lcs(x, y)

    def _backtrack(i, j):
        """Walk the DP table back from (i, j), collecting matched tokens."""
        if i == 0 or j == 0:
            return []
        if x[i - 1] == y[j - 1]:
            return _backtrack(i - 1, j - 1) + [(x[i - 1], i)]
        if table[i - 1, j] > table[i, j - 1]:
            return _backtrack(i - 1, j)
        return _backtrack(i, j - 1)

    # Keep only the tokens, dropping the positions used during backtracking.
    return tuple(token for token, _ in _backtrack(len(x), len(y)))
def rouge_su(evaluated_sentences, reference_sentences, dist=4):
    """
    Computes ROUGE-SU_dist of two text collections of sentences.
    Source: http://research.microsoft.com/en-us/um/people/cyl/download/
    papers/rouge-working-note-v1.3.1.pdf
    Args:
        evaluated_sentences: The sentences that have been picked by the summarizer
        reference_sentences: The sentences from the reference set
        dist: maximum distance between two tokens. Defaults to 4.
    Returns:
        A tuple (f1, precision, recall) for ROUGE-SU_dist
    Raises:
        ValueError: raises exception if a param has len <= 0
    """
    # Delegate to rouge_n in skip-gram (su) mode.
    return rouge_n(evaluated_sentences, reference_sentences, su=True, dist=dist)
def rouge_n(evaluated_sentences, reference_sentences, n=2, dist=4, su=False):
    """
    Computes ROUGE-N of two text collections of sentences.
    Source: http://research.microsoft.com/en-us/um/people/cyl/download/
    papers/rouge-working-note-v1.3.1.pdf
    Args:
        evaluated_sentences: The sentences that have been picked by the summarizer
        reference_sentences: The sentences from the reference set
        n: Size of ngram. Defaults to 2.
        dist: maximum skip distance; only used when su=True. Defaults to 4.
        su: if true, we are computing rouge_su (skip-bigram + unigram)
    Returns:
        A tuple (f1, precision, recall) for ROUGE-N
    Raises:
        ValueError: raises exception if a param has len <= 0
    """
    if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
        raise ValueError("Collections must contain at least 1 sentence.")
    if su:
        evaluated_ngrams = _get_word_su(dist, evaluated_sentences)
        reference_ngrams = _get_word_su(dist, reference_sentences)
    else:
        evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
        reference_ngrams = _get_word_ngrams(n, reference_sentences)
    reference_count = sum(reference_ngrams.values())
    evaluated_count = sum(evaluated_ngrams.values())
    # Clipped overlap: each reference n-gram is credited at most as many
    # times as it appears in the candidate.
    overlapping_count = 0
    for ngram, ref_count in reference_ngrams.items():
        if ngram in evaluated_ngrams:
            overlapping_count += min(ref_count, evaluated_ngrams[ngram])
    # Handle edge case. This isn't mathematically correct, but it's good enough
    precision = overlapping_count / evaluated_count if evaluated_count else 0.0
    recall = overlapping_count / reference_count if reference_count else 0.0
    # Epsilon keeps f1 finite when both precision and recall are zero.
    f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))
    return f1_score, precision, recall
def _f_p_r_lcs(llcs, m, n): | |
""" | |
Computes the LCS-based F-measure score | |
Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ | |
rouge-working-note-v1.3.1.pdf | |
Args: | |
llcs: Length of LCS | |
m: number of words in reference summary | |
n: number of words in candidate summary | |
Returns: | |
Float. LCS-based F-measure score | |
""" | |
r_lcs = llcs / m | |
p_lcs = llcs / n | |
beta = p_lcs / (r_lcs + 1e-12) | |
num = (1 + (beta ** 2)) * r_lcs * p_lcs | |
denom = r_lcs + ((beta ** 2) * p_lcs) | |
f_lcs = num / (denom + 1e-12) | |
return f_lcs, p_lcs, r_lcs | |
def rouge_l_sentence_level(evaluated_sentences, reference_sentences):
    """
    Computes ROUGE-L (sentence level) of two text collections of sentences.
    http://research.microsoft.com/en-us/um/people/cyl/download/papers/
    rouge-working-note-v1.3.1.pdf
    Calculated according to:
    R_lcs = LCS(X,Y)/m
    P_lcs = LCS(X,Y)/n
    F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs)
    where:
    X = reference summary
    Y = Candidate summary
    m = length of reference summary
    n = length of candidate summary
    Args:
        evaluated_sentences: The sentences that have been picked by the summarizer
        reference_sentences: The sentences from the reference set
    Returns:
        A tuple (F_lcs, P_lcs, R_lcs)
    Raises:
        ValueError: raises exception if a param has len <= 0
    """
    if not evaluated_sentences or not reference_sentences:
        raise ValueError("Collections must contain at least 1 sentence.")
    reference_words = _split_into_words(reference_sentences)
    evaluated_words = _split_into_words(evaluated_sentences)
    lcs_length = _len_lcs(evaluated_words, reference_words)
    return _f_p_r_lcs(lcs_length, len(reference_words), len(evaluated_words))
def _union_lcs(evaluated_sentences, reference_sentence):
    """
    Returns LCS_u(r_i, C) which is the LCS score of the union longest common
    subsequence between reference sentence ri and candidate summary C. For example
    if r_i= w1 w2 w3 w4 w5, and C contains two sentences: c1 = w1 w2 w6 w7 w8 and
    c2 = w1 w3 w8 w9 w5, then the longest common subsequence of r_i and c1 is
    "w1 w2" and the longest common subsequence of r_i and c2 is "w1 w3 w5". The
    union longest common subsequence of r_i, c1, and c2 is "w1 w2 w3 w5" and
    LCS_u(r_i, C) = 4/5.
    Args:
        evaluated_sentences: The sentences that have been picked by the summarizer
        reference_sentence: One of the sentences in the reference summaries
    Returns:
        float: LCS_u(r_i, C)
    ValueError:
        Raises exception if a param has len <= 0
    """
    if len(evaluated_sentences) <= 0:
        raise ValueError("Collections must contain at least 1 sentence.")
    lcs_union = set()
    reference_words = _split_into_words([reference_sentence])
    combined_lcs_length = 0
    for eval_s in evaluated_sentences:
        evaluated_words = _split_into_words([eval_s])
        lcs = set(_recon_lcs(reference_words, evaluated_words))
        combined_lcs_length += len(lcs)
        lcs_union = lcs_union.union(lcs)
    # Bug fix: when candidate and reference share no words, every per-sentence
    # LCS is empty and combined_lcs_length is 0; the original raised
    # ZeroDivisionError here. Report zero overlap instead.
    if combined_lcs_length == 0:
        return 0.0
    # NOTE(review): this normalizes by the combined per-sentence LCS lengths,
    # not by len(reference_words) as the 4/5 docstring example suggests —
    # preserved as-is from the original implementation.
    return len(lcs_union) / combined_lcs_length
def rouge_l_summary_level(evaluated_sentences, reference_sentences):
    """
    Computes ROUGE-L (summary level) of two text collections of sentences.
    http://research.microsoft.com/en-us/um/people/cyl/download/papers/
    rouge-working-note-v1.3.1.pdf
    Calculated according to:
    R_lcs = SUM(1, u)[LCS<union>(r_i,C)]/m
    P_lcs = SUM(1, u)[LCS<union>(r_i,C)]/n
    F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs)
    where:
    SUM(i,u) = SUM from i through u
    u = number of sentences in reference summary
    C = Candidate summary made up of v sentences
    m = number of words in reference summary
    n = number of words in candidate summary
    Args:
        evaluated_sentences: The sentences that have been picked by the summarizer
        reference_sentences: The sentences from the reference set
    Returns:
        A tuple (F_lcs, P_lcs, R_lcs)
    Raises:
        ValueError: raises exception if a param has len <= 0
    """
    if not evaluated_sentences or not reference_sentences:
        raise ValueError("Collections must contain at least 1 sentence.")
    # Word totals: m for the reference summary, n for the candidate summary.
    m = len(_split_into_words(reference_sentences))
    n = len(_split_into_words(evaluated_sentences))
    # Sum the union-LCS contribution of every reference sentence.
    union_lcs_sum = sum(
        _union_lcs(evaluated_sentences, ref_s) for ref_s in reference_sentences
    )
    return _f_p_r_lcs(union_lcs_sum, m, n)
def rouge(hypotheses, references):
    """Calculates average rouge scores for a list of hypotheses and
    references"""
    pairs = list(zip(hypotheses, references))
    # Per-pair (f1, precision, recall) tuples for each metric variant.
    scores_1 = [rouge_n([hyp], [ref], 1) for hyp, ref in pairs]
    scores_2 = [rouge_n([hyp], [ref], 2) for hyp, ref in pairs]
    scores_su4 = [rouge_su([hyp], [ref], 4) for hyp, ref in pairs]
    scores_l = [rouge_l_sentence_level([hyp], [ref]) for hyp, ref in pairs]

    def _mean_f1(scores):
        """Average the F1 component (index 0) of (f, p, r) tuples."""
        return np.mean([s[0] for s in scores])

    return {
        "rouge_1_f_score": _mean_f1(scores_1),
        "rouge_2_f_score": _mean_f1(scores_2),
        "rouge_su4_f_score": _mean_f1(scores_su4),
        "rouge_l_f_score": _mean_f1(scores_l),
    }
class OldROUGEEval:
    """Evaluator that HTML-escapes angle brackets in both sides, then scores
    them with the module-level rouge() function."""

    def __init__(self):
        pass

    def make_html_safe(self, s):
        """Return `s` with angle brackets escaped as HTML entities.

        Bug fix: str.replace returns a new string; the original discarded
        both replace() results and returned `s` unchanged. The entity
        replacements ("&lt;"/"&gt;") also appear to have been lost to HTML
        unescaping in the original text.
        """
        return s.replace("<", "&lt;").replace(">", "&gt;")

    def eval(self, predictions, groundtruths):
        """Sanitize both collections and return rouge() average scores."""
        predictions = [self.make_html_safe(w) for w in predictions]
        groundtruths = [self.make_html_safe(w) for w in groundtruths]
        return rouge(predictions, groundtruths)