Spaces:

AssistantBench
/

leaderboard

Running

App Files Files Community

leaderboard / evaluation /evaluate_utils /evaluate_strings.py

samuelam

Upload 6 files

3891395 verified 3 months ago

raw

history blame

5.54 kB

	"""
	Evaluation for two strings or two lists of strings.

	Code taken from the DROP benchmark - https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
	"""

	from collections import defaultdict
	from typing import List, Set, Tuple, Union
	import string
	import re
	import numpy as np
	from scipy.optimize import linear_sum_assignment


	# From here through _normalize_answer was originally copied from:
	# https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/
	# Then cleaned up and modified a bit.
	def _remove_articles(text: str) -> str:
	regex = re.compile(r"\b(a\|an\|the)\b", re.UNICODE)
	return re.sub(regex, " ", text)


	def _white_space_fix(text: str) -> str:
	return " ".join(text.split())


	# Punctuation characters that _remove_punc strips from non-numeric text.
	EXCLUDE = {*string.punctuation}


	def _remove_punc(text: str) -> str:
	    """Drop punctuation characters from ``text`` unless it parses as a number.

	    Strings accepted by ``_is_number`` are returned unchanged, so characters
	    such as the decimal point in "3.5" survive.
	    """
	    if _is_number(text):
	        return text
	    kept = [ch for ch in text if ch not in EXCLUDE]
	    return "".join(kept)


	def _lower(text: str) -> str:
	return text.lower()


	def _tokenize(text: str) -> List[str]:
	return re.split(" \|-", text)


	def _normalize_answer(text: str) -> str:
	    """Lower text and remove punctuation, articles and extra whitespace."""
	    cleaned_tokens = []
	    for token in _tokenize(text):
	        # Same pipeline order per token: lowercase, strip punctuation,
	        # canonicalize numbers, drop articles, then tidy whitespace.
	        cleaned = _lower(token)
	        cleaned = _remove_punc(cleaned)
	        cleaned = _normalize_number(cleaned)
	        cleaned = _remove_articles(cleaned)
	        cleaned = _white_space_fix(cleaned)
	        # Tokens that end up empty (or whitespace-only) are discarded.
	        if cleaned.strip():
	            cleaned_tokens.append(cleaned)
	    return " ".join(cleaned_tokens).strip()


	def _is_number(text: str) -> bool:
	try:
	float(text)
	return True
	except ValueError:
	return False


	def _normalize_number(text: str) -> str:
	    """Rewrite numeric strings as ``str(float(text))``; return others unchanged."""
	    return str(float(text)) if _is_number(text) else text


	def _answer_to_bags(
	answer: Union[str, List[str], Tuple[str, ...]]
	) -> Tuple[List[str], List[Set[str]]]:
	if isinstance(answer, (list, tuple)):
	raw_spans = answer
	else:
	raw_spans = [answer]
	normalized_spans: List[str] = []
	token_bags = []
	for raw_span in raw_spans:
	normalized_span = _normalize_answer(raw_span)
	normalized_spans.append(normalized_span)
	token_bags.append(set(normalized_span.split()))
	return normalized_spans, token_bags


	def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]:
	    """
	    Find the optimal 1-1 alignment between gold and predicted token bags and
	    return the F1 score obtained for each aligned gold bag.

	    The result is a NumPy array of length ``max(len(gold), len(predicted))``;
	    positions that receive no assignment stay at 0.
	    """
	    n_gold, n_pred = len(gold), len(predicted)

	    # pairwise[g, p] holds the F1 of gold bag g against predicted bag p, or 0
	    # when the number check rejects the pair.
	    pairwise = np.zeros([n_gold, n_pred])
	    for g, gold_bag in enumerate(gold):
	        for p, pred_bag in enumerate(predicted):
	            if not _match_numbers_if_present(gold_bag, pred_bag):
	                continue
	            pairwise[g, p] = _compute_f1(pred_bag, gold_bag)

	    # linear_sum_assignment minimizes cost, so negate to maximize total F1.
	    gold_rows, pred_cols = linear_sum_assignment(-pairwise)

	    best = np.zeros([max(n_gold, n_pred)])
	    for g, p in zip(gold_rows, pred_cols):
	        best[g] = max(best[g], pairwise[g, p])
	    return best


	def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float:
	intersection = len(gold_bag.intersection(predicted_bag))
	if not predicted_bag:
	precision = 1.0
	else:
	precision = intersection / float(len(predicted_bag))
	if not gold_bag:
	recall = 1.0
	else:
	recall = intersection / float(len(gold_bag))
	f1 = (
	(2 * precision * recall) / (precision + recall)
	if not (precision == 0.0 and recall == 0.0)
	else 0.0
	)
	return f1


	def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]) -> bool:
	gold_numbers = set()
	predicted_numbers = set()
	for word in gold_bag:
	if _is_number(word):
	gold_numbers.add(word)
	for word in predicted_bag:
	if _is_number(word):
	predicted_numbers.add(word)
	if (not gold_numbers) or gold_numbers.intersection(predicted_numbers):
	return True
	return False


	def get_metrics(
	    predicted: Union[str, List[str], Tuple[str, ...]],
	    gold: Union[str, List[str], Tuple[str, ...]],
	) -> Tuple[float, float]:
	    """
	    Score a predicted answer against a gold answer, where each is either a
	    string or a list/tuple of strings.

	    Returns a pair ``(exact_match, f1)``. ``exact_match`` is 1.0 when both
	    answers have the same number of spans and the same set of normalized
	    spans, otherwise 0.0. ``f1`` is the mean of the per-bag scores from
	    ``_align_bags``, rounded to two decimals.
	    """
	    predicted_spans, predicted_token_bags = _answer_to_bags(predicted)
	    gold_spans, gold_token_bags = _answer_to_bags(gold)

	    same_span_set = set(predicted_spans) == set(gold_spans)
	    same_span_count = len(predicted_spans) == len(gold_spans)
	    exact_match = 1.0 if same_span_set and same_span_count else 0.0

	    per_bag_f1 = _align_bags(predicted_token_bags, gold_token_bags)
	    f1 = round(np.mean(per_bag_f1), 2)
	    return exact_match, f1


	def evaluate_strings(prediction, gold):
	    """Return the mean aligned token-bag F1 between ``prediction`` and ``gold``.

	    Args:
	        prediction: A string, or a list/tuple of span strings. Any other
	            value is converted with ``str()`` before scoring.
	        gold: Same accepted forms as ``prediction``.

	    Returns:
	        The unrounded mean of the per-bag F1 scores produced by
	        ``_align_bags``, or 0.0 if scoring raises any exception.
	    """
	    # isinstance (instead of an exact type comparison) keeps tuples and
	    # list/str subclasses intact. _answer_to_bags already treats tuples as a
	    # sequence of spans, whereas str() would flatten one into a single span
	    # such as "('a', 'b')".
	    if not isinstance(prediction, (list, tuple, str)):
	        prediction = str(prediction)
	    if not isinstance(gold, (list, tuple, str)):
	        gold = str(gold)
	    try:
	        _, predicted_token_bags = _answer_to_bags(prediction)
	        _, gold_token_bags = _answer_to_bags(gold)
	        f1_per_bag = _align_bags(predicted_token_bags, gold_token_bags)
	        f1 = np.mean(f1_per_bag)
	    except Exception:
	        # Best-effort scoring: any failure while normalizing or aligning
	        # (e.g. a non-string span) yields a score of zero instead of raising.
	        f1 = 0.0
	    return f1