Spaces:

hkunlp
/

Binder

Runtime error

Timothyxxx

Add missed MMQA

7de3018 over 2 years ago

14.6 kB

	import re
	import string
	import numpy as np
	from collections import Counter
	from typing import List, Set, Tuple, Union
	from scipy.optimize import linear_sum_assignment
	from word2number.w2n import word_to_num
	import json

	# Copied from https://github.com/allenai/multimodalqa/blob/master/baselines/evaluate.py


	# Every question type known to this evaluation script.
	ALL_QUESTION_TYPES = [
	    "TextQ",
	    "TableQ",
	    "ImageQ",
	    "ImageListQ",
	    "Compose(TableQ,ImageListQ)",
	    "Compose(TextQ,ImageListQ)",
	    "Compose(ImageQ,TableQ)",
	    "Compose(ImageQ,TextQ)",
	    "Compose(TextQ,TableQ)",
	    "Compose(TableQ,TextQ)",
	    "Intersect(TableQ,TextQ)",
	    "Intersect(ImageListQ,TableQ)",
	    "Intersect(ImageListQ,TextQ)",
	    "Compare(Compose(TableQ,ImageQ),TableQ)",
	    "Compare(Compose(TableQ,ImageQ),Compose(TableQ,TextQ))",
	    "Compare(TableQ,Compose(TableQ,TextQ))",
	]

	# --- Text modality groupings ---
	TEXT_SINGLE_HOP_QUESTION_TYPES = [
	    "TextQ",
	]
	TEXT_AS_FIRST_HOP_QUESTION_TYPES = [
	    "Compare(TableQ,Compose(TableQ,TextQ))",
	    "Compose(ImageQ,TextQ)",
	    "Compose(TableQ,TextQ)",
	    "Intersect(TableQ,TextQ)",
	    "Intersect(ImageListQ,TextQ)",
	]
	TEXT_AS_SECOND_HOP_QUESTION_TYPES = [
	    "Compare(Compose(TableQ,ImageQ),Compose(TableQ,TextQ))",
	    "Compose(TextQ,ImageListQ)",
	    "Compose(TextQ,TableQ)",
	]

	# --- Table modality groupings ---
	TABLE_SINGLE_HOP_QUESTION_TYPES = [
	    "TableQ",
	]
	TABLE_AS_FIRST_HOP_QUESTION_TYPES = [
	    "Compose(ImageQ,TableQ)",
	    "Compose(TextQ,TableQ)",
	]
	TABLE_AS_SECOND_HOP_QUESTION_TYPES = [
	    "Compare(Compose(TableQ,ImageQ),TableQ)",
	    "Compare(TableQ,Compose(TableQ,TextQ))",
	    "Compose(TableQ,ImageListQ)",
	    "Compose(TableQ,TextQ)",
	    "Intersect(ImageListQ,TableQ)",
	    "Intersect(TableQ,TextQ)",
	]

	# --- Image modality groupings ---
	IMAGE_SINGLE_HOP_QUESTION_TYPES = [
	    "ImageQ",
	    "ImageListQ",
	]
	IMAGE_AS_FIRST_HOP_QUESTION_TYPES = [
	    "Compare(Compose(TableQ,ImageQ),Compose(TableQ,TextQ))",
	    "Compare(Compose(TableQ,ImageQ),TableQ)",
	    "Compose(TableQ,ImageListQ)",
	    "Compose(TextQ,ImageListQ)",
	    "Intersect(ImageListQ,TableQ)",
	]
	IMAGE_AS_SECOND_HOP_QUESTION_TYPES = [
	    "Compose(ImageQ,TableQ)",
	    "Compose(ImageQ,TextQ)",
	    "Intersect(ImageListQ,TextQ)",
	]


	# Import-time sanity checks on the groupings above.
	# The single-hop and second-hop lists together must cover exactly the known types.
	assert set().union(
	    TEXT_SINGLE_HOP_QUESTION_TYPES,
	    TEXT_AS_SECOND_HOP_QUESTION_TYPES,
	    TABLE_SINGLE_HOP_QUESTION_TYPES,
	    TABLE_AS_SECOND_HOP_QUESTION_TYPES,
	    IMAGE_SINGLE_HOP_QUESTION_TYPES,
	    IMAGE_AS_SECOND_HOP_QUESTION_TYPES,
	) == set(ALL_QUESTION_TYPES)
	# Within one modality, a type is never both single-hop and second-hop.
	assert set(TEXT_SINGLE_HOP_QUESTION_TYPES).isdisjoint(TEXT_AS_SECOND_HOP_QUESTION_TYPES)
	assert set(TABLE_SINGLE_HOP_QUESTION_TYPES).isdisjoint(TABLE_AS_SECOND_HOP_QUESTION_TYPES)
	assert set(IMAGE_SINGLE_HOP_QUESTION_TYPES).isdisjoint(IMAGE_AS_SECOND_HOP_QUESTION_TYPES)

	SINGLE_HOP_QUESTION_TYPES = (
	    TEXT_SINGLE_HOP_QUESTION_TYPES
	    + TABLE_SINGLE_HOP_QUESTION_TYPES
	    + IMAGE_SINGLE_HOP_QUESTION_TYPES
	)
	MULTI_HOP_QUESTION_TYPES = (
	    TEXT_AS_SECOND_HOP_QUESTION_TYPES
	    + TABLE_AS_SECOND_HOP_QUESTION_TYPES
	    + IMAGE_AS_SECOND_HOP_QUESTION_TYPES
	)
	# A multi-hop type must be listed under exactly one second-hop modality.
	assert len(set(MULTI_HOP_QUESTION_TYPES)) == len(MULTI_HOP_QUESTION_TYPES)
	# The first-hop groupings, taken together, name the same types as the multi-hop list.
	assert set().union(
	    TEXT_AS_FIRST_HOP_QUESTION_TYPES,
	    TABLE_AS_FIRST_HOP_QUESTION_TYPES,
	    IMAGE_AS_FIRST_HOP_QUESTION_TYPES,
	) == set(MULTI_HOP_QUESTION_TYPES)
	# Single-hop plus multi-hop types make up the full list.
	assert set(SINGLE_HOP_QUESTION_TYPES + MULTI_HOP_QUESTION_TYPES) == set(ALL_QUESTION_TYPES)


	def process_question_for_implicit_decomp(question, question_type, hop=0, bridge_entity='', sep_token='[SEP]'):
	    """Build the separator-delimited input string for one decomposition hop.

	    The result has four fields joined by ``" <sep_token> "``, in this order:
	    the question type, ``HOP=<hop>``, the bridge entity, and the question.
	    A list or set of bridge entities is first joined with ``"; "``.
	    """
	    if isinstance(bridge_entity, (list, set)):
	        bridge_entity = "; ".join(bridge_entity)
	    fields = (
	        f"{question_type}",
	        f"HOP={hop}",
	        f"{bridge_entity}",
	        f"{question}",
	    )
	    return f" {sep_token} ".join(fields)


	def extract_numbers_from_str(s):
	    """Return the numeric values of the whitespace-separated tokens in ``s``.

	    Each token is tried as an integer first (with commas removed, so
	    ``"1,000"`` becomes ``1000``) and then as a float. Tokens that parse as
	    neither are skipped. Zero is a valid number and is kept.

	    Args:
	        s: The string to scan.

	    Returns:
	        A list of ``int``/``float`` values in the order they appear.
	    """
	    numbers = []
	    for token in s.split():
	        try:
	            num = int(token.replace(",", ""))
	        except ValueError:
	            try:
	                num = float(token)
	            except ValueError:
	                # Not a number at all; ignore this token.
	                continue
	        # Append unconditionally: a truthiness test here would drop 0 and 0.0.
	        numbers.append(num)
	    return numbers


	def read_jsonl(filename):
	    """Load a JSON Lines file into a list of decoded objects.

	    The file is read as UTF-8, one JSON document per line. Lines that are
	    empty or contain only whitespace (for example a trailing newline at the
	    end of the file) are skipped instead of raising a decode error.

	    Args:
	        filename: Path of the ``.jsonl`` file to read.

	    Returns:
	        A list with one decoded object per non-blank line, in file order.
	    """
	    with open(filename, 'r', encoding="utf-8") as f:
	        # Iterate lazily over the file instead of materialising every line first.
	        return [json.loads(line) for line in f if line.strip()]

	# From here through _match_numbers_if_present was originally copied from the evaluation code of DROP dataset:
	# https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py

	def _remove_articles(text: str) -> str:
	regex = re.compile(r"\b(a\|an\|the)\b", re.UNICODE)
	return re.sub(regex, " ", text)


	def _white_space_fix(text: str) -> str:
	return " ".join(text.split())


	# Characters treated as punctuation when normalising answers.
	EXCLUDE = {*string.punctuation}


	def _remove_punc(text: str) -> str:
	    """Drop punctuation characters from ``text`` unless it is a number.

	    Text that ``_is_number`` accepts is returned unchanged, so decimal
	    points and signs survive. Otherwise every character found in
	    ``EXCLUDE`` is removed.
	    """
	    if _is_number(text):
	        return text
	    kept_chars = [ch for ch in text if ch not in EXCLUDE]
	    return "".join(kept_chars)


	def _lower(text: str) -> str:
	return text.lower()


	def _tokenize(text: str) -> List[str]:
	return re.split(" \|-", text)


	def _normalize_answer(text: str) -> str:
	    """Normalise an answer string for comparison.

	    Each token from ``_tokenize`` is lower-cased, stripped of punctuation,
	    number-normalised, has articles removed and its whitespace collapsed,
	    in that order. Tokens that end up blank are dropped and the rest are
	    joined with single spaces.
	    """
	    cleaned_tokens = []
	    for token in _tokenize(text):
	        cleaned = _lower(token)
	        cleaned = _remove_punc(cleaned)
	        cleaned = _normalize_number(cleaned)
	        cleaned = _remove_articles(cleaned)
	        cleaned = _white_space_fix(cleaned)
	        if cleaned.strip():
	            cleaned_tokens.append(cleaned)
	    return " ".join(cleaned_tokens).strip()


	def _is_number(text: str) -> bool:
	try:
	float(text)
	return True
	except ValueError:
	return False


	def _is_word_number(text: str) -> bool:
	    """Tell whether ``word_to_num`` can convert ``text`` without a ``ValueError``."""
	    try:
	        word_to_num(text)
	    except ValueError:
	        return False
	    return True


	def _normalize_number(text: str) -> str:
	    """Rewrite numeric text as the string form of its float value.

	    Text accepted by ``_is_number`` becomes ``str(float(text))`` (so
	    ``"3"`` turns into ``"3.0"``). Text accepted by ``_is_word_number`` is
	    converted through ``word_to_num`` first. Anything else is returned
	    unchanged.
	    """
	    if _is_number(text):
	        return str(float(text))
	    # TODO: this is not included in the original drop evaluation script, we need to have our own in the end anyways.
	    if _is_word_number(text):
	        return str(float(word_to_num(text)))
	    return text


	def _answer_to_bags(
	    answer: Union[str, List[str], Tuple[str, ...]]
	) -> Tuple[List[str], List[Set[str]]]:
	    """Normalise an answer and break each span into a set of tokens.

	    A single string counts as one span; a list or tuple is taken as several
	    spans.

	    Returns:
	        A pair ``(normalized_spans, token_bags)``: the normalised text of
	        each span, and for each span the set of its whitespace-separated
	        tokens, both in input order.
	    """
	    spans = answer if isinstance(answer, (list, tuple)) else [answer]
	    normalized_spans: List[str] = [_normalize_answer(span) for span in spans]
	    token_bags = [set(span.split()) for span in normalized_spans]
	    return normalized_spans, token_bags


	def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]:
	    """Score the best one-to-one pairing of gold and predicted token bags.

	    A gold/predicted pair is scored with ``_compute_f1`` only if
	    ``_match_numbers_if_present`` accepts it; otherwise its score stays 0.
	    ``linear_sum_assignment`` then picks the pairing with the highest total
	    score.

	    Returns:
	        A NumPy array of length ``max(len(gold), len(predicted))``. Slot
	        ``i`` holds the score of the pair chosen for gold bag ``i``; slots
	        with no chosen pair stay 0.
	    """
	    pair_scores = np.zeros([len(gold), len(predicted)])
	    for g_idx, gold_bag in enumerate(gold):
	        for p_idx, pred_bag in enumerate(predicted):
	            if not _match_numbers_if_present(gold_bag, pred_bag):
	                continue
	            pair_scores[g_idx, p_idx] = _compute_f1(pred_bag, gold_bag)

	    # The solver minimises cost, so negate the scores to maximise total F1.
	    gold_rows, pred_cols = linear_sum_assignment(-pair_scores)

	    best = np.zeros([max(len(gold), len(predicted))])
	    for g_idx, p_idx in zip(gold_rows, pred_cols):
	        best[g_idx] = max(best[g_idx], pair_scores[g_idx, p_idx])
	    return best


	def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float:
	intersection = len(gold_bag.intersection(predicted_bag))
	if not predicted_bag:
	precision = 1.0
	else:
	precision = intersection / float(len(predicted_bag))
	if not gold_bag:
	recall = 1.0
	else:
	recall = intersection / float(len(gold_bag))
	f1 = (
	(2 * precision * recall) / (precision + recall)
	if not (precision == 0.0 and recall == 0.0)
	else 0.0
	)
	return f1


	def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]) -> bool:
	    """Check that the prediction shares a number with the gold answer, if any.

	    Returns ``True`` when the gold bag has no numeric token, or when at
	    least one numeric token appears in both bags. Returns ``False``
	    otherwise.
	    """
	    gold_numbers = {token for token in gold_bag if _is_number(token)}
	    predicted_numbers = {token for token in predicted_bag if _is_number(token)}
	    return bool(not gold_numbers or gold_numbers & predicted_numbers)



	def acc(predicted, gold):
	    """Exact-match score: 1.0 if the normalised spans agree, else 0.0.

	    The two answers agree when their normalised spans form the same set
	    and both answers contain the same number of spans.
	    """
	    predicted_spans, _ = _answer_to_bags(predicted)
	    gold_spans, _ = _answer_to_bags(gold)
	    same_spans = set(predicted_spans) == set(gold_spans)
	    same_count = len(predicted_spans) == len(gold_spans)
	    return 1.0 if same_spans and same_count else 0.0


	def f1(predicted, gold):
	    """Mean aligned token-level F1 of ``predicted`` against ``gold``.

	    Both answers are turned into token bags, aligned with ``_align_bags``,
	    and the per-bag scores are averaged and rounded to two decimals.
	    """
	    _, predicted_token_bags = _answer_to_bags(predicted)
	    _, gold_token_bags = _answer_to_bags(gold)
	    per_bag_scores = _align_bags(predicted_token_bags, gold_token_bags)
	    return round(np.mean(per_bag_scores), 2)


	def metric_max_over_ground_truths(metric_fn, prediction, gold_answers):
	    """Return the best score of ``prediction`` over all reference answers.

	    ``metric_fn(prediction, gold_answer)`` is evaluated for every entry of
	    ``gold_answers`` and the largest value is returned. An empty
	    ``gold_answers`` makes ``max`` raise ``ValueError``.
	    """
	    return max([metric_fn(prediction, reference) for reference in gold_answers])


	def evaluate_predictions(predictions, gold_answers, example_types=None):
	    """Score predictions against references with the ``acc`` and ``f1`` metrics.

	    To support multiple gold annotations, each value of ``gold_answers``
	    should be a list whose items (each a string or a list) are the valid
	    reference answers; the best score over them is kept per metric.
	    A question with no entry in ``predictions`` gets 0.0 for every metric
	    and a message is printed.

	    Args:
	        predictions: Mapping from question id to the predicted answer.
	        gold_answers: Mapping from question id to its list of references.
	        example_types: Optional mapping from question id to a group label.

	    Returns:
	        ``(eval_scores, instance_eval_results)``: the mean of each metric,
	        and the per-question metric dicts. When ``example_types`` is given
	        a third element is added, mapping each label to the mean metrics
	        of its questions.
	    """
	    eval_funcs = {
	        "acc": acc,
	        "f1": f1,
	    }

	    def _mean_scores(results):
	        # Average each metric over a {qas_id: {metric: score}} mapping.
	        return {
	            metric: np.mean([scores[metric] for scores in results.values()])
	            for metric in eval_funcs
	        }

	    instance_eval_results = {}
	    instance_eval_results_by_types = {}
	    for qas_id, ref_answers in gold_answers.items():
	        if qas_id in predictions:
	            pred_answer = predictions[qas_id]
	            scores = {}
	            for metric, func in eval_funcs.items():
	                scores[metric] = metric_max_over_ground_truths(func, pred_answer, ref_answers)
	        else:
	            print(f"Missing prediction for question {qas_id}, and all scores for this question are set to zero")
	            scores = dict.fromkeys(eval_funcs, 0.0)
	        instance_eval_results[qas_id] = scores
	        if example_types is not None:
	            group = instance_eval_results_by_types.setdefault(example_types[qas_id], {})
	            group[qas_id] = scores

	    eval_scores = _mean_scores(instance_eval_results)
	    if example_types is None:
	        return eval_scores, instance_eval_results

	    eval_scores_by_types = {
	        example_type: _mean_scores(type_results)
	        for example_type, type_results in instance_eval_results_by_types.items()
	    }
	    return eval_scores, instance_eval_results, eval_scores_by_types


	def evaluate_prediction_file(prediction_path, gold_path):
	    """Evaluate a prediction file against a gold JSONL file and print a report.

	    Prints overall scores, then breakdowns by answer modality, by hop type
	    (single- vs multi-hop) and by question type.

	    Args:
	        prediction_path: Path to a UTF-8 JSON file that is passed to
	            ``evaluate_predictions`` as the predictions mapping.
	        gold_path: Path to a JSONL file. Each example must provide
	            ``"qid"``, ``"answers"`` (items with ``"answer"`` and
	            ``"modality"``) and ``"metadata"["type"]``.

	    Returns:
	        The overall ``{metric: mean score}`` dict.

	    Raises:
	        AssertionError: If one example's answers span several modalities.
	    """
	    # Use a context manager so the file handle is closed deterministically.
	    with open(prediction_path, encoding="utf-8") as prediction_file:
	        predicted_answers = json.load(prediction_file)
	    examples = read_jsonl(gold_path)
	    gold_answers, answer_modalities, hop_types, question_types = {}, {}, {}, {}
	    for example in examples:
	        qid = example["qid"]
	        # Currently we only have one ground truth answer.
	        # Even if there are multiple entries in example["answers"], the whole list should be regarded as one ref answer.
	        # However, our script supports evaluation with multiple ref answers.
	        # So, we will use an outer bracket here to pretend we have a list of ref answers.
	        gold_answer = [str(item["answer"]) for item in example["answers"]]
	        gold_answers[qid] = [gold_answer]
	        answer_modality = {item["modality"] for item in example["answers"]}
	        assert len(answer_modality) == 1
	        answer_modalities[qid] = answer_modality.pop()
	        question_types[qid] = example["metadata"]["type"]
	        hop_types[qid] = "Multi-hop" if example["metadata"]["type"] in MULTI_HOP_QUESTION_TYPES else "Single-hop"

	    eval_scores, instance_eval_results = evaluate_predictions(predicted_answers, gold_answers)
	    print("\n\nOverall result with different metrics: ")
	    for metric, value in eval_scores.items():
	        print(f"{metric}: {value}")

	    # Breakdown by answer modality.
	    modality_counts = Counter(answer_modalities.values())
	    _, _, eval_scores_by_modalities = \
	        evaluate_predictions(predicted_answers, gold_answers, answer_modalities)
	    print("\n\nEval results for different modalities:")
	    for answer_modality in sorted(eval_scores_by_modalities.keys()):
	        result = eval_scores_by_modalities[answer_modality]
	        print(f"{answer_modality}")
	        print(f"# of examples: {modality_counts[answer_modality]}")
	        for metric, value in result.items():
	            print(f"{metric}: {value}")

	    # Breakdown by single-hop vs multi-hop.
	    hop_type_counts = Counter(hop_types.values())
	    _, _, eval_scores_by_hop_types = evaluate_predictions(predicted_answers, gold_answers, hop_types)
	    print("\n\nType\tCount\tEM\tF1")
	    for hop_type in sorted(eval_scores_by_hop_types.keys()):
	        result = eval_scores_by_hop_types[hop_type]
	        print(f"{hop_type}\t{hop_type_counts[hop_type]}\t{result['acc']}\t{result['f1']}")

	    # Breakdown by question type.
	    question_type_counts = Counter(question_types.values())
	    _, _, eval_scores_by_qtypes = evaluate_predictions(predicted_answers, gold_answers, question_types)
	    print("\n\nType\tCount\tEM\tF1")
	    for question_type in sorted(eval_scores_by_qtypes.keys()):
	        result = eval_scores_by_qtypes[question_type]
	        print(f"{question_type}\t{question_type_counts[question_type]}\t{result['acc']}\t{result['f1']}")
	    return eval_scores


	class EvaluateTool(object):
	    """Adapter that scores ``|``-separated answer strings with ``evaluate_predictions``."""

	    def __init__(self, args):
	        """Store ``args``; nothing in this class reads it afterwards."""
	        self.args = args

	    def evaluate(self, preds, golds, section):
	        """Score predictions against gold items and return the mean metrics.

	        Args:
	            preds: Iterable of predicted answer strings. Several answers in
	                one string are separated by ``|``.
	            golds: Iterable of dicts paired positionally with ``preds``.
	                Each provides ``"id"`` and a ``|``-separated
	                ``"answer_text"``.
	            section: Unused; kept so the call signature stays the same.

	        Returns:
	            A dict mapping each metric name to its mean score.
	        """
	        gold_answers, predicted_answers = {}, {}
	        for pred, gold in zip(preds, golds):
	            qid = gold["id"]
	            # Split on the plain pipe character. An escaped "\|" would look for
	            # a literal backslash followed by a pipe and never separate answers.
	            gold_answer = [item.strip() for item in gold["answer_text"].split("|")]
	            # Wrap in a list: the answers of one question form one reference.
	            gold_answers[qid] = [gold_answer]
	            predicted_answers[qid] = [item.strip() for item in pred.split("|")]

	        eval_scores, _ = evaluate_predictions(predicted_answers, gold_answers)

	        summary = {}
	        for metric, value in eval_scores.items():
	            summary[metric] = value
	        return summary