# Binder / utils / mmqa / evaluator.py
# Timothyxxx
# Add missed MMQA
# 7de3018
import re
import string
import numpy as np
from collections import Counter
from typing import List, Set, Tuple, Union
from scipy.optimize import linear_sum_assignment
from word2number.w2n import word_to_num
import json
# Copied from https://github.com/allenai/multimodalqa/blob/master/baselines/evaluate.py
# Every question type label handled by this evaluator.
ALL_QUESTION_TYPES = [
    "TextQ",
    "TableQ",
    "ImageQ",
    "ImageListQ",
    "Compose(TableQ,ImageListQ)",
    "Compose(TextQ,ImageListQ)",
    "Compose(ImageQ,TableQ)",
    "Compose(ImageQ,TextQ)",
    "Compose(TextQ,TableQ)",
    "Compose(TableQ,TextQ)",
    "Intersect(TableQ,TextQ)",
    "Intersect(ImageListQ,TableQ)",
    "Intersect(ImageListQ,TextQ)",
    "Compare(Compose(TableQ,ImageQ),TableQ)",
    "Compare(Compose(TableQ,ImageQ),Compose(TableQ,TextQ))",
    "Compare(TableQ,Compose(TableQ,TextQ))",
]

# --- Text modality -----------------------------------------------------------
TEXT_SINGLE_HOP_QUESTION_TYPES = [
    "TextQ",
]
TEXT_AS_FIRST_HOP_QUESTION_TYPES = [
    "Compare(TableQ,Compose(TableQ,TextQ))",
    "Compose(ImageQ,TextQ)",
    "Compose(TableQ,TextQ)",
    "Intersect(TableQ,TextQ)",
    "Intersect(ImageListQ,TextQ)",
]
TEXT_AS_SECOND_HOP_QUESTION_TYPES = [
    "Compare(Compose(TableQ,ImageQ),Compose(TableQ,TextQ))",
    "Compose(TextQ,ImageListQ)",
    "Compose(TextQ,TableQ)",
]

# --- Table modality ----------------------------------------------------------
TABLE_SINGLE_HOP_QUESTION_TYPES = [
    "TableQ",
]
TABLE_AS_FIRST_HOP_QUESTION_TYPES = [
    "Compose(ImageQ,TableQ)",
    "Compose(TextQ,TableQ)",
]
TABLE_AS_SECOND_HOP_QUESTION_TYPES = [
    "Compare(Compose(TableQ,ImageQ),TableQ)",
    "Compare(TableQ,Compose(TableQ,TextQ))",
    "Compose(TableQ,ImageListQ)",
    "Compose(TableQ,TextQ)",
    "Intersect(ImageListQ,TableQ)",
    "Intersect(TableQ,TextQ)",
]

# --- Image modality ----------------------------------------------------------
IMAGE_SINGLE_HOP_QUESTION_TYPES = [
    "ImageQ",
    "ImageListQ",
]
IMAGE_AS_FIRST_HOP_QUESTION_TYPES = [
    "Compare(Compose(TableQ,ImageQ),Compose(TableQ,TextQ))",
    "Compare(Compose(TableQ,ImageQ),TableQ)",
    "Compose(TableQ,ImageListQ)",
    "Compose(TextQ,ImageListQ)",
    "Intersect(ImageListQ,TableQ)",
]
IMAGE_AS_SECOND_HOP_QUESTION_TYPES = [
    "Compose(ImageQ,TableQ)",
    "Compose(ImageQ,TextQ)",
    "Intersect(ImageListQ,TextQ)",
]

# Every question type must be answered either as a single-hop question or as
# the second hop of a two-hop question.
assert (
    set(TEXT_SINGLE_HOP_QUESTION_TYPES)
    | set(TEXT_AS_SECOND_HOP_QUESTION_TYPES)
    | set(TABLE_SINGLE_HOP_QUESTION_TYPES)
    | set(TABLE_AS_SECOND_HOP_QUESTION_TYPES)
    | set(IMAGE_SINGLE_HOP_QUESTION_TYPES)
    | set(IMAGE_AS_SECOND_HOP_QUESTION_TYPES)
) == set(ALL_QUESTION_TYPES)

# Within one modality a type is never both single-hop and second-hop.
assert set(TEXT_SINGLE_HOP_QUESTION_TYPES).isdisjoint(TEXT_AS_SECOND_HOP_QUESTION_TYPES)
assert set(TABLE_SINGLE_HOP_QUESTION_TYPES).isdisjoint(TABLE_AS_SECOND_HOP_QUESTION_TYPES)
assert set(IMAGE_SINGLE_HOP_QUESTION_TYPES).isdisjoint(IMAGE_AS_SECOND_HOP_QUESTION_TYPES)

SINGLE_HOP_QUESTION_TYPES = [
    *TEXT_SINGLE_HOP_QUESTION_TYPES,
    *TABLE_SINGLE_HOP_QUESTION_TYPES,
    *IMAGE_SINGLE_HOP_QUESTION_TYPES,
]
MULTI_HOP_QUESTION_TYPES = [
    *TEXT_AS_SECOND_HOP_QUESTION_TYPES,
    *TABLE_AS_SECOND_HOP_QUESTION_TYPES,
    *IMAGE_AS_SECOND_HOP_QUESTION_TYPES,
]

# No multi-hop question type is listed twice.
assert len(set(MULTI_HOP_QUESTION_TYPES)) == len(MULTI_HOP_QUESTION_TYPES)

# The first-hop lists, taken together, cover exactly the multi-hop types.
assert (
    set(TEXT_AS_FIRST_HOP_QUESTION_TYPES)
    | set(TABLE_AS_FIRST_HOP_QUESTION_TYPES)
    | set(IMAGE_AS_FIRST_HOP_QUESTION_TYPES)
) == set(MULTI_HOP_QUESTION_TYPES)

# Single-hop plus multi-hop types make up the complete set.
assert set(SINGLE_HOP_QUESTION_TYPES) | set(MULTI_HOP_QUESTION_TYPES) == set(ALL_QUESTION_TYPES)
def process_question_for_implicit_decomp(question, question_type, hop=0, bridge_entity='', sep_token='[SEP]'):
    """Build the separator-delimited input string for one decomposition hop.

    The result has four fields joined by ``" {sep_token} "``: the question
    type, ``HOP=<hop>``, the bridge entity, and the question itself.

    Args:
        question: The question text.
        question_type: The question type label.
        hop: Hop index written into the ``HOP=`` field.
        bridge_entity: A single value, or a list/set of strings that is
            joined with ``"; "`` before being embedded.
        sep_token: Token placed (surrounded by single spaces) between fields.

    Returns:
        The assembled string.
    """
    if isinstance(bridge_entity, (list, set)):
        bridge_entity = "; ".join(bridge_entity)
    fields = (question_type, f'HOP={hop}', bridge_entity, question)
    delimiter = f' {sep_token} '
    return delimiter.join(format(field) for field in fields)
def extract_numbers_from_str(s):
    """Return the numeric tokens found in a whitespace-separated string.

    Each token has its commas removed and is then parsed as an ``int`` or,
    failing that, as a ``float``. Tokens that parse as neither are skipped.

    Zero values (``"0"``, ``"0.0"``) are kept: the previous truthiness check
    silently dropped them. Commas are stripped before the float parse as
    well, so ``"1,000.5"`` is handled the same way as ``"1,000"``.

    Args:
        s: The input string.

    Returns:
        A list of ``int``/``float`` values in order of appearance.
    """
    numbers = []
    for token in s.split():
        cleaned = token.replace(",", "")
        try:
            num = int(cleaned)
        except ValueError:
            try:
                num = float(cleaned)
            except ValueError:
                # Not a number at all; skip this token.
                continue
        numbers.append(num)
    return numbers
def read_jsonl(filename):
    """Load a JSON Lines file into a list of decoded objects.

    The file is read as UTF-8 (matching how the prediction file is opened
    elsewhere in this module) rather than with the platform default
    encoding. Blank or whitespace-only lines are skipped instead of being
    passed to the JSON parser.

    Args:
        filename: Path of the ``.jsonl`` file.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, 'r', encoding="utf-8") as f:
        # Iterate the file lazily; json.loads tolerates surrounding whitespace.
        return [json.loads(line) for line in f if line.strip()]
# From here through _match_numbers_if_present was originally copied from the evaluation code of DROP dataset:
# https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
def _remove_articles(text: str) -> str:
regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
return re.sub(regex, " ", text)
def _white_space_fix(text: str) -> str:
return " ".join(text.split())
# Characters stripped by _remove_punc (ASCII punctuation).
EXCLUDE = set(string.punctuation)


def _remove_punc(text: str) -> str:
    """Strip punctuation characters from *text* unless it is a number.

    Text that parses as a number is returned untouched so that decimal
    points and signs survive.
    """
    if _is_number(text):
        return text
    kept_chars = [ch for ch in text if ch not in EXCLUDE]
    return "".join(kept_chars)
def _lower(text: str) -> str:
return text.lower()
def _tokenize(text: str) -> List[str]:
return re.split(" |-", text)
def _normalize_answer(text: str) -> str:
    """Lower text and remove punctuation, articles and extra whitespace.

    The text is tokenized first; each token is lowercased, stripped of
    punctuation, number-normalized, has articles removed and whitespace
    collapsed. Tokens that end up empty are dropped and the rest are joined
    with single spaces.
    """
    cleaned_tokens = []
    for token in _tokenize(text):
        cleaned = _lower(token)
        cleaned = _remove_punc(cleaned)
        cleaned = _normalize_number(cleaned)
        cleaned = _remove_articles(cleaned)
        cleaned = _white_space_fix(cleaned)
        if cleaned.strip():
            cleaned_tokens.append(cleaned)
    return " ".join(cleaned_tokens).strip()
def _is_number(text: str) -> bool:
try:
float(text)
return True
except ValueError:
return False
def _is_word_number(text: str) -> bool:
    """Return True when ``word_to_num(text)`` succeeds, False on ``ValueError``."""
    try:
        word_to_num(text)
    except ValueError:
        return False
    return True
def _normalize_number(text: str) -> str:
    """Render numeric text in canonical ``str(float(...))`` form.

    Digit-style numbers and spelled-out numbers are both converted;
    anything else is returned unchanged.
    """
    if _is_number(text):
        return str(float(text))
    # TODO: this is not included in the original drop evaluation script, we need to have our own in the end anyways.
    if _is_word_number(text):
        return str(float(word_to_num(text)))
    return text
def _answer_to_bags(
    answer: Union[str, List[str], Tuple[str, ...]]
) -> Tuple[List[str], List[Set[str]]]:
    """Normalize an answer and split each of its spans into a token set.

    Args:
        answer: A single span, or a list/tuple of spans.

    Returns:
        A pair ``(normalized_spans, token_bags)`` where ``token_bags[i]`` is
        the set of whitespace-separated tokens of ``normalized_spans[i]``.
    """
    spans = answer if isinstance(answer, (list, tuple)) else [answer]
    normalized_spans: List[str] = [_normalize_answer(span) for span in spans]
    token_bags = [set(span.split()) for span in normalized_spans]
    return normalized_spans, token_bags
def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]:
    """
    Find the optimal 1-1 alignment between gold and predicted token bags and
    return the F1 obtained for each aligned slot.

    A gold/predicted pair only receives a non-zero score when the number
    check passes. The result is a NumPy array with
    ``max(len(gold), len(predicted))`` entries; slots left unmatched stay at
    zero, so surplus or missing spans pull the mean down.
    """
    n_gold, n_pred = len(gold), len(predicted)
    scores = np.zeros([n_gold, n_pred])
    for g_idx, g_bag in enumerate(gold):
        for p_idx, p_bag in enumerate(predicted):
            if not _match_numbers_if_present(g_bag, p_bag):
                continue
            scores[g_idx, p_idx] = _compute_f1(p_bag, g_bag)

    # linear_sum_assignment minimizes cost, so negate to maximize total F1.
    gold_rows, pred_cols = linear_sum_assignment(-scores)

    best = np.zeros([max(n_gold, n_pred)])
    for g_idx, p_idx in zip(gold_rows, pred_cols):
        best[g_idx] = max(best[g_idx], scores[g_idx, p_idx])
    return best
def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float:
intersection = len(gold_bag.intersection(predicted_bag))
if not predicted_bag:
precision = 1.0
else:
precision = intersection / float(len(predicted_bag))
if not gold_bag:
recall = 1.0
else:
recall = intersection / float(len(gold_bag))
f1 = (
(2 * precision * recall) / (precision + recall)
if not (precision == 0.0 and recall == 0.0)
else 0.0
)
return f1
def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]) -> bool:
    """Check that the prediction shares a number with gold, if gold has any.

    Returns True when the gold bag contains no numeric token, or when at
    least one numeric token appears in both bags; False otherwise.
    """
    gold_numbers = {word for word in gold_bag if _is_number(word)}
    predicted_numbers = {word for word in predicted_bag if _is_number(word)}
    if not gold_numbers:
        return True
    return bool(gold_numbers & predicted_numbers)
def acc(predicted, gold):
    """Exact-match score: 1.0 if the normalized spans agree, else 0.0.

    Two answers agree when their normalized spans form the same set and
    the span lists have the same length. Span order is ignored.

    NOTE: duplicate counts are not compared, so ``["a", "a", "b"]`` and
    ``["a", "b", "b"]`` are treated as equal.
    """
    predicted_spans, _ = _answer_to_bags(predicted)
    gold_spans, _ = _answer_to_bags(gold)
    same_members = set(predicted_spans) == set(gold_spans)
    same_count = len(predicted_spans) == len(gold_spans)
    return 1.0 if same_members and same_count else 0.0
def f1(predicted, gold):
    """Mean aligned token-level F1 between prediction and gold, rounded to 2 decimals."""
    _, predicted_token_bags = _answer_to_bags(predicted)
    _, gold_token_bags = _answer_to_bags(gold)
    per_bag_scores = _align_bags(predicted_token_bags, gold_token_bags)
    mean_score = np.mean(per_bag_scores)
    return round(mean_score, 2)
def metric_max_over_ground_truths(metric_fn, prediction, gold_answers):
    """Score *prediction* against each reference and return the best score.

    Args:
        metric_fn: Callable taking ``(prediction, gold_answer)``.
        prediction: The predicted answer.
        gold_answers: Iterable of reference answers.

    Returns:
        The maximum of the per-reference scores.

    Raises:
        ValueError: If ``gold_answers`` is empty.
    """
    scores = [metric_fn(prediction, reference) for reference in gold_answers]
    return max(scores)
def evaluate_predictions(predictions, gold_answers, example_types=None):
    """Score predictions against gold answers with the ``acc`` and ``f1`` metrics.

    To support multiple gold annotations, each value of `gold_answers` should
    be a list, with each item (either a string or a list) corresponding to
    one valid reference answer.

    Args:
        predictions: Mapping from question id to predicted answer.
        gold_answers: Mapping from question id to the list of reference
            answers. Every id here is scored; ids without a prediction get
            0.0 for all metrics and a message is printed.
        example_types: Optional mapping from question id to a group label.

    Returns:
        ``(eval_scores, instance_eval_results)``, plus a third element
        ``eval_scores_by_types`` when `example_types` is given.
    """
    eval_funcs = {
        "acc": acc,
        "f1": f1,
    }
    instance_eval_results = {}
    instance_eval_results_by_types = {}

    def _mean_scores(results):
        # Average every metric over the per-question results.
        return {
            metric: np.mean([result[metric] for result in results.values()])
            for metric in eval_funcs
        }

    for qas_id in gold_answers:
        ref_answers = gold_answers[qas_id]
        if qas_id in predictions:
            pred_answer = predictions[qas_id]
            scores = {
                metric: metric_max_over_ground_truths(func, pred_answer, ref_answers)
                for metric, func in eval_funcs.items()
            }
        else:
            print(f"Missing prediction for question {qas_id}, and all scores for this question are set to zero")
            scores = {metric: 0.0 for metric in eval_funcs}
        instance_eval_results[qas_id] = scores

        if example_types is not None:
            example_type = example_types[qas_id]
            per_type = instance_eval_results_by_types.setdefault(example_type, {})
            per_type[qas_id] = scores

    eval_scores = _mean_scores(instance_eval_results)
    if example_types is None:
        return eval_scores, instance_eval_results

    eval_scores_by_types = {
        example_type: _mean_scores(type_results)
        for example_type, type_results in instance_eval_results_by_types.items()
    }
    return eval_scores, instance_eval_results, eval_scores_by_types
def evaluate_prediction_file(prediction_path, gold_path):
    """Evaluate a prediction file against a gold JSONL file and print a report.

    Overall scores are printed first, followed by breakdowns per answer
    modality, per hop type (single vs. multi) and per question type.

    Args:
        prediction_path: Path to a UTF-8 JSON file mapping question id to
            predicted answer.
        gold_path: Path to a JSONL file. Each example must provide ``qid``,
            ``answers`` (items with ``answer`` and ``modality``) and
            ``metadata["type"]``.

    Returns:
        The overall ``eval_scores`` dict (metric name -> mean score).

    Raises:
        AssertionError: If the answers of one example do not all share a
            single modality.
    """
    # Use a context manager so the prediction file handle is always closed.
    with open(prediction_path, encoding="utf-8") as prediction_file:
        predicted_answers = json.load(prediction_file)
    examples = read_jsonl(gold_path)

    gold_answers, answer_modalities, hop_types, question_types = {}, {}, {}, {}
    for example in examples:
        qid = example["qid"]
        # Currently we only have one ground truth answer.
        # Even if there are multiple entries in example["answers"], the whole list should be regarded as one ref answer.
        # However, our script supports evaluation with multiple ref answers.
        # So, we will use an outer bracket here to pretend we have a list of ref answers.
        gold_answer = [str(item["answer"]) for item in example["answers"]]
        gold_answers[qid] = [gold_answer]

        answer_modality = {item["modality"] for item in example["answers"]}
        assert len(answer_modality) == 1, \
            f"Expected exactly one answer modality for question {qid}, got {sorted(map(str, answer_modality))}"
        answer_modalities[qid] = answer_modality.pop()

        question_type = example["metadata"]["type"]
        question_types[qid] = question_type
        hop_types[qid] = "Multi-hop" if question_type in MULTI_HOP_QUESTION_TYPES else "Single-hop"

    eval_scores, instance_eval_results = evaluate_predictions(predicted_answers, gold_answers)
    print("\n\nOverall result with different metrics: ")
    for metric, value in eval_scores.items():
        print(f"{metric}: {value}")

    modality_counts = Counter(answer_modalities.values())
    _, _, eval_scores_by_modalities = \
        evaluate_predictions(predicted_answers, gold_answers, answer_modalities)
    print("\n\nEval results for different modalities:")
    for answer_modality in sorted(eval_scores_by_modalities.keys()):
        result = eval_scores_by_modalities[answer_modality]
        print(f"{answer_modality}")
        print(f"# of examples: {modality_counts[answer_modality]}")
        for metric, value in result.items():
            print(f"{metric}: {value}")

    hop_type_counts = Counter(hop_types.values())
    _, _, eval_scores_by_hop_types = evaluate_predictions(predicted_answers, gold_answers, hop_types)
    print("\n\nType\tCount\tEM\tF1")
    for hop_type in sorted(eval_scores_by_hop_types.keys()):
        result = eval_scores_by_hop_types[hop_type]
        print(f"{hop_type}\t{hop_type_counts[hop_type]}\t{result['acc']}\t{result['f1']}")

    question_type_counts = Counter(question_types.values())
    _, _, eval_scores_by_qtypes = evaluate_predictions(predicted_answers, gold_answers, question_types)
    print("\n\nType\tCount\tEM\tF1")
    for question_type in sorted(eval_scores_by_qtypes.keys()):
        result = eval_scores_by_qtypes[question_type]
        print(f"{question_type}\t{question_type_counts[question_type]}\t{result['acc']}\t{result['f1']}")

    return eval_scores
class EvaluateTool(object):
    """Adapter that runs the MMQA metrics on paired predictions and gold records."""

    def __init__(self, args):
        """Store the configuration object; it is not used by the visible methods."""
        self.args = args

    def evaluate(self, preds, golds, section):
        """Score predictions against gold records and return the mean metrics.

        Args:
            preds: Iterable of predicted answer strings; multiple spans are
                separated by ``"|"``.
            golds: Iterable of dicts with an ``"id"`` and an
                ``"answer_text"`` string (spans separated by ``"|"``),
                paired positionally with `preds`.
            section: Unused by this method.

        Returns:
            A dict mapping metric name to its mean score.
        """
        gold_answers = {}
        predicted_answers = {}
        for pred, gold in zip(preds, golds):
            qid = gold["id"]
            gold_spans = [part.strip() for part in gold["answer_text"].split("|")]
            # Wrap in an outer list: the span list is one reference answer.
            gold_answers[qid] = [gold_spans]
            predicted_answers[qid] = [part.strip() for part in pred.split("|")]

        eval_scores, _ = evaluate_predictions(predicted_answers, gold_answers)
        return dict(eval_scores.items())