""" Evaluation for two strings or list of strings. Code taken from the DROP benchmark - https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py """ from collections import defaultdict from typing import List, Set, Tuple, Union import string import re import numpy as np from scipy.optimize import linear_sum_assignment # From here through _normalize_answer was originally copied from: # https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ # Then cleaned up and modified a bit. def _remove_articles(text: str) -> str: regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) return re.sub(regex, " ", text) def _white_space_fix(text: str) -> str: return " ".join(text.split()) EXCLUDE = set(string.punctuation) def _remove_punc(text: str) -> str: if not _is_number(text): return "".join(ch for ch in text if ch not in EXCLUDE) else: return text def _lower(text: str) -> str: return text.lower() def _tokenize(text: str) -> List[str]: return re.split(" |-", text) def _normalize_answer(text: str) -> str: """Lower text and remove punctuation, articles and extra whitespace.""" parts = [ _white_space_fix( _remove_articles(_normalize_number(_remove_punc(_lower(token)))) ) for token in _tokenize(text) ] parts = [part for part in parts if part.strip()] normalized = " ".join(parts).strip() return normalized def _is_number(text: str) -> bool: try: float(text) return True except ValueError: return False def _normalize_number(text: str) -> str: if _is_number(text): return str(float(text)) else: return text def _answer_to_bags( answer: Union[str, List[str], Tuple[str, ...]] ) -> Tuple[List[str], List[Set[str]]]: if isinstance(answer, (list, tuple)): raw_spans = answer else: raw_spans = [answer] normalized_spans: List[str] = [] token_bags = [] for raw_span in raw_spans: normalized_span = _normalize_answer(raw_span) normalized_spans.append(normalized_span) token_bags.append(set(normalized_span.split())) return normalized_spans, token_bags def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]: """ Takes gold and predicted answer sets and first finds the optimal 1-1 alignment between them and gets maximum metric values over all the answers. """ scores = np.zeros([len(gold), len(predicted)]) for gold_index, gold_item in enumerate(gold): for pred_index, pred_item in enumerate(predicted): if _match_numbers_if_present(gold_item, pred_item): scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item) row_ind, col_ind = linear_sum_assignment(-scores) max_scores = np.zeros([max(len(gold), len(predicted))]) for row, column in zip(row_ind, col_ind): max_scores[row] = max(max_scores[row], scores[row, column]) return max_scores def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float: intersection = len(gold_bag.intersection(predicted_bag)) if not predicted_bag: precision = 1.0 else: precision = intersection / float(len(predicted_bag)) if not gold_bag: recall = 1.0 else: recall = intersection / float(len(gold_bag)) f1 = ( (2 * precision * recall) / (precision + recall) if not (precision == 0.0 and recall == 0.0) else 0.0 ) return f1 def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]) -> bool: gold_numbers = set() predicted_numbers = set() for word in gold_bag: if _is_number(word): gold_numbers.add(word) for word in predicted_bag: if _is_number(word): predicted_numbers.add(word) if (not gold_numbers) or gold_numbers.intersection(predicted_numbers): return True return False def get_metrics( predicted: Union[str, List[str], Tuple[str, ...]], gold: Union[str, List[str], Tuple[str, ...]], ) -> Tuple[float, float]: """ Takes a predicted answer and a gold answer (that are both either a string or a list of strings), and returns exact match and the DROP F1 metric for the prediction. If you are writing a script for evaluating objects in memory (say, the output of predictions during validation, or while training), this is the function you want to call, after using :func:`answer_json_to_strings` when reading the gold answer from the released data file. """ predicted_bags = _answer_to_bags(predicted) gold_bags = _answer_to_bags(gold) if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len( gold_bags[0] ): exact_match = 1.0 else: exact_match = 0.0 f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1]) f1 = np.mean(f1_per_bag) f1 = round(f1, 2) return exact_match, f1 def evaluate_strings(prediction, gold): if type(prediction) != list and type(prediction) != str: prediction = str(prediction) if type(gold) != list and type(gold) != str: gold = str(gold) try: predicted_bags = _answer_to_bags(prediction) gold_bags = _answer_to_bags(gold) f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1]) f1 = np.mean(f1_per_bag) except Exception: f1 = 0.0 return f1