from typing import Dict, List

import numpy as np

from evaluation.evaluate_utils.utils import _align_bags


def calculate_f1_score(precision, recall):
    if precision + recall == 0:
        return 0  # Avoid division by zero when both precision and recall are 0.
    return 2 * (precision * recall) / (precision + recall)


def calc_recall(pred: Dict, gold: Dict, use_gold_for_eval: bool):
    from evaluation.evaluate_utils.evaluate_factory import get_evaluator_from_gold_answer

    recall = []
    for gold_key, gold_value in gold.items():
        pred_value = pred.get(gold_key)
        gold_value = fix_number(gold_value)
        pred_value = fix_number(pred_value)
        if gold_key not in pred:
            recall.append(0)
        else:
            # The factory maps a value type to a comparison function; the type is taken
            # from the gold value (recall direction) or the predicted value (precision direction).
            evaluator = (
                get_evaluator_from_gold_answer(type(gold_value))
                if use_gold_for_eval
                else get_evaluator_from_gold_answer(type(pred_value))
            )
            if type(pred_value) != type(gold_value):
                recall.append(0)
                continue
            recall.append(evaluator(pred_value, gold_value))
    avg_recall = np.average(recall)
    return avg_recall


def fix_number(number):
    # Normalize numeric-looking strings (e.g. "$3,5", "12%", "900 sqft") to floats;
    # values that cannot be parsed are returned unchanged.
    if isinstance(number, str):
        copy_ans = number
        copy_ans = ' '.join(' '.join(' '.join(copy_ans.split('$')).split('%')).split('sqft')).strip()
        copy_ans = copy_ans.strip()
        copy_ans = copy_ans.replace(',', '.')
        try:
            return float(copy_ans)
        except ValueError:
            return number
    elif isinstance(number, int):
        return float(number)
    else:
        return number


def evaluate_pair_of_dicts(pred: Dict, gold: Dict):
    # Recall scores gold keys against the prediction; precision runs the same pass
    # in the opposite direction; the pair score is their F1.
    recall = calc_recall(pred, gold, True)
    precision = calc_recall(gold, pred, False)
    f1 = calculate_f1_score(precision, recall)
    return f1


def evaluate_dicts(pred: List[Dict], gold: List[Dict]):
    # The prediction must be a dict, an empty list, or a list of dicts; anything else scores 0.
    if not (
        type(pred) == dict
        or len(pred) == 0
        or (type(pred) == list and type(pred[0]) == dict)
    ):
        return 0
    # Align predicted dicts with gold dicts and average the per-pair F1 scores.
    max_alignment_scores = _align_bags(pred, gold, evaluate_pair_of_dicts)
    return np.average(max_alignment_scores)
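

# Illustrative usage sketch (not part of the original module): the example inputs below
# are made up, and running it assumes `_align_bags` and the evaluator factory are
# importable as in the imports above.
if __name__ == "__main__":
    predicted = [{"price": "$3,5", "city": "Boston"}]
    reference = [{"price": 3.5, "city": "Boston"}]
    # fix_number() normalizes "$3,5" to 3.5, so both "price" values are compared as floats.
    print(evaluate_dicts(predicted, reference))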