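"""Utilities for evaluating PII detection.

Predicted (start, end) spans are matched against reference annotations per tag
using interval overlap, then aggregated into recall and precision scores.
"""
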
import json

TAGS = ['EMAIL', 'IP_ADDRESS', 'KEY']


def load_json(sample):
    """Parse a JSON-encoded list of annotations, returning an empty list on invalid input."""
    try:
        return json.loads(sample)
    except ValueError:
        return []


def overlapped(a, b, alpha=0.8, beta=0.8):
    """Return True if intervals a (prediction) and b (reference) overlap by more than
    `alpha` of b's length and `beta` of a's length (both thresholds default to 80%)."""
    size_overlap = max(0, min(a[1], b[1]) - max(a[0], b[0]))
    ref_overlap = size_overlap / (b[1] - b[0])
    pred_overlap = size_overlap / (a[1] - a[0])
    return ref_overlap > alpha and pred_overlap > beta


def compare_intervals(references, predictions, alpha=0.8, beta=0.8):
    """Compare two lists of intervals and count true positives,
    false negatives and false positives.

    Returns the counts as a dict and the list of predicted intervals
    that matched a reference.
    >>> compare_intervals([(0, 7), (10, 20)], [(1, 8), (99, 119)], 0, 0)[0]
    {'TP': 1, 'FN': 1, 'FP': 1}
    """
    ref_intervals = sorted(references, key=lambda x: x[0])
    pred_intervals = sorted(predictions, key=lambda x: x[0])
    scores = {"TP": 0, "FN": 0, "FP": 0}
    detected_secrets = []
    for interval in pred_intervals:
        for target in ref_intervals:
            if overlapped(interval, target, alpha, beta):
                # the prediction is a true positive
                scores["TP"] += 1
                detected_secrets.append(interval)
                break
        else:
            # the prediction is a false positive
            scores["FP"] += 1
    # the rest of the targets that aren't detected are false negatives
    scores["FN"] += len(ref_intervals) - len(detected_secrets)
    return scores, detected_secrets


def recall_precision(metrics_dict):
    """Compute recall and precision for each tag"""
    metrics = {}
    for tag in TAGS:
        tp = metrics_dict[tag]['TP']
        fn = metrics_dict[tag]['FN']
        fp = metrics_dict[tag]['FP']
        if not (tp + fn + fp):
            # no references and no predictions for this tag: count it as a perfect score
            metrics[tag] = {'recall': 1.0, 'precision': 1.0}
        elif not (tp + fn) or not (tp + fp):
            # avoid division by zero when the tag has no references or no predictions
            metrics[tag] = {'recall': 0, 'precision': 0}
        else:
            metrics[tag] = {'recall': tp / (tp + fn), 'precision': tp / (tp + fp)}
    return metrics


def recall_precision_all_tags(metrics_dict):
    """Compute recall and precision aggregated over all tags"""
    metrics = {}
    TP = sum(metrics_dict[tag]['TP'] for tag in TAGS)
    FN = sum(metrics_dict[tag]['FN'] for tag in TAGS)
    FP = sum(metrics_dict[tag]['FP'] for tag in TAGS)
    if not (TP + FN) or not (TP + FP):
        # avoid division by zero when there are no references or no predictions
        metrics = {'recall': 0, 'precision': 0}
    else:
        metrics['recall'] = TP / (TP + FN)
        metrics['precision'] = TP / (TP + FP)
    return metrics


def evaluate_pii(references, predictions, alpha=0.8, beta=0.8):
    """Evaluate predictions of PII against references"""
    metrics_dict = {}
    for tag in TAGS:
        ref_intervals = [(e['start'], e['end']) for e in references if e['tag'] == tag]
        pred_intervals = [(e['start'], e['end']) for e in predictions if e['tag'] == tag]
        metrics, _ = compare_intervals(ref_intervals, pred_intervals, alpha, beta)
        metrics_dict[tag] = metrics
    return metrics_dict


def evaluate_pii_ds(dataset, pred_column='pii', ref_column="secrets", overall_score=False, alpha=0.8, beta=0.8):
    """Evaluate PII predictions against references over a whole dataset.

    `pred_column` and `ref_column` are expected to hold JSON-encoded lists of
    annotations. If `overall_score` is True, recall/precision are aggregated
    over all tags; otherwise they are reported per tag.
    """
    metrics_dict = {tag: {'TP': 0, 'FN': 0, 'FP': 0} for tag in TAGS}
    for i in range(len(dataset)):
        ref_list = load_json(dataset[i][ref_column])
        pred_list = load_json(dataset[i][pred_column])
        sample_metrics = evaluate_pii(ref_list, pred_list, alpha, beta)
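        # accumulate the per-tag TP/FN/FP counts over all samples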
        for tag in TAGS:
            for metric in metrics_dict[tag]:
                metrics_dict[tag][metric] += sample_metrics[tag][metric]
    if overall_score:
        return recall_precision_all_tags(metrics_dict), metrics_dict
    return recall_precision(metrics_dict), metrics_dict
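

if __name__ == "__main__":
    # Minimal usage sketch with made-up annotations (not part of the original module):
    # it shows how evaluate_pii and recall_precision fit together. Annotations are
    # dicts with 'start', 'end' and 'tag' keys.
    refs = [
        {'start': 10, 'end': 25, 'tag': 'EMAIL'},
        {'start': 40, 'end': 52, 'tag': 'IP_ADDRESS'},
    ]
    preds = [
        {'start': 11, 'end': 25, 'tag': 'EMAIL'},  # close match -> true positive
        {'start': 70, 'end': 90, 'tag': 'KEY'},  # no matching reference -> false positive
    ]
    counts = evaluate_pii(refs, preds)
    print(counts)
    print(recall_precision(counts))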