import json

TAGS = ['EMAIL', 'IP_ADDRESS', 'KEY']

def load_json(sample):
    """Parse a JSON string, returning an empty list if it cannot be decoded."""
    try:
        return json.loads(sample)
    except ValueError:
        return []

def overlapped(a, b, alpha=0.8, beta=0.8):
    """Return True if intervals a and b overlap by more than the alpha fraction
    of b's length and the beta fraction of a's length (both default to 0.8)."""
    size_overlap = max(0, min(a[1], b[1]) - max(a[0], b[0]))
    ref_overlap = size_overlap / (b[1] - b[0])
    pred_overlap = size_overlap / (a[1] - a[0])
    return ref_overlap > alpha and pred_overlap > beta

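# Worked example for the thresholds above (illustrative, not from the original
# source): a prediction (1, 8) checked against a reference (0, 7) shares the
# span (1, 7), i.e. 6 characters, which is 6/7 ≈ 0.86 of both intervals, so
# overlapped() returns True with the default alpha = beta = 0.8.
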
def compare_intervals(references, predictions, alpha=0.8, beta=0.8):
    """Compare two lists of intervals and return the number
    of true positives, false positives and false negatives.

    >>> compare_intervals([(0, 7), (10, 20)], [(1, 8), (99, 119)], 0, 0)[0]
    {'TP': 1, 'FN': 1, 'FP': 1}
    """
    ref_intervals = sorted(references, key=lambda x: x[0])
    pred_intervals = sorted(predictions, key=lambda x: x[0])
    scores = {"TP": 0, "FN": 0, "FP": 0}
    detected_secrets = []
    for interval in pred_intervals:
        for target in ref_intervals:
            if overlapped(interval, target, alpha, beta):
                # the prediction matches a reference: true positive
                scores["TP"] += 1
                detected_secrets.append(interval)
                break
        else:
            # no reference matched this prediction: false positive
            scores["FP"] += 1
    # references that were never matched count as false negatives
    scores["FN"] += len(ref_intervals) - len(detected_secrets)
    return scores, detected_secrets

def recall_precision(metrics_dict):
    """Compute recall and precision for each tag."""
    metrics = {}
    for tag in TAGS:
        metrics[tag] = {}
        total = metrics_dict[tag]['TP'] + metrics_dict[tag]['FN'] + metrics_dict[tag]['FP']
        if total:
            if not (metrics_dict[tag]['TP'] + metrics_dict[tag]['FN']) or not (metrics_dict[tag]['TP'] + metrics_dict[tag]['FP']):
                # guard against division by zero when a tag has no references
                # (TP + FN == 0) or no predictions (TP + FP == 0)
                metrics[tag] = {'recall': 0, 'precision': 0}
            else:
                metrics[tag]['recall'] = metrics_dict[tag]['TP'] / (metrics_dict[tag]['TP'] + metrics_dict[tag]['FN'])
                metrics[tag]['precision'] = metrics_dict[tag]['TP'] / (metrics_dict[tag]['TP'] + metrics_dict[tag]['FP'])
        else:
            # no references and no predictions for this tag: count it as a perfect score
            metrics[tag] = {'recall': 1.0, 'precision': 1.0}
    return metrics

def recall_precision_all_tags(metrics_dict):
    """Compute micro-averaged recall and precision by pooling counts over all tags."""
    metrics = {}
    TP = sum(metrics_dict[tag]['TP'] for tag in TAGS)
    FN = sum(metrics_dict[tag]['FN'] for tag in TAGS)
    FP = sum(metrics_dict[tag]['FP'] for tag in TAGS)
    if not (TP + FN) or not (TP + FP):
        metrics = {'recall': 0, 'precision': 0}
    else:
        metrics['recall'] = TP / (TP + FN)
        metrics['precision'] = TP / (TP + FP)
    return metrics

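# Illustrative arithmetic (not from the original source): pooled counts of
# TP = 8, FN = 4, FP = 2 across the three tags give a micro-averaged
# recall of 8 / (8 + 4) ≈ 0.67 and precision of 8 / (8 + 2) = 0.8.
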
def evaluate_pii(references, predictions, alpha=0.8, beta=0.8):
    """Evaluate PII predictions against references.

    Both arguments are lists of entity dicts with 'start', 'end' and 'tag' keys;
    returns a dict of TP/FN/FP counts per tag.
    """
    metrics_dict = {}
    for tag in TAGS:
        ref_intervals = [(e['start'], e['end']) for e in references if e['tag'] == tag]
        pred_intervals = [(e['start'], e['end']) for e in predictions if e['tag'] == tag]
        metrics, _ = compare_intervals(ref_intervals, pred_intervals, alpha, beta)
        metrics_dict[tag] = metrics
    return metrics_dict

def evaluate_pii_ds(dataset, pred_column='pii', ref_column="secrets", overall_score=False, alpha=0.8, beta=0.8):
    """Evaluate PII predictions against references over a dataset whose
    ref_column and pred_column store JSON-encoded entity lists."""
    metrics_dict = {tag: {'TP': 0, 'FN': 0, 'FP': 0} for tag in TAGS}
    for i in range(len(dataset)):
        ref_list = load_json(dataset[i][ref_column])
        pred_list = load_json(dataset[i][pred_column])
        sample_metrics = evaluate_pii(ref_list, pred_list, alpha, beta)
        for tag in TAGS:
            for metric in metrics_dict[tag]:
                metrics_dict[tag][metric] += sample_metrics[tag][metric]
    if overall_score:
        return recall_precision_all_tags(metrics_dict), metrics_dict
    return recall_precision(metrics_dict), metrics_dict
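

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. The entity format
    # (dicts with 'start'/'end' character offsets and a 'tag' from TAGS) is
    # inferred from how evaluate_pii reads its inputs; the values below are
    # made up for illustration.
    references = [
        {"tag": "EMAIL", "start": 10, "end": 30},
        {"tag": "KEY", "start": 50, "end": 90},
    ]
    predictions = [
        {"tag": "EMAIL", "start": 11, "end": 31},         # near-exact match -> TP
        {"tag": "IP_ADDRESS", "start": 100, "end": 115},  # spurious span -> FP
    ]
    counts = evaluate_pii(references, predictions)
    print(recall_precision(counts))
    # EMAIL scores recall = precision = 1.0; KEY (missed reference) and
    # IP_ADDRESS (false positive only) both come out as 0 via the
    # zero-division guard in recall_precision.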