import json

# PII tags whose character spans are scored by this module.
TAGS = ['EMAIL', 'IP_ADDRESS', 'KEY']


def load_json(sample):
    """Parse a JSON-encoded annotation string, returning [] when it is malformed."""
    try:
        return json.loads(sample)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError, so bad JSON lands here.
        return []
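
# Hypothetical round-trip for load_json: well-formed annotations parse into a
# list of dicts, while malformed strings degrade to an empty annotation list:
#   load_json('[{"tag": "EMAIL", "start": 0, "end": 5}]')
#       -> [{'tag': 'EMAIL', 'start': 0, 'end': 5}]
#   load_json('not json') -> []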


def overlapped(a, b, alpha=0.8, beta=0.8):
    """Return True if intervals a and b overlap for more than the given
    fractions of their lengths: the overlap must exceed alpha of the
    reference length (b) and beta of the prediction length (a).
    """
    size_overlap = max(0, min(a[1], b[1]) - max(a[0], b[0]))
    ref_overlap = size_overlap / (b[1] - b[0])
    pred_overlap = size_overlap / (a[1] - a[0])
    return ref_overlap > alpha and pred_overlap > beta
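
# Worked example (hypothetical spans): a = (0, 10) and b = (2, 12) share
# min(10, 12) - max(0, 2) = 8 characters, so both ratios equal 8/10 = 0.8,
# which is not strictly greater than the default thresholds: no match.
# With b = (1, 10) the overlap is 9, the ratios are 9/10 = 0.9 and
# 9/9 = 1.0, and overlapped((0, 10), (1, 10)) returns True.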


def compare_intervals(references, predictions, alpha=0.8, beta=0.8):
    """Compare two lists of intervals and return the number
    of true positives, false positives and false negatives.

    >>> compare_intervals([(0, 7), (10, 20)], [(1, 8), (99, 119)], 0, 0)[0]
    {'TP': 1, 'FN': 1, 'FP': 1}
    """
    ref_intervals = sorted(references, key=lambda x: x[0])
    pred_intervals = sorted(predictions, key=lambda x: x[0])
    scores = {"TP": 0, "FN": 0, "FP": 0}
    detected_secrets = []
    matched_refs = set()
    for interval in pred_intervals:
        for i, target in enumerate(ref_intervals):
            if i not in matched_refs and overlapped(interval, target, alpha, beta):
                # The prediction sufficiently covers a still-unmatched reference.
                scores["TP"] += 1
                matched_refs.add(i)
                detected_secrets.append(interval)
                break
        else:
            # The prediction overlaps no available reference: false positive.
            scores["FP"] += 1
    # References no prediction claimed are false negatives; tracking matched
    # references keeps FN non-negative even when several predictions overlap
    # the same reference.
    scores["FN"] = len(ref_intervals) - len(matched_refs)
    return scores, detected_secrets
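
# Matching is greedy and one-to-one (hypothetical spans): with one reference
# (0, 10) and predictions [(0, 10), (1, 10)], the first prediction claims the
# reference and the second finds no unmatched reference left, so it counts as
# a false positive:
#   compare_intervals([(0, 10)], [(0, 10), (1, 10)])[0]
#       -> {'TP': 1, 'FN': 0, 'FP': 1}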


def recall_precision(metrics_dict):
    """Compute recall and precision for each tag."""
    metrics = {}
    for tag in TAGS:
        tp = metrics_dict[tag]['TP']
        fn = metrics_dict[tag]['FN']
        fp = metrics_dict[tag]['FP']
        if not (tp + fn + fp):
            # Nothing to detect and nothing predicted: perfect by convention.
            metrics[tag] = {'recall': 1.0, 'precision': 1.0}
        elif not (tp + fn) or not (tp + fp):
            # Recall or precision has a zero denominator: report 0 for both.
            metrics[tag] = {'recall': 0, 'precision': 0}
        else:
            metrics[tag] = {
                'recall': tp / (tp + fn),
                'precision': tp / (tp + fp),
            }
    return metrics
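
# Worked numbers (hypothetical counts): with TP = 8, FN = 2, FP = 4 for a tag,
# recall = 8 / (8 + 2) = 0.8 and precision = 8 / (8 + 4) ≈ 0.667; a tag with
# all-zero counts reports 1.0 / 1.0 under the convention above.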


def recall_precision_all_tags(metrics_dict):
    """Compute recall and precision pooled over all tags (micro-average)."""
    TP = sum(metrics_dict[tag]['TP'] for tag in TAGS)
    FN = sum(metrics_dict[tag]['FN'] for tag in TAGS)
    FP = sum(metrics_dict[tag]['FP'] for tag in TAGS)
    if not (TP + FN + FP):
        # Mirror the per-tag convention: no references and no predictions
        # count as a perfect score.
        return {'recall': 1.0, 'precision': 1.0}
    if not (TP + FN) or not (TP + FP):
        return {'recall': 0, 'precision': 0}
    return {'recall': TP / (TP + FN), 'precision': TP / (TP + FP)}
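
# Pooling counts before dividing yields a micro-average, so tags with more
# entities weigh more (hypothetical counts): EMAIL at TP = 9, FN = 1 and KEY
# at TP = 0, FN = 1 pool to recall = 9 / 11 ≈ 0.82, whereas the mean of the
# per-tag recalls (0.9 and 0.0) would be 0.45.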


def evaluate_pii(references, predictions, alpha=0.8, beta=0.8):
    """Evaluate predictions of PII against references, tag by tag."""
    metrics_dict = {}
    for tag in TAGS:
        ref_intervals = [(e['start'], e['end']) for e in references if e['tag'] == tag]
        pred_intervals = [(e['start'], e['end']) for e in predictions if e['tag'] == tag]
        metrics, _ = compare_intervals(ref_intervals, pred_intervals, alpha, beta)
        metrics_dict[tag] = metrics
    return metrics_dict
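
# Expected entity format (hypothetical values): each annotation is a dict with
# a 'tag' and character offsets 'start'/'end', e.g.
#   refs  = [{'tag': 'EMAIL', 'start': 10, 'end': 28}]
#   preds = [{'tag': 'EMAIL', 'start': 11, 'end': 28}]
#   evaluate_pii(refs, preds)['EMAIL'] -> {'TP': 1, 'FN': 0, 'FP': 0}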


def evaluate_pii_ds(dataset, pred_column='pii', ref_column='secrets', overall_score=False, alpha=0.8, beta=0.8):
    """Evaluate predictions of PII against references over a whole dataset."""
    metrics_dict = {tag: {'TP': 0, 'FN': 0, 'FP': 0} for tag in TAGS}
    for sample in dataset:
        ref_list = load_json(sample[ref_column])
        pred_list = load_json(sample[pred_column])
        sample_metrics = evaluate_pii(ref_list, pred_list, alpha, beta)
        for tag in TAGS:
            for metric in metrics_dict[tag]:
                metrics_dict[tag][metric] += sample_metrics[tag][metric]
    if overall_score:
        return recall_precision_all_tags(metrics_dict), metrics_dict
    return recall_precision(metrics_dict), metrics_dict
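

# Minimal smoke test, assuming only that the dataset is an iterable of samples
# indexable by column name (a plain list of dicts stands in here for e.g. a
# datasets.Dataset; the column names are the defaults above).
if __name__ == "__main__":
    toy_dataset = [
        {
            'secrets': json.dumps([{'tag': 'EMAIL', 'start': 10, 'end': 28}]),
            'pii': json.dumps([
                {'tag': 'EMAIL', 'start': 11, 'end': 28},
                {'tag': 'KEY', 'start': 40, 'end': 72},
            ]),
        }
    ]
    per_tag, raw_counts = evaluate_pii_ds(toy_dataset)
    print(per_tag)     # EMAIL is detected; the spurious KEY shows up as an FP
    print(raw_counts)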