File size: 3,469 Bytes
54fa0c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import json

#from utils.emails_ip_addresses_detection import detect_email_addresses
from privacy.util.code_detect.utils.emails_ip_addresses_detection import detect_email_addresses
from privacy.util.code_detect.utils.keys_detection import detect_keys


def postprocess_secrets(secrets):
    """Serialize detected secrets and report whether any were found.

    Args:
        secrets: the findings collected for one example. Any falsy value
            (empty list, ``None``, ...) is treated as "nothing found".

    Returns:
        A ``(matches, has_secrets)`` tuple where ``matches`` is a JSON string
        (``"[]"`` when nothing was found) and ``has_secrets`` is a bool.
    """
    found_any = bool(secrets)
    # A falsy input is always serialized as an empty JSON list.
    payload = json.dumps(secrets if found_any else [])
    return payload, found_any

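## DETECTION MODIFIED FOR FILE INPUT: `examples` is iterated record by record
## (each record exposes a "content" field); the original version that read
## `examples["content"]` is kept commented out further below.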

def scan_pii_batch(examples, key_detector="other"):
    """Scan a batch of examples for secrets/PII.

    Args:
        examples: iterable of records; each record is indexed with
            ``["content"]`` to obtain the text to scan.
        key_detector: ``"regex"`` asks the e-mail/IP detector to also tag
            keys; any other value uses ``detect_keys`` for keys instead.

    Returns:
        A dict of three lists with one entry per example:
        - secrets: (str) JSON-encoded list of the secrets/PII found
        - has_secrets: (bool) whether the example contains secrets/PII
        - number_secrets: (int) how many secrets/PII were found
    """
    encoded_per_example = []
    flag_per_example = []
    count_per_example = []

    for record in examples:
        content = record["content"]
        keys_via_regex = key_detector == "regex"

        if keys_via_regex:
            # one regex pass covers keys + emails + ips
            wanted_tags = {"KEY", "EMAIL", "IP_ADDRESS"}
        else:
            # regexes only handle emails and ip addresses here
            wanted_tags = {"EMAIL", "IP_ADDRESS"}

        # Concatenate onto a fresh list so the detector's return value is
        # never aliased.
        found = [] + detect_email_addresses(content, tag_types=wanted_tags)
        if not keys_via_regex:
            # keys come from the detect-secrets tool in this mode
            found = found + detect_keys(content)

        # Every output column needs exactly one value per example, so the
        # findings are stored as a JSON string rather than a nested list.
        encoded, flagged = postprocess_secrets(found)
        encoded_per_example.append(encoded)
        flag_per_example.append(flagged)
        count_per_example.append(len(found))

    return {
        "secrets": encoded_per_example,
        "has_secrets": flag_per_example,
        "number_secrets": count_per_example,
    }
# def scan_pii_batch(examples, key_detector="other"):
#     """Scan a batch of examples from a dataset to detect PII
#     This add two columns to the dataset:
#     - secrets: (list) of secrets/PII found
#     - has_secrets: (bool) whether the example contains secrets/PII
#     """
#     list_secrets = []
#     list_has_secrets = []
#     number_secrets = []
#     for text in examples["content"]:
#         secrets = []
#         if key_detector == "regex":
#             # use a regex to detect keys + emails + ips
#             secrets = secrets + detect_email_addresses(
#                 text, tag_types={"KEY", "EMAIL", "IP_ADDRESS"}
#             )
#         else:
#             # detect emails and ip addresses with regexes
#             secrets = secrets + detect_email_addresses(
#                 text, tag_types={"EMAIL", "IP_ADDRESS"}
#             )
#             # for keys use detect-secrets tool
#             secrets = secrets + detect_keys(text)
#         # to add this as new columns to datasets we need the same number of samples in each row
#         # we save secrets as json strings instead of lists
#         matches, has_secrets = postprocess_secrets(secrets)
#         list_secrets.append(matches)
#         list_has_secrets.append(has_secrets)
#         number_secrets.append(len(secrets))
#     return {
#         "secrets": list_secrets,
#         "has_secrets": list_has_secrets,
#         "number_secrets": number_secrets,
#     }