File size: 3,469 Bytes
54fa0c8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import json
#from utils.emails_ip_addresses_detection import detect_email_addresses
from privacy.util.code_detect.utils.emails_ip_addresses_detection import detect_email_addresses
from privacy.util.code_detect.utils.keys_detection import detect_keys
def postprocess_secrets(secrets):
    """Serialize detected secrets to a JSON string and flag whether any exist.

    Args:
        secrets: the detections for one example; any falsy value
            (empty list, ``None``) is treated as "nothing found".

    Returns:
        A ``(matches, has_secrets)`` tuple where ``matches`` is the JSON
        encoding of ``secrets`` (``"[]"`` when nothing was found) and
        ``has_secrets`` is the corresponding boolean.
    """
    has_secrets = bool(secrets)
    # Falsy inputs are normalised to an empty JSON list so the stored
    # value always has the same shape.
    matches = json.dumps(secrets if has_secrets else [])
    return matches, has_secrets
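# Detection modified for per-file input: `examples` is iterated as a sequence of
# records (each read via example["content"]) instead of the batched
# examples["content"] column used by the commented-out variant further down.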
def scan_pii_batch(examples, key_detector="other"):
    """Scan a batch of examples for secrets/PII.

    Args:
        examples: iterable of records; the text to scan is read from
            each record's ``"content"`` entry.
        key_detector: ``"regex"`` asks ``detect_email_addresses`` for the
            KEY, EMAIL and IP_ADDRESS tags in a single call; any other
            value asks it for EMAIL and IP_ADDRESS only and then appends
            the result of ``detect_keys``.

    Returns:
        A dict of three parallel lists, one entry per example:
        - secrets: JSON string of the secrets/PII found
        - has_secrets: (bool) whether the example contains secrets/PII
        - number_secrets: (int) how many secrets/PII were found
    """
    regex_only = key_detector == "regex"
    columns = {"secrets": [], "has_secrets": [], "number_secrets": []}

    for example in examples:
        text = example["content"]
        if regex_only:
            # a single regex pass covers keys + emails + ips
            found = [] + detect_email_addresses(
                text, tag_types={"KEY", "EMAIL", "IP_ADDRESS"}
            )
        else:
            # emails and ip addresses come from the regexes ...
            found = [] + detect_email_addresses(
                text, tag_types={"EMAIL", "IP_ADDRESS"}
            )
            # ... while keys are handled by the detect-secrets tool
            found = found + detect_keys(text)

        # every column needs exactly one value per example, so the
        # detections are stored as a JSON string rather than a list
        matches, has_secrets = postprocess_secrets(found)
        columns["secrets"].append(matches)
        columns["has_secrets"].append(has_secrets)
        columns["number_secrets"].append(len(found))

    return columns
# def scan_pii_batch(examples, key_detector="other"):
# """Scan a batch of examples from a dataset to detect PII
# This add two columns to the dataset:
# - secrets: (list) of secrets/PII found
# - has_secrets: (bool) whether the example contains secrets/PII
# """
# list_secrets = []
# list_has_secrets = []
# number_secrets = []
# for text in examples["content"]:
# secrets = []
# if key_detector == "regex":
# # use a regex to detect keys + emails + ips
# secrets = secrets + detect_email_addresses(
# text, tag_types={"KEY", "EMAIL", "IP_ADDRESS"}
# )
# else:
# # detect emails and ip addresses with regexes
# secrets = secrets + detect_email_addresses(
# text, tag_types={"EMAIL", "IP_ADDRESS"}
# )
# # for keys use detect-secrets tool
# secrets = secrets + detect_keys(text)
# # to add this as new columns to datasets we need the same number of samples in each row
# # we save secrets as json strings instead of lists
# matches, has_secrets = postprocess_secrets(secrets)
# list_secrets.append(matches)
# list_has_secrets.append(has_secrets)
# number_secrets.append(len(secrets))
# return {
# "secrets": list_secrets,
# "has_secrets": list_has_secrets,
# "number_secrets": number_secrets,
# }
|