import json | |
#from utils.emails_ip_addresses_detection import detect_email_addresses | |
from privacy.util.code_detect.utils.emails_ip_addresses_detection import detect_email_addresses | |
from privacy.util.code_detect.utils.keys_detection import detect_keys | |
def postprocess_secrets(secrets):
    """Serialize detected secrets to JSON and report whether any were found.

    Returns a ``(matches, has_secrets)`` tuple. ``matches`` is the JSON dump
    of ``secrets`` when it is truthy, otherwise the JSON for an empty list;
    ``has_secrets`` is the matching boolean flag.
    """
    has_secrets = bool(secrets)
    # Any falsy input (empty list, None, ...) is stored as the JSON string "[]".
    matches = json.dumps(secrets if has_secrets else [])
    return matches, has_secrets
## DETECTION MODIFIED FOR FILE | |
def scan_pii_batch(examples, key_detector="other"):
    """Scan a batch of examples to detect PII and secrets.

    Args:
        examples: Either an iterable of records, each indexable by
            ``"content"`` to obtain its text, or a mapping whose ``"content"``
            entry is the sequence of texts (a column-oriented batch).
        key_detector: ``"regex"`` detects keys in the same regex pass used for
            emails and IP addresses; any other value detects keys with
            ``detect_keys`` (the detect-secrets tool) instead.

    Returns:
        A dict of three parallel lists with one entry per example:
        - secrets: (str) JSON-encoded list of the secrets/PII found
        - has_secrets: (bool) whether the example contains secrets/PII
        - number_secrets: (int) how many secrets/PII were found
    """
    # Local import so the block does not depend on a new top-of-file import.
    from collections.abc import Mapping

    if isinstance(examples, Mapping):
        # Column-oriented batch: {"content": [text, text, ...], ...}
        texts = examples["content"]
    else:
        # Row-oriented batch: [{"content": text, ...}, ...]
        texts = (example["content"] for example in examples)

    use_regex_for_keys = key_detector == "regex"

    list_secrets = []
    list_has_secrets = []
    number_secrets = []
    for text in texts:
        # Emails and IP addresses are always found with regexes; keys join
        # that pass only when the regex key detector is selected.
        if use_regex_for_keys:
            tag_types = {"KEY", "EMAIL", "IP_ADDRESS"}
        else:
            tag_types = {"EMAIL", "IP_ADDRESS"}
        secrets = list(detect_email_addresses(text, tag_types=tag_types))
        if not use_regex_for_keys:
            # for keys use detect-secrets tool
            secrets.extend(detect_keys(text))

        # To add these as new dataset columns every row needs the same shape,
        # so secrets are saved as JSON strings instead of lists.
        matches, has_secrets = postprocess_secrets(secrets)
        list_secrets.append(matches)
        list_has_secrets.append(has_secrets)
        number_secrets.append(len(secrets))

    return {
        "secrets": list_secrets,
        "has_secrets": list_has_secrets,
        "number_secrets": number_secrets,
    }
# def scan_pii_batch(examples, key_detector="other"): | |
# """Scan a batch of examples from a dataset to detect PII | |
# This add two columns to the dataset: | |
# - secrets: (list) of secrets/PII found | |
# - has_secrets: (bool) whether the example contains secrets/PII | |
# """ | |
# list_secrets = [] | |
# list_has_secrets = [] | |
# number_secrets = [] | |
# for text in examples["content"]: | |
# secrets = [] | |
# if key_detector == "regex": | |
# # use a regex to detect keys + emails + ips | |
# secrets = secrets + detect_email_addresses( | |
# text, tag_types={"KEY", "EMAIL", "IP_ADDRESS"} | |
# ) | |
# else: | |
# # detect emails and ip addresses with regexes | |
# secrets = secrets + detect_email_addresses( | |
# text, tag_types={"EMAIL", "IP_ADDRESS"} | |
# ) | |
# # for keys use detect-secrets tool | |
# secrets = secrets + detect_keys(text) | |
# # to add this as new columns to datasets we need the same number of samples in each row | |
# # we save secrets as json strings instead of lists | |
# matches, has_secrets = postprocess_secrets(secrets) | |
# list_secrets.append(matches) | |
# list_has_secrets.append(has_secrets) | |
# number_secrets.append(len(secrets)) | |
# return { | |
# "secrets": list_secrets, | |
# "has_secrets": list_has_secrets, | |
# "number_secrets": number_secrets, | |
# } | |