Privacy / src /privacy /util /code_detect /pii_redaction.py
Kiran5's picture
Track large files and images with Git LFS
54fa0c8
import json
import random
import string
import ipaddress
# List of random private IP addresses to use as replacements
REPLACEMENTS_IP = {
"IPv4": ["172.16.31.10", "172.16.58.3", "172.16.17.32", "192.168.127.12", "192.168.3.11"],
"IPv6": [
"fd00:c2b6:b24b:be67:2827:688d:e6a1:6a3b",
"fd00:a516:7c1b:17cd:6d81:2137:bd2a:2c5b",
"fc00:e968:6179::de52:7100",
"fc00:db20:35b:7399::5",
"fdf8:f53e:61e4::18",
],
}
POPULAR_DNS_SERVERS = [
"8.8.8.8",
"8.8.4.4",
"1.1.1.1",
"1.0.0.1",
"76.76.19.19",
"76.223.122.150",
"9.9.9.9",
"149.112.112.112",
"208.67.222.222",
"208.67.220.220",
"8.26.56.26",
"8.20.247.20",
"94.140.14.14",
"94.140.15.15",
]
def load_json(sample):
try:
loaded = json.loads(sample)
if isinstance(loaded, list):
return loaded
else:
raise ValueError("Invalid JSON structure")
except (json.JSONDecodeError, TypeError, ValueError):
return sample
def random_replacements(n=10):
letters = string.ascii_lowercase
letters_digits = string.ascii_lowercase + string.digits
emails = ["".join(random.choice(letters) for i in range(5)) + "@example.com" for i in range(n)]
keys = ["".join(random.choice(letters_digits) for i in range(32)) for i in range(n)]
ip_addresses = REPLACEMENTS_IP
return {"EMAIL": emails, "KEY": keys, "IP_ADDRESS": ip_addresses}
def replace_ip(value, replacements_dict):
try:
ipaddress.IPv4Address(value)
return random.choice(replacements_dict["IP_ADDRESS"]["IPv4"])
except ValueError:
try:
ipaddress.IPv6Address(value)
return random.choice(replacements_dict["IP_ADDRESS"]["IPv6"])
except ValueError:
print("Invalid IP address")
return value
def is_private_ip(ip):
ip = ipaddress.ip_address(ip)
return ip.is_private
def redact_pii_text(text, secrets, replacements, add_references=True):
secrets = load_json(secrets)
if not secrets or not isinstance(secrets, list):
return text, "", False
modified = False
references = []
for secret in sorted(secrets, key=lambda x: x["start"], reverse=True):
print(f"Processing secret: {secret}")
if secret["tag"] == "IP_ADDRESS" and (is_private_ip(secret["value"]) or secret["value"] in POPULAR_DNS_SERVERS):
print(f"Skipping redaction for IP: {secret['value']}")
continue
modified = True
replacement_list = replacements.get(secret["tag"], [secret["value"]])
replacement = replacement_list[0] if isinstance(replacement_list, list) else replacement_list
# If replacement for IP_ADDRESS is a dictionary, extract the appropriate version
if secret["tag"] == "IP_ADDRESS" and isinstance(replacement, dict):
ip_version = "IPv6" if ":" in secret["value"] else "IPv4"
replacement = replacement[ip_version][0]
if add_references:
references.append(f"PI:{secret['tag']}:{replacement}END_PI")
text = text[:secret["start"]] + str(replacement) + text[secret["end"]:]
references = "".join(references) if add_references else ""
return text, references, modified
def redact_pii_batch(examples, replacements, add_references=True):
new_contents = []
references = []
modified = []
for example in examples:
text, secrets, has_secrets = example["content"], example["secrets"], example["has_secrets"]
if has_secrets:
if add_references:
new_text, reference, modif = redact_pii_text(
text, secrets[0], replacements, add_references
)
references.append(reference)
else:
new_text, modif = redact_pii_text(text, secrets[0], replacements)
new_contents.append(new_text)
modified.append(modif)
else:
new_contents.append(text)
references.append(text)
modified.append(False)
result = {"new_content": new_contents, "modified": modified}
if add_references:
result.update({"references": references})
return result