# File size: 4,245 Bytes | 54fa0c8
import json
import random
import string
import ipaddress
# Pools of private-range placeholder addresses used as stand-ins for redacted
# IPs. Keyed by IP version ("IPv4" / "IPv6"); each value is a list of
# candidate replacement strings.
REPLACEMENTS_IP = {
    "IPv4": [
        "172.16.31.10",
        "172.16.58.3",
        "172.16.17.32",
        "192.168.127.12",
        "192.168.3.11",
    ],
    "IPv6": [
        "fd00:c2b6:b24b:be67:2827:688d:e6a1:6a3b",
        "fd00:a516:7c1b:17cd:6d81:2137:bd2a:2c5b",
        "fc00:e968:6179::de52:7100",
        "fc00:db20:35b:7399::5",
        "fdf8:f53e:61e4::18",
    ],
}
# Addresses that redact_pii_text leaves untouched even when they are tagged
# as IP_ADDRESS (checked by exact string match).
POPULAR_DNS_SERVERS = [
    "8.8.8.8", "8.8.4.4",
    "1.1.1.1", "1.0.0.1",
    "76.76.19.19", "76.223.122.150",
    "9.9.9.9", "149.112.112.112",
    "208.67.222.222", "208.67.220.220",
    "8.26.56.26", "8.20.247.20",
    "94.140.14.14", "94.140.15.15",
]
def load_json(sample):
    """Decode ``sample`` as JSON and return the result only if it is a list.

    In every other case the original ``sample`` object is returned unchanged:
    when the text is not valid JSON, when ``json.loads`` rejects the input
    type, or when the decoded top-level value is not a list.
    """
    try:
        decoded = json.loads(sample)
    except (json.JSONDecodeError, TypeError, ValueError):
        # Not decodable: hand the input back as-is.
        return sample
    return decoded if isinstance(decoded, list) else sample
def random_replacements(n=10):
    """Build a fresh set of placeholder values for each supported PII tag.

    Args:
        n: How many e-mail addresses and how many keys to generate.

    Returns:
        A dict with three entries:
        ``"EMAIL"`` -- ``n`` strings of five random lowercase letters followed
        by ``@example.com``;
        ``"KEY"`` -- ``n`` strings of 32 random lowercase letters/digits;
        ``"IP_ADDRESS"`` -- the module-level ``REPLACEMENTS_IP`` mapping
        (the same object, not a copy).
    """
    alphabet = string.ascii_lowercase
    alphanumerics = string.ascii_lowercase + string.digits

    # All e-mails are generated before any key, one character draw at a time.
    emails = []
    for _ in range(n):
        local_part = "".join(random.choice(alphabet) for _ in range(5))
        emails.append(local_part + "@example.com")

    keys = []
    for _ in range(n):
        keys.append("".join(random.choice(alphanumerics) for _ in range(32)))

    return {"EMAIL": emails, "KEY": keys, "IP_ADDRESS": REPLACEMENTS_IP}
def replace_ip(value, replacements_dict):
    """Pick a random replacement address of the same IP version as ``value``.

    Args:
        value: Candidate address; tried as IPv4 first, then as IPv6.
        replacements_dict: Mapping whose ``"IP_ADDRESS"`` entry holds
            ``"IPv4"`` and ``"IPv6"`` sequences of replacement strings. It is
            only consulted once ``value`` has parsed successfully.

    Returns:
        A randomly chosen entry from the matching pool, or ``value`` itself
        (after printing ``Invalid IP address``) when it parses as neither
        version.
    """
    attempts = (
        (ipaddress.IPv4Address, "IPv4"),
        (ipaddress.IPv6Address, "IPv6"),
    )
    for parse, family in attempts:
        try:
            parse(value)
            return random.choice(replacements_dict["IP_ADDRESS"][family])
        except ValueError:
            # Not this IP version; try the next one.
            continue
    print("Invalid IP address")
    return value
def is_private_ip(ip):
    """Report whether ``ip`` is a private address.

    The argument is parsed with ``ipaddress.ip_address`` and the parsed
    object's ``is_private`` attribute is returned.

    Raises:
        ValueError: If ``ip`` is not a valid IPv4 or IPv6 address.
    """
    return ipaddress.ip_address(ip).is_private
def redact_pii_text(text, secrets, replacements, add_references=True):
    """Replace detected PII spans in ``text`` with placeholder values.

    Args:
        text: The string to redact.
        secrets: A list of detections, or a JSON string encoding such a list
            (decoded with ``load_json``). Each detection is a mapping with
            ``"tag"``, ``"value"``, ``"start"`` and ``"end"`` keys;
            ``start``/``end`` are used as slice offsets into ``text``.
        replacements: Mapping from tag to its replacement. A list value
            contributes its first element; any other value is used directly.
            For the ``"IP_ADDRESS"`` tag the resulting value may be a dict
            keyed by ``"IPv4"``/``"IPv6"``, in which case the first entry of
            the matching list is used. Tags missing from the mapping are
            "replaced" by their own value.
        add_references: When true, a ``PI:<tag>:<replacement>END_PI`` marker
            is recorded for every redacted span.

    Returns:
        A ``(text, references, modified)`` tuple. ``references`` is the
        concatenation of the markers in processing order (rightmost span
        first), or ``""`` when ``add_references`` is false. ``modified`` is
        true if at least one span was substituted. If ``secrets`` is empty or
        does not decode to a list, ``(text, "", False)`` is returned.

    Note:
        Each processed detection, and each skipped IP, is printed to stdout.
    """
    secrets = load_json(secrets)
    if not secrets or not isinstance(secrets, list):
        return text, "", False

    modified = False
    references = []
    # Work from the end of the text backwards so that the offsets of spans
    # not yet processed remain valid after each substitution.
    for secret in sorted(secrets, key=lambda x: x["start"], reverse=True):
        print(f"Processing secret: {secret}")
        tag, value = secret["tag"], secret["value"]

        if tag == "IP_ADDRESS":
            try:
                private = is_private_ip(value)
            except ValueError:
                # A span tagged as an IP that does not parse as one cannot be
                # classified as private; redact it instead of aborting the
                # whole text.
                private = False
            if private or value in POPULAR_DNS_SERVERS:
                print(f"Skipping redaction for IP: {secret['value']}")
                continue

        modified = True
        candidates = replacements.get(tag, [value])
        replacement = candidates[0] if isinstance(candidates, list) else candidates
        # If replacement for IP_ADDRESS is a dictionary, extract the
        # appropriate version.
        if tag == "IP_ADDRESS" and isinstance(replacement, dict):
            ip_version = "IPv6" if ":" in value else "IPv4"
            replacement = replacement[ip_version][0]

        if add_references:
            references.append(f"PI:{tag}:{replacement}END_PI")
        text = text[: secret["start"]] + str(replacement) + text[secret["end"] :]

    references = "".join(references) if add_references else ""
    return text, references, modified
def redact_pii_batch(examples, replacements, add_references=True):
    """Redact PII in every example of a batch.

    Args:
        examples: Iterable of mappings, each with ``"content"`` (the text),
            ``"secrets"`` (a sequence whose first element is passed to
            ``redact_pii_text``) and ``"has_secrets"`` (truthy when the
            example should be redacted).
        replacements: Replacement mapping forwarded to ``redact_pii_text``.
        add_references: Forwarded to ``redact_pii_text``; also controls
            whether the result contains a ``"references"`` entry.

    Returns:
        A dict with ``"new_content"`` and ``"modified"`` lists (one entry per
        example) and, when ``add_references`` is true, a ``"references"``
        list. For examples without secrets the content is passed through
        unchanged, ``modified`` is ``False`` and the reference entry is the
        unmodified text.
    """
    new_contents = []
    references = []
    modified = []
    for example in examples:
        text, secrets, has_secrets = example["content"], example["secrets"], example["has_secrets"]
        if not has_secrets:
            new_contents.append(text)
            references.append(text)
            modified.append(False)
            continue

        # redact_pii_text always returns three values, whatever the flag;
        # unpack all of them and forward the flag so no reference string is
        # built when the caller does not want one.
        new_text, reference, was_modified = redact_pii_text(
            text, secrets[0], replacements, add_references
        )
        new_contents.append(new_text)
        references.append(reference)
        modified.append(was_modified)

    result = {"new_content": new_contents, "modified": modified}
    if add_references:
        result["references"] = references
    return result
|