File size: 4,245 Bytes
54fa0c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import json
import random
import string
import ipaddress

# List of random private IP addresses to use as replacements
REPLACEMENTS_IP = {
    "IPv4": ["172.16.31.10", "172.16.58.3", "172.16.17.32", "192.168.127.12", "192.168.3.11"],
    "IPv6": [
        "fd00:c2b6:b24b:be67:2827:688d:e6a1:6a3b",
        "fd00:a516:7c1b:17cd:6d81:2137:bd2a:2c5b",
        "fc00:e968:6179::de52:7100",
        "fc00:db20:35b:7399::5",
        "fdf8:f53e:61e4::18",
    ],
}

POPULAR_DNS_SERVERS = [
    "8.8.8.8",
    "8.8.4.4",
    "1.1.1.1",
    "1.0.0.1",
    "76.76.19.19",
    "76.223.122.150",
    "9.9.9.9",
    "149.112.112.112",
    "208.67.222.222",
    "208.67.220.220",
    "8.26.56.26",
    "8.20.247.20",
    "94.140.14.14",
    "94.140.15.15",
]

def load_json(sample):
    try:
        loaded = json.loads(sample)
        if isinstance(loaded, list):
            return loaded
        else:
            raise ValueError("Invalid JSON structure")
    except (json.JSONDecodeError, TypeError, ValueError):
        return sample

def random_replacements(n=10):
    letters = string.ascii_lowercase
    letters_digits = string.ascii_lowercase + string.digits
    emails = ["".join(random.choice(letters) for i in range(5)) + "@example.com" for i in range(n)]
    keys = ["".join(random.choice(letters_digits) for i in range(32)) for i in range(n)]
    ip_addresses = REPLACEMENTS_IP
    return {"EMAIL": emails, "KEY": keys, "IP_ADDRESS": ip_addresses}

def replace_ip(value, replacements_dict):
    try:
        ipaddress.IPv4Address(value)
        return random.choice(replacements_dict["IP_ADDRESS"]["IPv4"])
    except ValueError:
        try:
            ipaddress.IPv6Address(value)
            return random.choice(replacements_dict["IP_ADDRESS"]["IPv6"])
        except ValueError:
            print("Invalid IP address")
            return value

def is_private_ip(ip):
    ip = ipaddress.ip_address(ip)
    return ip.is_private


def redact_pii_text(text, secrets, replacements, add_references=True):
    secrets = load_json(secrets)
    if not secrets or not isinstance(secrets, list):
        return text, "", False

    modified = False
    references = []

    for secret in sorted(secrets, key=lambda x: x["start"], reverse=True):
        print(f"Processing secret: {secret}")
        if secret["tag"] == "IP_ADDRESS" and (is_private_ip(secret["value"]) or secret["value"] in POPULAR_DNS_SERVERS):
            print(f"Skipping redaction for IP: {secret['value']}")
            continue

        modified = True
        replacement_list = replacements.get(secret["tag"], [secret["value"]])
        replacement = replacement_list[0] if isinstance(replacement_list, list) else replacement_list

        # If replacement for IP_ADDRESS is a dictionary, extract the appropriate version
        if secret["tag"] == "IP_ADDRESS" and isinstance(replacement, dict):
            ip_version = "IPv6" if ":" in secret["value"] else "IPv4"
            replacement = replacement[ip_version][0]

        if add_references:
            references.append(f"PI:{secret['tag']}:{replacement}END_PI")

        text = text[:secret["start"]] + str(replacement) + text[secret["end"]:]

    references = "".join(references) if add_references else ""

    return text, references, modified





def redact_pii_batch(examples, replacements, add_references=True):
    new_contents = []
    references = []
    modified = []

    for example in examples:
        text, secrets, has_secrets = example["content"], example["secrets"], example["has_secrets"]
        if has_secrets:
            if add_references:
                new_text, reference, modif = redact_pii_text(
                    text, secrets[0], replacements, add_references
                )
                references.append(reference)
            else:
                new_text, modif = redact_pii_text(text, secrets[0], replacements)
            new_contents.append(new_text)
            modified.append(modif)
        else:
            new_contents.append(text)
            references.append(text)
            modified.append(False)

    result = {"new_content": new_contents, "modified": modified}
    if add_references:
        result.update({"references": references})
    return result