Spaces:

Kiran5
/

Privacy

Build error

File size: 7,256 Bytes

54fa0c8

import ipaddress
import logging
import random

from gibberish_detector import detector
# Tags that redact_pii_text skips entirely (they are never replaced).
# "USERNAME" is listed because the detection model produces false positives for it;
# per the original author, the StarCoder repo keeps it in its ignore list as well.
IGNORE = ["USERNAME","PASSWORD","AMBIGUOUS"]
# Pool of random private IP addresses to use as replacements, keyed by IP
# version ("IPv4" / "IPv6").
# NOTE(review): the active replace_ip() returns a fixed "<IP_ADDRESS>"
# placeholder and does not draw from this pool -- confirm whether the pool is
# still needed (it matches the commented-out replace_ip variant).
REPLACEMENTS_IP = {
    "IPv4": [
        "172.16.31.10",
        "172.16.58.3",
        "172.16.17.32",
        "192.168.127.12",
        "192.168.3.11",
    ],
    "IPv6": [
        "fd00:c2b6:b24b:be67:2827:688d:e6a1:6a3b",
        "fd00:a516:7c1b:17cd:6d81:2137:bd2a:2c5b",
        "fc00:e968:6179::de52:7100",
        "fc00:db20:35b:7399::5",
        "fdf8:f53e:61e4::18",
    ],
}

# DNS server addresses that must not be masked: redact_pii_text leaves an
# IP_ADDRESS span untouched when its value is exactly one of these strings.
# The comparison is a plain string match, so only these literal IPv4 spellings
# are recognised.
POPULAR_DNS_SERVERS = [
    "8.8.8.8",
    "8.8.4.4",
    "1.1.1.1",
    "1.0.0.1",
    "76.76.19.19",
    "76.223.122.150",
    "9.9.9.9",
    "149.112.112.112",
    "208.67.222.222",
    "208.67.220.220",
    "8.26.56.26",
    "8.20.247.20",
    "94.140.14.14",
    "94.140.15.15",
]


# Gibberish detector shared by all is_key() calls. It is created lazily on the
# first call that needs it, so the model file is read from disk at most once
# instead of once per candidate span.
_GIBBERISH_DETECTOR = None


def is_key(matched_str):
    """Check that the PII span is long enough and is gibberish rather than word-like.

    Args:
        matched_str (str): the detected span to evaluate.

    Returns:
        bool: True when the span has more than 8 characters and the gibberish
        detector classifies its lower-cased form as gibberish.

    Note:
        Spans of 8 characters or fewer are rejected before the detector is
        consulted, so the model file is only required once a long-enough span
        is seen.
    """
    # pip install gibberish-detector
    # download the training corpora from https://raw.githubusercontent.com/domanchi/gibberish-detector/master/examples/big.txt
    # run gibberish-detector train big.txt > big.model to generate the model (it takes 3 seconds)
    global _GIBBERISH_DETECTOR

    # Cheap length test first: short spans can never qualify as keys.
    if len(matched_str) <= 8:
        return False

    if _GIBBERISH_DETECTOR is None:
        _GIBBERISH_DETECTOR = detector.create_from_model(
            "privacy/util/code_detect/ner/pii_redaction/gibberish_data/big.model"
        )
    return _GIBBERISH_DETECTOR.is_gibberish(matched_str.lower())


def is_secret(matched_str):
    """Tell whether the PII span is long enough, i.e. has more than 3 characters."""
    span_length = len(matched_str)
    return span_length > 3


def is_full_name(matched_str):
    """Tell whether a detected name is a full name (two or more whitespace-separated parts)."""
    name_parts = matched_str.split()
    return len(name_parts) >= 2


def get_replacements():
    """Build the dictionary of replacements for PII (email, key, name, IP address).

    Returns:
        dict: maps each PII tag ("EMAIL", "KEY", "NAME", "IP_ADDRESS") to the
        list of placeholder strings that may replace a span with that tag.
        A new dictionary is built on every call.
    """
    return {
        "EMAIL": ["<EMAIL>"],
        "KEY": ["<KEY>"],
        "NAME": ["<NAME>"],
        "IP_ADDRESS": ["<IP_ADDRESS>"],
        # no "USERNAME" entry: that tag is listed in IGNORE and is never replaced
    }


# def replace_ip(value, replacements_dict):
#     """Replace an IP address with a synthetic IP address of the same format"""
#     try:
#         ipaddress.IPv4Address(value)
#         return random.choice(replacements_dict["IP_ADDRESS"]["IPv4"])
#     except ValueError:
#         try:
#             ipaddress.IPv6Address(value)
#             return random.choice(replacements_dict["IP_ADDRESS"]["IPv6"])
#         except ValueError:
#             # this doesn't happen if we already use ipaddress filter in the detection
#             print("Invalid IP address")
#             return value

def replace_ip(value):
    """Return the fixed ``<IP_ADDRESS>`` placeholder used in place of an IP address.

    The ``value`` argument is accepted but not inspected: every input yields
    the same placeholder string.
    """
    placeholder = "<IP_ADDRESS>"
    return placeholder


def is_secret_ip(ip):
    """Report whether ``ip`` is a private-network address or not an IP address at all.

    Returns:
        bool: True when ``ip`` cannot be parsed as an IPv4/IPv6 address, or when
        the parsed address reports ``is_private``; False otherwise.
    """
    try:
        parsed = ipaddress.ip_address(ip)
    except ValueError:
        # the value does not parse as an IP address
        return True
    else:
        return parsed.is_private


# Logger for redaction diagnostics. Secret values are deliberately never
# logged: only the tag and start offset of a span are reported.
_logger = logging.getLogger(__name__)


def redact_pii_text(text, secrets, replacements, add_references=False):
    """Redact PII in a text

    Each secret is a dict with the keys "tag", "value", "start" and "end";
    "start"/"end" are offsets into ``text``. Secrets are processed in order of
    "start". A secret is left untouched when its tag is in IGNORE, its value
    has 3 characters or fewer, it is an IP_ADDRESS that is private / not a
    valid address / a popular DNS server, it is a KEY that does not pass
    is_key, or it is a NAME that is not a full name.

    Args:
        text (str): text to redact
        secrets (list): list with the secrets to redact
        replacements (dict): dictionary of replacements for each PII type
        add_references (bool): whether to add references to the redacted text (delimiters to PII)
        for visualization
    Returns:
        tuple: ``(new_text, modified)`` by default, or
        ``(new_text, references, modified)`` when ``add_references`` is true.
        ``new_text`` is the text with redacted secrets, ``modified`` tells
        whether at least one secret was replaced, and ``references`` is the
        text with each replacement wrapped as ``PI:<tag>:<replacement>END_PI``
        (an empty string when nothing was replaced).
    """
    if not secrets:
        return (text, "", False) if add_references else (text, False)

    modified = False
    # store the secrets that were replaced here with their replacements
    replaced_secrets = {}
    subparts = []
    references = []
    # offset in `text` up to which output has already been produced
    step = 0
    for secret in sorted(secrets, key=lambda item: item["start"]):
        tag = secret["tag"]
        value = secret["value"]
        # Diagnostics only: the secret value itself must not reach the logs.
        _logger.debug("Processing %s secret starting at offset %s", tag, secret["start"])

        # some post-processing
        if tag in IGNORE or not is_secret(value):
            continue
        # skip if it's not actual ip address, is a popular DNS server or private IP address
        if tag == "IP_ADDRESS" and (
            is_secret_ip(value) or value in POPULAR_DNS_SERVERS
        ):
            continue
        if tag == "KEY" and not is_key(value):
            continue
        if tag == "NAME" and not is_full_name(value):
            continue
        # A span lying entirely inside text that was already redacted must not
        # move the cursor backwards, otherwise redacted text would be re-emitted.
        if secret["end"] <= step:
            continue

        modified = True
        subtext = text[step : secret["start"]]
        # NOTE(review): when no text precedes the secret (start of text, or
        # adjacent/overlapping secrets) a single space is emitted that is not
        # in the input. Kept as-is; confirm whether this is intended.
        subpart = subtext if subtext else " "
        subparts.append(subpart)
        # if secret is already in replaced_secrets, use the same replacement
        if value in replaced_secrets:
            replacement = replaced_secrets[value]
        else:
            if tag == "IP_ADDRESS":
                replacement = replace_ip(value)
            else:
                replacement = random.choice(replacements[tag])
            replaced_secrets[value] = replacement
        subparts.append(replacement)
        if add_references:
            references.append(subpart)
            references.append(f"PI:{tag}:{replacement}END_PI")
        step = secret["end"]

    # everything after the last redacted span (the whole text when all secrets were skipped)
    remainder = text[step:]
    new_text = "".join(subparts) + remainder
    if add_references:
        references = "".join(references) + remainder if references else ""
        return new_text, references, modified
    return new_text, modified


def redact_pii_batch(examples, replacements, add_references=True):
    """Redact PII in every example of a dataset batch.

    Args:
        examples (dict): batch holding parallel sequences under "content"
            (the texts) and "entities" (the detected secrets of each text)
        replacements (dict): dictionary of replacements for each PII type
        add_references (bool): whether to also return the reference texts
    Returns:
        dict: "new_content" (redacted texts) and "modified" (one flag per
        text), plus "references" when ``add_references`` is true. A text
        without entities is passed through unchanged and is its own reference.
    """
    redacted_texts, reference_texts, change_flags = [], [], []
    for content, entities in zip(examples["content"], examples["entities"]):
        if not entities:
            # nothing was detected for this example: keep the text as it is
            redacted_texts.append(content)
            reference_texts.append(content)
            change_flags.append(False)
            continue
        if add_references:
            redacted, reference, changed = redact_pii_text(
                content, entities, replacements, add_references
            )
            reference_texts.append(reference)
        else:
            redacted, changed = redact_pii_text(content, entities, replacements)
        redacted_texts.append(redacted)
        change_flags.append(changed)

    batch = {"new_content": redacted_texts, "modified": change_flags}
    if add_references:
        batch["references"] = reference_texts
    return batch