File size: 5,130 Bytes
54fa0c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
import tempfile

from detect_secrets import SecretsCollection
from detect_secrets.settings import transient_settings
from gibberish_detector import detector

# Secrets detection with detect-secrets tool


# Heuristic filters passed to detect-secrets as "filters_used".
# Some filters from the [original list](https://github.com/Yelp/detect-secrets/blob/master/docs/filters.md#built-in-filters)
# were removed based on their targets.
_FILTER_PATHS = (
    "detect_secrets.filters.heuristic.is_potential_uuid",
    "detect_secrets.filters.heuristic.is_likely_id_string",
    "detect_secrets.filters.heuristic.is_templated_secret",
    "detect_secrets.filters.heuristic.is_sequential_string",
)
filters = [{"path": dotted_path} for dotted_path in _FILTER_PATHS]
# Detector plugins passed to detect-secrets as "plugins_used".
_PLUGIN_NAMES = (
    "ArtifactoryDetector",
    "AWSKeyDetector",
    # the entropy detectors (Base64 especially) need the gibberish detector on top
    "Base64HighEntropyString",
    "HexHighEntropyString",
    "AzureStorageKeyDetector",
    "CloudantDetector",
    "DiscordBotTokenDetector",
    "GitHubTokenDetector",
    "IbmCloudIamDetector",
    "IbmCosHmacDetector",
    "JwtTokenDetector",
    "MailchimpDetector",
    "NpmDetector",
    "SendGridDetector",
    "SlackDetector",
    "SoftlayerDetector",
    "StripeDetector",
    "TwilioKeyDetector",
    # 3 plugins were removed for keyword:
    # BasicAuthDetector, KeywordDetector, PrivateKeyDetector
)
plugins = [{"name": plugin_name} for plugin_name in _PLUGIN_NAMES]


# Location of the trained gibberish model. The path is relative, so it is
# resolved against the current working directory of the running process.
_GIBBERISH_MODEL_PATH = 'privacy/util/code_detect/gibberish_data/big.model'
# Lazily-created detector, shared by all calls so the model file is read once.
_gibberish_detector = None


def is_gibberish(matched_str):
    """Checks to make sure the PII span is gibberish and not word like.

    Setup of the model file:
      * pip install gibberish-detector
      * download the training corpora from
        https://raw.githubusercontent.com/domanchi/gibberish-detector/master/examples/big.txt
      * run `gibberish-detector train big.txt > big.model` to generate the
        model (it takes 3 seconds)

    Args:
        matched_str (str): candidate string; it is lower-cased before being
            handed to the detector.
    Returns:
        Whatever the detector's ``is_gibberish`` returns for the lower-cased
        string (used as a boolean by callers).
    """
    global _gibberish_detector
    if _gibberish_detector is None:
        # Loading the model on every call would re-read the file once per
        # candidate secret; load it on first use and keep it around.
        _gibberish_detector = detector.create_from_model(_GIBBERISH_MODEL_PATH)
    return _gibberish_detector.is_gibberish(matched_str.lower())


def is_hash(content, value):
    """Second check if the value is a hash (after gibberish detector).

    A value is considered a hash when it is 32, 40 or 64 characters long and
    the text that precedes its first occurrence, on the same line, contains
    one of "sha", "md5", "hash" or "byte" (case-insensitive).

    Args:
        content (str): the full text that was scanned.
        value (str): the candidate secret to look up in ``content``.
    Returns:
        bool: True if the value looks like a hash, False otherwise (also when
        ``value`` does not occur in ``content``).
    """
    start = content.find(value)
    if start == -1:
        # TODO: fix this issue happened one for JS in the stack-smol, file did contain value
        print("Value not found in content, why this happened?")
        return False
    if len(value) not in (32, 40, 64):
        return False
    # Slice through the END of the value so the last split line is always the
    # one holding the value, then cut the value off to keep what precedes it.
    # Slicing only up to the start would give an empty list when the value
    # opens the content (IndexError) and the previous line when the value
    # opens a line.
    line_through_value = content[: start + len(value)].splitlines()[-1]
    preceding_text = line_through_value[: -len(value)].lower()
    keywords = ("sha", "md5", "hash", "byte")
    return any(keyword in preceding_text for keyword in keywords)

def file_has_hashes(content, coeff=0.02):
    """Tell whether the literals 'sha' or 'hash' are frequent in the file.

    Occurrences are counted case-insensitively over all lines. The result is
    True when either total exceeds ``int(coeff * number_of_newlines)``, i.e.
    more than 2% of the line count with the default ``coeff``.
    """
    limit = int(coeff * content.count("\n"))
    lowered_rows = [row.lower() for row in content.splitlines()]
    sha_hits = sum(row.count("sha") for row in lowered_rows)
    hash_hits = sum(row.count("hash") for row in lowered_rows)
    # The counts only ever grow, so comparing the totals gives the same answer
    # as checking after every line.
    return sha_hits > limit or hash_hits > limit

def get_indexes(text, value):
    """Find every non-overlapping occurrence of ``value`` in ``text``.

    The search runs left to right and resumes right after each hit, so
    overlapping occurrences are not reported.

    Args:
        text (str): text to search in.
        value (str): substring to look for.
    Returns:
        list[tuple[int, int]]: ``(start, end)`` offsets into ``text`` for each
        occurrence, with ``end`` exclusive. Empty when ``value`` is empty or
        absent.
    """
    if not value:
        # An empty needle matches everywhere without moving the cursor, which
        # would never terminate.
        return []
    width = len(value)
    spans = []
    # Search with a start offset instead of re-slicing the text on each hit.
    position = text.find(value)
    while position != -1:
        spans.append((position, position + width))
        position = text.find(value, position + width)
    return spans


def detect_keys(content, suffix=".txt"):
    """Detect secret keys in content using detect-secrets tool.

    The content is written to a temporary file, scanned with the module-level
    ``plugins`` and ``filters``, and each reported secret is kept only if it
    passes the gibberish check and is not considered a hash (neither by
    ``is_hash`` nor by ``file_has_hashes``). Every occurrence of a kept secret
    in ``content`` yields one match; a secret value reported more than once by
    the scan is only expanded once, so matches are not duplicated.

    Args:
        content (str): string containing the text to be analyzed.
        suffix (str): suffix of the file
    Returns:
        A list of dicts containing the tag type, the matched string, and the start and
        end indices of the match (keys "tag", "value", "start", "end")."""

    # The scanner is given a file name, so the text goes through a temp file.
    # delete=False lets the file be closed before it is scanned; it is removed
    # explicitly below, even when writing or scanning fails.
    fp = tempfile.NamedTemporaryFile(
        suffix=suffix, delete=False, mode="w", encoding='utf-8'
    )
    try:
        with fp:
            fp.write(content)
        secrets = SecretsCollection()
        with transient_settings(
            {"plugins_used": plugins, "filters_used": filters}
        ):
            secrets.scan_file(fp.name)
    finally:
        os.unlink(fp.name)

    # Only one file was scanned, so only the first entry of the results is used.
    found = next(iter(secrets.data.values()), ())
    matches = []
    seen_values = set()
    hashes_in_file = None  # file-level verdict, computed at most once
    for secret in found:
        value = secret.secret_value
        if value in seen_values:
            # All occurrences of this value were already expanded.
            continue
        seen_values.add(value)
        if not is_gibberish(value):
            continue
        if is_hash(content, value):
            continue
        if hashes_in_file is None:
            hashes_in_file = file_has_hashes(content)
        if hashes_in_file:
            continue
        matches.extend(
            {
                "tag": "KEY",
                "value": value,
                "start": start,
                "end": end,
            }
            for start, end in get_indexes(content, value)
        )
    return matches