File size: 5,130 Bytes
54fa0c8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import os
import tempfile
from detect_secrets import SecretsCollection
from detect_secrets.settings import transient_settings
from gibberish_detector import detector
# Secrets detection with detect-secrets tool
# Heuristic filters handed to detect-secrets. Only part of the
# [original list](https://github.com/Yelp/detect-secrets/blob/master/docs/filters.md#built-in-filters)
# is used: some filters were removed based on their targets.
filters = [
    {"path": "detect_secrets.filters.heuristic." + heuristic}
    for heuristic in (
        "is_potential_uuid",
        "is_likely_id_string",
        "is_templated_secret",
        "is_sequential_string",
    )
]
# detect-secrets plugins enabled for the scan, in the order they are declared.
# Three plugins are deliberately left out (removed for keyword):
# BasicAuthDetector, KeywordDetector and PrivateKeyDetector.
plugins = [
    {"name": plugin_name}
    for plugin_name in (
        "ArtifactoryDetector",
        "AWSKeyDetector",
        # the entropy detectors (especially Base64) need the gibberish
        # detector on top
        "Base64HighEntropyString",
        "HexHighEntropyString",
        "AzureStorageKeyDetector",
        "CloudantDetector",
        "DiscordBotTokenDetector",
        "GitHubTokenDetector",
        "IbmCloudIamDetector",
        "IbmCosHmacDetector",
        "JwtTokenDetector",
        "MailchimpDetector",
        "NpmDetector",
        "SendGridDetector",
        "SlackDetector",
        "SoftlayerDetector",
        "StripeDetector",
        "TwilioKeyDetector",
    )
]
# Location of the trained gibberish-detector model. To (re)generate it:
#   pip install gibberish-detector
#   download the training corpora from
#   https://raw.githubusercontent.com/domanchi/gibberish-detector/master/examples/big.txt
#   run `gibberish-detector train big.txt > big.model` (it takes 3 seconds)
# Alternative location: 'gibberish_data/big.model'
_GIBBERISH_MODEL_PATH = 'privacy/util/code_detect/gibberish_data/big.model'

# Lazily-loaded detector instance, shared by all calls in this process.
_gibberish_model = None


def _get_gibberish_model():
    """Load the gibberish model on first use and reuse it afterwards."""
    global _gibberish_model
    if _gibberish_model is None:
        _gibberish_model = detector.create_from_model(_GIBBERISH_MODEL_PATH)
    return _gibberish_model


def is_gibberish(matched_str):
    """Checks to make sure the PII span is gibberish and not word like

    The string is lower-cased before being handed to the gibberish detector.
    The model file is read from disk only once per process instead of on every
    call, since this function is invoked for each detected secret.

    Args:
        matched_str (str): candidate secret to classify.
    Returns:
        The detector's verdict for the lower-cased string.
    """
    return _get_gibberish_model().is_gibberish(matched_str.lower())
def is_hash(content, value):
    """Second check if the value is a hash (after gibberish detector)

    A value is considered a hash when it is 32, 40 or 64 characters long (the
    hex lengths of MD5, SHA-1 and SHA-256 digests) and the line holding its
    first occurrence mentions "sha", "md5", "hash" or "byte" somewhere around
    the value. The value itself is excluded from the keyword search so that a
    random secret containing e.g. "sha" does not classify itself as a hash.

    Args:
        content (str): full text in which the value was detected.
        value (str): the detected secret candidate.
    Returns:
        bool: True if the value looks like a hash, False otherwise (including
        when the value cannot be found in content).
    """
    start = content.find(value)
    if start == -1:
        # TODO: fix this issue, it happened once for a JS file in the
        # stack-smol even though the file did contain the value
        print("Value not found in content, why this happened?")
        return False
    if len(value) not in (32, 40, 64):
        return False
    end = start + len(value)
    # Bounds of the line that holds the value: from just after the previous
    # line break (or the start of the text) up to the next line break (or the
    # end of the text). This also works when the value is the very first thing
    # in the text or in its line.
    line_start = max(content.rfind("\n", 0, start), content.rfind("\r", 0, start)) + 1
    next_breaks = [
        pos for pos in (content.find("\n", end), content.find("\r", end)) if pos != -1
    ]
    line_end = min(next_breaks) if next_breaks else len(content)
    before = content[line_start:start].lower()
    after = content[end:line_end].lower()
    keywords = ("sha", "md5", "hash", "byte")
    return any(word in before or word in after for word in keywords)
def file_has_hashes(content, coeff = 0.02):
    """Checks if the file contains literals 'hash' or 'sha' for more than 2% nb_of_lines

    Occurrences are counted case-insensitively over the whole file (a line may
    contribute several) and each total is compared with
    ``int(coeff * number_of_lines)``. The number of lines is taken from the
    same split that is scanned, so a file without a trailing newline is not
    under-counted by one line.

    Args:
        content (str): text of the file.
        coeff (float): fraction of the line count used as threshold.
    Returns:
        bool: True if either literal occurs more often than the threshold.
    """
    lines = content.splitlines()
    threshold = int(coeff * len(lines))
    count_sha = 0
    count_hash = 0
    for line in lines:
        lowered = line.lower()
        count_sha += lowered.count("sha")
        count_hash += lowered.count("hash")
    return count_sha > threshold or count_hash > threshold
def get_indexes(text, value):
    """Locate every non-overlapping occurrence of value in text.

    The search runs left to right and resumes right after each hit, so
    overlapping occurrences are not reported twice. An empty value yields no
    spans (it would otherwise match forever at the same position).

    Args:
        text (str): text to search in.
        value (str): substring to look for.
    Returns:
        list of (start, end) tuples, end being exclusive.
    """
    if not value:
        return []
    width = len(value)
    spans = []
    # str.find with a start offset avoids re-slicing the text after each hit.
    pos = text.find(value)
    while pos != -1:
        spans.append((pos, pos + width))
        pos = text.find(value, pos + width)
    return spans
def detect_keys(content, suffix=".txt"):
    """Detect secret keys in content using detect-secrets tool

    The content is written to a temporary file, scanned with the module-level
    `plugins` and `filters`, and every reported value is post-filtered: it
    must be gibberish, must not look like a hash, and the file must not be
    hash-heavy. All occurrences of each remaining value are then located in
    content.

    Args:
        content (str): string containing the text to be analyzed.
        suffix (str): suffix of the file
    Returns:
        A list of dicts containing the tag type, the matched string, and the start and
        end indices of the match.
    """
    fp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False, mode="w", encoding='utf-8')
    try:
        # Close the file before scanning so the scanner can reopen it by name.
        with fp:
            fp.write(content)
        secrets = SecretsCollection()
        with transient_settings({"plugins_used": plugins, "filters_used": filters}):
            secrets.scan_file(fp.name)
    finally:
        # Always remove the temporary file, even if writing or scanning failed.
        os.unlink(fp.name)

    secrets_set = list(secrets.data.values())
    if not secrets_set:
        return []

    matches = []
    seen_values = set()
    has_hashes = None  # file-level check, computed at most once
    # Only one file was scanned, so only the first entry is read.
    for secret in secrets_set[0]:
        value = secret.secret_value
        # The same value may be reported more than once (e.g. by several
        # plugins); all its occurrences are collected the first time.
        if value in seen_values:
            continue
        seen_values.add(value)
        if not is_gibberish(value):
            continue
        if is_hash(content, value):
            continue
        if has_hashes is None:
            has_hashes = file_has_hashes(content)
        if has_hashes:
            continue
        for start, end in get_indexes(content, value):
            matches.append(
                {
                    "tag": "KEY",
                    "value": value,
                    "start": start,
                    "end": end,
                }
            )
    return matches
|