Spaces:

Kiran5
/

Privacy

Build error

App Files Files Community

Privacy / src /privacy /util /code_detect /utils /keys_detection.py

Kiran5

Track large files and images with Git LFS

54fa0c8 6 months ago

raw

history blame contribute delete

5.13 kB

	import os
	import tempfile

	from detect_secrets import SecretsCollection
	from detect_secrets.settings import transient_settings
	from gibberish_detector import detector

	# Secrets detection with detect-secrets tool


	# Heuristic filters applied on top of the plugins. Some filters from the
	# [original list](https://github.com/Yelp/detect-secrets/blob/master/docs/filters.md#built-in-filters)
	# were removed based on their targets.
	_FILTER_PATHS = (
	    "detect_secrets.filters.heuristic.is_potential_uuid",
	    "detect_secrets.filters.heuristic.is_likely_id_string",
	    "detect_secrets.filters.heuristic.is_templated_secret",
	    "detect_secrets.filters.heuristic.is_sequential_string",
	)
	filters = [{"path": filter_path} for filter_path in _FILTER_PATHS]

	# Enabled detect-secrets plugins, in the order they are handed to the tool.
	# Three keyword-oriented plugins are intentionally left out:
	# BasicAuthDetector, KeywordDetector and PrivateKeyDetector.
	_PLUGIN_NAMES = (
	    "ArtifactoryDetector",
	    "AWSKeyDetector",
	    # the entropy detectors esp Base64 need the gibberish detector on top
	    "Base64HighEntropyString",
	    "HexHighEntropyString",
	    "AzureStorageKeyDetector",
	    "CloudantDetector",
	    "DiscordBotTokenDetector",
	    "GitHubTokenDetector",
	    "IbmCloudIamDetector",
	    "IbmCosHmacDetector",
	    "JwtTokenDetector",
	    "MailchimpDetector",
	    "NpmDetector",
	    "SendGridDetector",
	    "SlackDetector",
	    "SoftlayerDetector",
	    "StripeDetector",
	    "TwilioKeyDetector",
	)
	plugins = [{"name": plugin_name} for plugin_name in _PLUGIN_NAMES]


	# Location of the trained gibberish model, relative to the working directory.
	# To (re)generate it:
	#   pip install gibberish-detector
	#   download the training corpora from https://raw.githubusercontent.com/domanchi/gibberish-detector/master/examples/big.txt
	#   run gibberish-detector train big.txt > big.model to generate the model (it takes 3 seconds)
	_GIBBERISH_MODEL_PATH = 'privacy/util/code_detect/gibberish_data/big.model'

	# Detector built from the model file; created on first use and then reused.
	_gibberish_detector = None


	def is_gibberish(matched_str):
	    """Checks to make sure the PII span is gibberish and not word like.

	    The model is loaded from disk only on the first call; later calls reuse
	    the same detector instead of re-reading the model for every candidate.

	    Args:
	        matched_str (str): the candidate string; it is lower-cased before
	            being handed to the detector.
	    Returns:
	        The result of the detector's ``is_gibberish`` check on the
	        lower-cased string.
	    """
	    global _gibberish_detector
	    if _gibberish_detector is None:
	        _gibberish_detector = detector.create_from_model(_GIBBERISH_MODEL_PATH)
	    return _gibberish_detector.is_gibberish(matched_str.lower())


	def is_hash(content, value):
	    """Second check if the value is a hash (after gibberish detector).

	    A value is considered a hash when its length is 32, 40 or 64 characters
	    and the text that precedes it on its own line contains one of "sha",
	    "md5", "hash" or "byte" (case-insensitive). Only the first occurrence of
	    the value in content is inspected.

	    Args:
	        content (str): full text in which the value was detected.
	        value (str): the detected candidate string.
	    Returns:
	        bool: True if the value looks like a hash, False otherwise, including
	        when the value cannot be found in content.
	    """
	    position = content.find(value)
	    if position == -1:
	        # TODO: fix this issue happened one for JS in the stack-smol, file did contain value
	        print("Value not found in content, why this happened?")
	        return False
	    if len(value) not in (32, 40, 64):
	        return False
	    # Text on the value's own line, up to where the value starts. A sentinel
	    # character is appended before splitting so that the last piece is always
	    # the current line: it is empty when the value starts the line (or the
	    # whole content) instead of falling back to the previous line or failing
	    # on an empty list.
	    line_prefix = (content[:position] + "x").splitlines()[-1][:-1].lower()
	    # if "sha" or "md5" are in content:
	    keywords = ("sha", "md5", "hash", "byte")
	    return any(keyword in line_prefix for keyword in keywords)

	def file_has_hashes(content, coeff=0.02):
	    """Tell whether 'sha' or 'hash' literals are frequent in the file.

	    Occurrences of "sha" and of "hash" are counted case-insensitively over
	    all lines. The limit is ``int(coeff * number of newline characters)``;
	    the file qualifies as soon as either count is strictly above that limit.

	    Args:
	        content (str): text of the file.
	        coeff (float): fraction of the newline count used as the limit.
	    Returns:
	        bool: True if either count exceeds the limit, False otherwise.
	    """
	    limit = int(coeff * content.count("\n"))
	    lowered_rows = [row.lower() for row in content.splitlines()]
	    sha_total = sum(row.count("sha") for row in lowered_rows)
	    hash_total = sum(row.count("hash") for row in lowered_rows)
	    return sha_total > limit or hash_total > limit

	def get_indexes(text, value):
	    """Locate every non-overlapping occurrence of value in text.

	    The search runs left to right and resumes right after the end of each
	    occurrence, so overlapping occurrences are not reported.

	    Args:
	        text (str): text to search in.
	        value (str): substring to look for.
	    Returns:
	        list of (start, end) tuples, in increasing order, where
	        ``text[start:end] == value``. Empty when value does not occur or
	        when value is the empty string.
	    """
	    if not value:
	        # An empty needle matches at every offset without ever advancing the
	        # cursor; report no occurrence instead of looping forever.
	        return []
	    width = len(value)
	    spans = []
	    start = text.find(value)
	    while start != -1:
	        spans.append((start, start + width))
	        # Continue past this occurrence without re-slicing the text.
	        start = text.find(value, start + width)
	    return spans


	def detect_keys(content, suffix=".txt"):
	    """Detect secret keys in content using detect-secrets tool.

	    The content is written to a temporary file, scanned with the module's
	    ``plugins`` and ``filters``, and the temporary file is removed afterwards,
	    even if writing or scanning fails. A detected value is kept only if it is
	    gibberish (``is_gibberish``), is not a hash (``is_hash``) and the file is
	    not hash-heavy (``file_has_hashes``).

	    Args:
	        content (str): string containing the text to be analyzed.
	        suffix (str): suffix of the file
	    Returns:
	        A list of dicts containing the tag type, the matched string, and the start and
	        end indices of the match, sorted by position. Each distinct value is
	        reported once per occurrence in content.
	    """
	    # detect-secrets scans files, so the content goes through a temporary file.
	    tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False, mode="w", encoding='utf-8')
	    try:
	        with tmp:
	            tmp.write(content)
	        secrets = SecretsCollection()
	        with transient_settings({"plugins_used": plugins, "filters_used": filters}):
	            secrets.scan_file(tmp.name)
	    finally:
	        # Always remove the temporary file, including on write/scan errors.
	        os.unlink(tmp.name)

	    matches = []
	    found = [secret for group in secrets.data.values() for secret in group]
	    # The hash-heavy check depends only on content: when it holds, every
	    # candidate would be discarded, so evaluate it once up front.
	    if not found or file_has_hashes(content):
	        return matches

	    seen_values = set()
	    for secret in found:
	        value = secret.secret_value
	        # The same value can be reported by several plugins; emitting its
	        # occurrences more than once would duplicate matches.
	        if value in seen_values:
	            continue
	        seen_values.add(value)
	        if not is_gibberish(value) or is_hash(content, value):
	            continue
	        matches.extend(
	            {"tag": "KEY", "value": value, "start": start, "end": end}
	            for start, end in get_indexes(content, value)
	        )
	    # Secrets come out of a set; sort so the output order is deterministic.
	    matches.sort(key=lambda match: (match["start"], match["end"]))
	    return matches