# Source path: Privacy/src/privacy/util/code_detect/regexdetection.py
# Repository page header: commit 54fa0c8 (Kiran5) - "Track large files and images with Git LFS"
## Regex-based PII detection and redaction for a code FILE (command-line variant, kept commented out below)
# import argparse
# import json
# import logging
# import random
# from pii_detection import scan_pii_batch
# from pii_redaction import redact_pii_batch, random_replacements
# def parse_args():
# parser = argparse.ArgumentParser(description="PII detection and redaction for a code file")
# parser.add_argument(
# "--input_code_file",
# required=True,
# type=str,
# help="Path to the input code file for PII detection and redaction",
# )
# parser.add_argument(
# "--output_file",
# required=True,
# type=str,
# help="Path to save the redacted code file",
# )
# parser.add_argument(
# "--batch_size",
# default=8,
# type=int,
# help="Batch size for the PII detection/redaction",
# )
# parser.add_argument(
# "--seed",
# default=0,
# type=int,
# help="Seed for random",
# )
# parser.add_argument(
# "--num_proc",
# default=8,
# type=int,
# help="Number of processes to use for PII detection/redaction",
# )
# parser.add_argument(
# "--no_redaction",
# action="store_true",
# help="If set, do not perform redaction",
# )
# parser.add_argument(
# "--load_replacements",
# default=True,
# help="If set, load replacements from file replacements.json",
# )
# parser.add_argument(
# "--add_reference_text",
# default=True,
# type=bool,
# help="If True, add reference text with PII between delimiters in the redacted text (used for visualization)",
# )
# return parser.parse_args()
# def main():
# logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.INFO)
# args = parse_args()
# # Read input code file
# with open(args.input_code_file, "r") as input_file:
# code_content = input_file.read()
# # Apply PII detection
# ds_pii = scan_pii_batch([{"content": code_content}])
# logging.info(f"PII detection results:\n{ds_pii}")
# logging.info(f"Number of samples that contained PII: {sum(ds_pii['has_secrets'])}")
# logging.info(f"Total number of secrets found: {sum(ds_pii['number_secrets'])}")
# # Redact PII in the code
# if not args.no_redaction:
# logging.info(f" ===== Applying PII redaction =====")
# random.seed(args.seed)
# # Use random replacements by default
# if args.load_replacements:
# with open("replacements.json", "r") as f:
# replacements = json.load(f)
# else:
# replacements = random_replacements()
# with open("random_replacements.json", "w") as f:
# json.dump(replacements, f)
# logging.info(f"Using the following replacements:\n{replacements}")
# ds_pii_redacted = redact_pii_batch(
# [{"content": code_content, "secrets": ds_pii['secrets'], "has_secrets": ds_pii['has_secrets'], "number_secrets": ds_pii['number_secrets']}],
# replacements=replacements,
# add_references=args.add_reference_text
# )
# redacted_code = ds_pii_redacted["new_content"][0] # Access the redacted code
# print("Redacted Code:")
# print(redacted_code)
# # Save the redacted code to the output file
# with open(args.output_file, "w") as output_file:
# output_file.write(redacted_code[0] if isinstance(redacted_code, list) else redacted_code)
# logging.info("Redacted code saved successfully.")
# if __name__ == "__main__":
# main()
# Regex-based PII detection and redaction for code passed in as TEXT
import json
import logging
import random
import os
from privacy.util.code_detect.pii_detection import scan_pii_batch
from privacy.util.code_detect.pii_redaction import redact_pii_batch, random_replacements
class code_detect:
    """Regex-based PII detection and redaction for code supplied as text."""

    @staticmethod
    def codeDetectRegex(
        input_code_text,
        *,
        seed=0,
        no_redaction=False,
        load_replacements=True,
        add_reference_text=True,
    ):
        """Detect PII in ``input_code_text`` and return the redacted content.

        The text is scanned with ``scan_pii_batch`` and, unless redaction is
        disabled, rewritten with ``redact_pii_batch``.

        Args:
            input_code_text: The code to scan; it is passed to the scanner as
                the ``"content"`` field of a single-sample batch.
            seed: Value passed to ``random.seed`` before redaction. Note that
                this reseeds the process-wide ``random`` generator.
            no_redaction: If true, only detection is run and the input is
                returned unchanged.
            load_replacements: If true, replacements are read from
                ``replacements.json``; otherwise fresh ones are produced by
                ``random_replacements()`` and written to
                ``random_replacements.json`` next to this module.
            add_reference_text: Forwarded to ``redact_pii_batch`` as
                ``add_references``.

        Returns:
            The first entry of the ``"new_content"`` field returned by
            ``redact_pii_batch``, or ``input_code_text`` itself when
            ``no_redaction`` is true.

        Raises:
            OSError: If the replacements file cannot be opened.
            json.JSONDecodeError: If ``replacements.json`` is not valid JSON.
        """
        logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.INFO)
        # The raw input may contain the very secrets this function is meant to
        # remove, so it is only emitted at DEBUG level rather than printed.
        logging.debug("input_code_text %s", input_code_text)

        # Apply PII detection
        ds_pii = scan_pii_batch([{"content": input_code_text}])
        # The detailed results are read below via their "secrets" key, so keep
        # them out of INFO-level logs as well.
        logging.debug("PII detection results:\n%s", ds_pii)
        logging.info(f"Number of samples that contained PII: {sum(ds_pii['has_secrets'])}")
        logging.info(f"Total number of secrets found: {sum(ds_pii['number_secrets'])}")

        if no_redaction:
            # Detection-only mode: nothing was rewritten, hand the input back.
            return input_code_text

        # Redact PII in the code
        logging.info(" ===== Applying PII redaction =====")
        random.seed(seed)

        # Resolve data files next to this module so the result does not depend
        # on the process's current working directory.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        if load_replacements:
            replacements_file_path = os.path.join(module_dir, "replacements.json")
            if not os.path.isfile(replacements_file_path):
                # Fall back to the historical working-directory-relative path.
                replacements_file_path = "privacy/util/code_detect/replacements.json"
            with open(replacements_file_path, "r", encoding="utf-8") as f:
                replacements = json.load(f)
        else:
            replacements = random_replacements()
            # Persist the generated values for reference in a separate file so
            # the curated replacements.json is never overwritten.
            replacements_file_path = os.path.join(module_dir, "random_replacements.json")
            logging.debug("replacements_file_path %s", replacements_file_path)
            with open(replacements_file_path, "w", encoding="utf-8") as f:
                json.dump(replacements, f)
        logging.info(f"Using the following replacements:\n{replacements}")

        ds_pii_redacted = redact_pii_batch(
            [
                {
                    "content": input_code_text,
                    "secrets": ds_pii["secrets"],
                    "has_secrets": ds_pii["has_secrets"],
                    "number_secrets": ds_pii["number_secrets"],
                }
            ],
            replacements=replacements,
            add_references=add_reference_text,
        )
        redacted_code = ds_pii_redacted["new_content"][0]  # Access the redacted code
        logging.debug("Redacted Code:\n%s", redacted_code)
        # Nothing is written to disk here; the caller receives the result.
        logging.info("Redaction completed successfully.")
        return redacted_code