## REGEX FOR FILE # import argparse # import json # import logging # import random # from pii_detection import scan_pii_batch # from pii_redaction import redact_pii_batch, random_replacements # def parse_args(): # parser = argparse.ArgumentParser(description="PII detection and redaction for a code file") # parser.add_argument( # "--input_code_file", # required=True, # type=str, # help="Path to the input code file for PII detection and redaction", # ) # parser.add_argument( # "--output_file", # required=True, # type=str, # help="Path to save the redacted code file", # ) # parser.add_argument( # "--batch_size", # default=8, # type=int, # help="Batch size for the PII detection/redaction", # ) # parser.add_argument( # "--seed", # default=0, # type=int, # help="Seed for random", # ) # parser.add_argument( # "--num_proc", # default=8, # type=int, # help="Number of processes to use for PII detection/redaction", # ) # parser.add_argument( # "--no_redaction", # action="store_true", # help="If set, do not perform redaction", # ) # parser.add_argument( # "--load_replacements", # default=True, # help="If set, load replacements from file replacements.json", # ) # parser.add_argument( # "--add_reference_text", # default=True, # type=bool, # help="If True, add reference text with PII between delimiters in the redacted text (used for visualization)", # ) # return parser.parse_args() # def main(): # logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.INFO) # args = parse_args() # # Read input code file # with open(args.input_code_file, "r") as input_file: # code_content = input_file.read() # # Apply PII detection # ds_pii = scan_pii_batch([{"content": code_content}]) # logging.info(f"PII detection results:\n{ds_pii}") # logging.info(f"Number of samples that contained PII: {sum(ds_pii['has_secrets'])}") # logging.info(f"Total number of secrets found: {sum(ds_pii['number_secrets'])}") # # Redact PII in the code # if not args.no_redaction: 
# (Continuation of the commented-out legacy file-based script; kept for reference.)
#         logging.info(f" ===== Applying PII redaction =====")
#         random.seed(args.seed)
#         # Use random replacements by default
#         if args.load_replacements:
#             with open("replacements.json", "r") as f:
#                 replacements = json.load(f)
#         else:
#             replacements = random_replacements()
#             with open("random_replacements.json", "w") as f:
#                 json.dump(replacements, f)
#         logging.info(f"Using the following replacements:\n{replacements}")
#         ds_pii_redacted = redact_pii_batch(
#             [{"content": code_content, "secrets": ds_pii['secrets'], "has_secrets": ds_pii['has_secrets'], "number_secrets": ds_pii['number_secrets']}],
#             replacements=replacements,
#             add_references=args.add_reference_text
#         )
#         redacted_code = ds_pii_redacted["new_content"][0]  # Access the redacted code
#         print("Redacted Code:")
#         print(redacted_code)
#         # Save the redacted code to the output file
#         with open(args.output_file, "w") as output_file:
#             output_file.write(redacted_code[0] if isinstance(redacted_code, list) else redacted_code)
#         logging.info("Redacted code saved successfully.")
# if __name__ == "__main__":
#     main()

# Regex-based detection for text input
import json
import logging
import random
import os

from privacy.util.code_detect.pii_detection import scan_pii_batch
from privacy.util.code_detect.pii_redaction import redact_pii_batch, random_replacements


class code_detect:
    """Namespace for regex-based PII detection and redaction of in-memory code text."""

    @staticmethod
    def codeDetectRegex(
        input_code_text,
        *,
        seed=0,
        no_redaction=False,
        load_replacements=True,
        add_reference_text=True,
    ):
        """Detect PII in ``input_code_text`` and return the redacted text.

        The text is wrapped as ``[{"content": input_code_text}]`` and passed to
        ``scan_pii_batch``; the detection result is then forwarded to
        ``redact_pii_batch`` together with a replacements mapping.

        Args:
            input_code_text: The text to scan (presumably a ``str`` of source
                code; verify against callers).
            seed: Value passed to ``random.seed`` before redaction.
            no_redaction: When true, only detection (and its logging) runs and
                the input is returned unchanged.
            load_replacements: When true, replacements are read from
                ``privacy/util/code_detect/replacements.json`` (relative to the
                current working directory). Otherwise they are generated with
                ``random_replacements()`` and written to disk.
            add_reference_text: Forwarded to ``redact_pii_batch`` as
                ``add_references``.

        Returns:
            The first element of the ``"new_content"`` entry produced by
            ``redact_pii_batch``, or ``input_code_text`` itself when
            ``no_redaction`` is true.

        Raises:
            OSError: If the replacements file cannot be opened.
            json.JSONDecodeError: If the replacements file is not valid JSON.
        """
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
            level=logging.INFO,
        )
        # NOTE(review): this echoes the raw, unredacted input to stdout; confirm
        # that is acceptable wherever this runs.
        print("input_code_text", input_code_text)

        # Apply PII detection
        ds_pii = scan_pii_batch([{"content": input_code_text}])
        logging.info(f"PII detection results:\n{ds_pii}")
        logging.info(f"Number of samples that contained PII: {sum(ds_pii['has_secrets'])}")
        logging.info(f"Total number of secrets found: {sum(ds_pii['number_secrets'])}")

        # Without redaction there is no rewritten text, so hand back the input
        # instead of falling through to an unbound result variable.
        if no_redaction:
            return input_code_text

        # Redact PII in the code
        logging.info(" ===== Applying PII redaction =====")
        random.seed(seed)

        # Use the stored replacements by default
        if load_replacements:
            with open(
                "privacy/util/code_detect/replacements.json", "r", encoding="utf-8"
            ) as f:
                replacements = json.load(f)
        else:
            # NOTE(review): this path is built from this module's directory plus
            # "privacy/util/code_detect", whereas the load branch above is
            # relative to the working directory; the two may not refer to the
            # same file. Confirm the intended location.
            current_dir = os.path.dirname(os.path.abspath(__file__))
            replacements_file_path = os.path.join(
                current_dir, "privacy", "util", "code_detect", "replacements.json"
            )
            print("replacements_file_path", replacements_file_path)
            replacements = random_replacements()
            with open(replacements_file_path, "w", encoding="utf-8") as f:
                json.dump(replacements, f)
        logging.info(f"Using the following replacements:\n{replacements}")

        ds_pii_redacted = redact_pii_batch(
            [
                {
                    "content": input_code_text,
                    "secrets": ds_pii['secrets'],
                    "has_secrets": ds_pii['has_secrets'],
                    "number_secrets": ds_pii['number_secrets'],
                }
            ],
            replacements=replacements,
            add_references=add_reference_text,
        )
        redacted_code = ds_pii_redacted["new_content"][0]  # Access the redacted code
        print("Redacted Code:")
        print(redacted_code)

        # The result is returned to the caller rather than written to a file,
        # so the log message reports completion, not a save.
        logging.info("PII redaction completed successfully.")
        return redacted_code