Kiran5's picture
Track large files and images with Git LFS
54fa0c8
# from transformers import pipeline
# classifier = pipeline("token-classification", model = "bigcode/starpii", aggregation_strategy="simple")
# classifier("Hello I'm John and my IP address is 196.780.89.78")
# from transformers import AutoModelForTokenClassification, AutoTokenizer
# import torch
# # Load the pre-trained model and tokenizer
# model_name = "bigcode/starpii"
# model = AutoModelForTokenClassification.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# # Prepare input text
# text = "from transformers import AutoModelForTokenClassification, AutoTokenizer import torch secretkey= cmVnrGtuOjAxOjE3MjEyODUwMjg6M0RrNjVMVGZEaGd6T0RiZ09FR3M5MEV5Tk0z ipadress= 10.83.73.87.84 email= [email protected]"
# inputs = tokenizer(text, return_tensors="pt")
# # Perform inference
# with torch.no_grad():
# outputs = model(**inputs)
# # Get the predicted labels
# predicted_labels = torch.argmax(outputs.logits, dim=2)
# labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]
# # Print the labels
# print(labels)
# from transformers import AutoModelForTokenClassification, AutoTokenizer
# import torch
# # Load the pre-trained model and tokenizer
# model_name = "bigcode/starpii"
# model = AutoModelForTokenClassification.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# # Prepare input text
# text = "from transformers import AutoModelForTokenClassification, AutoTokenizer import torch secretkey= cmVnrGtuOjAxOjE3MjEyODUwMjg6M0RrNjVMVGZEaGd6T0RiZ09FR3M5MEV5Tk0z ipadress= 10.83.73.87.84 email= [email protected]"
# inputs = tokenizer(text, return_tensors="pt")
# # Perform inference
# with torch.no_grad():
# outputs = model(**inputs)
# # Get the predicted labels
# predicted_labels = torch.argmax(outputs.logits, dim=2)
# labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]
# # Replace IP address with the label or "IP_ADDRESS"
# output_text = text
# current_ip = ""
# for token, label in zip(inputs["input_ids"][0], labels):
# token_text = tokenizer.decode(token).strip()
# if label == "B-EMAIL":
# current_ip += token_text
# if label == "I-EMAIL":
# current_ip += token_text
# elif current_ip:
# output_text = output_text.replace(current_ip, "EMAILID")
# current_ip = ""
# print("output text",output_text)
## THE COMMENTED-OUT CODE BELOW WAS USED TO DOWNLOAD THE HUGGING FACE MODEL AND SAVE IT LOCALLY.
## THE LOCALLY SAVED MODEL IS THE ONE LOADED BY THE DETECTION CODE FURTHER DOWN,
## FOR BOTH TEXT AND FILE DETECTION.
# from transformers import AutoModelForTokenClassification, AutoTokenizer
# # Load the pre-trained model and tokenizer
# model_name = "bigcode/starpii"
# model = AutoModelForTokenClassification.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# # Specify the directory where you want to save the model
# local_model_directory = "./nermodel"
# # Save the model and tokenizer to the local directory
# model.save_pretrained(local_model_directory)
# tokenizer.save_pretrained(local_model_directory)
# print(f"Model and tokenizer saved to {local_model_directory}")
## ABOVE COMMENTED CODE IS FOR REMOVAL!!!
# NER MODEL DETECTION FOR TEXT
import logging
import os
import re

import autopep8
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
class code_detect_ner:
    """Redact PII entities found by a token-classification model.

    Two entry points are exposed and are meant to be called on the class
    itself (``code_detect_ner.textner(...)`` / ``code_detect_ner.filener(...)``):

    * ``textner`` redacts a plain string using the locally saved model.
    * ``filener`` redacts the content of a source file using a model and
      tokenizer supplied by the caller, writes a ``*_redacted`` copy and
      returns it.

    Token strings returned by the tokenizer carry marker characters instead
    of whitespace: ``Ġ`` prefixes a token that was preceded by a space and
    ``Ċ`` separates lines.  ``ĉ`` and ``č`` are assumed to be tab and
    carriage return, as in GPT-2 style byte-level BPE vocabularies
    -- TODO confirm against the tokenizer actually shipped in ``_MODEL_DIR``.
    """

    # Directory holding the locally saved model and tokenizer used by textner.
    _MODEL_DIR = "privacy/util/code_detect/ner/pii_inference/nermodel"

    # Maximum number of characters sent to the model per call in filener.
    _CHUNK_SIZE = 1000

    # Placeholder written in place of each entity type.  Keys are the label
    # names with the "B-"/"I-" prefix removed; labels missing from this table
    # fall back to "<LABEL>" (see _placeholder).
    _PLACEHOLDERS = {
        "USERNAME": "<USERNAME>",
        "EMAIL": "<EMAIL>",
        "IP_ADDRESS": "<IP_ADDRESS>",
        "KEY": "<KEY>",
        "NAME": "<NAME>",
    }

    _SPACE_MARK = "Ġ"
    _LINE_MARK = "Ċ"

    # textner: turn the remaining markers back into the whitespace they are
    # assumed to stand for (see the class docstring for the caveat).
    _TEXT_MARKERS = str.maketrans({"Ġ": " ", "Ċ": "\n", "ĉ": "\t", "č": "\r"})
    # filener: every remaining marker becomes a single space; line breaks are
    # handled separately by _reindent_lines.
    _FILE_MARKERS = str.maketrans({"Ġ": " ", "č": " ", "ĉ": " "})

    _log = logging.getLogger(__name__)

    # Lazily populated by _load_local_model so the model is read from disk
    # once per process instead of once per textner call.
    _model = None
    _tokenizer = None

    @staticmethod
    def _load_local_model():
        """Return the (model, tokenizer) pair stored in ``_MODEL_DIR``, loading it once."""
        cls = code_detect_ner
        if cls._model is None or cls._tokenizer is None:
            cls._model = AutoModelForTokenClassification.from_pretrained(cls._MODEL_DIR)
            cls._tokenizer = AutoTokenizer.from_pretrained(cls._MODEL_DIR)
        return cls._model, cls._tokenizer

    @staticmethod
    def _predict_labels(text, model, tokenizer):
        """Tokenize ``text`` and return parallel lists ``(tokens, labels)``.

        ``labels`` holds, for every token, the ``model.config.id2label`` name
        of the highest-scoring class.
        """
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        label_ids = torch.argmax(outputs.logits, dim=2)[0].tolist()
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        labels = [model.config.id2label[label_id] for label_id in label_ids]
        return tokens, labels

    @staticmethod
    def _placeholder(entity):
        """Return the placeholder for ``entity``; unknown types become ``<ENTITY>``."""
        return code_detect_ner._PLACEHOLDERS.get(entity, f"<{entity}>")

    @staticmethod
    def _redact_tokens(tokens, labels, current_entity=None):
        """Join ``tokens`` into text, replacing each labelled entity by one placeholder.

        A ``B-XXX`` label starts an entity and emits its placeholder; following
        ``I-`` tokens are dropped while an entity is open; any other token
        closes the entity and is copied through.  A leading ``Ġ`` on a copied
        token, or on the token that starts an entity, becomes a real space.
        Tokens dropped as part of an entity contribute nothing, not even
        their leading space.

        ``current_entity`` lets a caller carry an open entity over from a
        previous chunk.  Returns ``(text, current_entity)``.  Markers other
        than a token's first ``Ġ`` are left in ``text`` for the caller to
        translate.
        """
        cls = code_detect_ner
        pieces = []
        for token, label in zip(tokens, labels):
            if label.startswith("B-"):
                current_entity = label[2:]
                # Keep the separator that preceded the redacted value.
                if token.startswith(cls._SPACE_MARK):
                    pieces.append(" ")
                pieces.append(cls._placeholder(current_entity))
            elif label.startswith("I-") and current_entity is not None:
                continue  # continuation of the entity already replaced
            else:
                current_entity = None
                if token.startswith(cls._SPACE_MARK):
                    pieces.append(" " + token[1:])
                else:
                    pieces.append(token)
        return "".join(pieces), current_entity

    @staticmethod
    def _split_into_chunks(text, chunk_size):
        """Split ``text`` into pieces of at most ``chunk_size`` characters.

        Whole lines are packed together so a chunk normally ends at a line
        break; only a single line longer than ``chunk_size`` is cut inside.
        Concatenating the result gives back ``text`` exactly.
        """
        chunks = []
        current = ""
        for line in text.splitlines(keepends=True):
            if current and len(current) + len(line) > chunk_size:
                chunks.append(current)
                current = ""
            while len(line) > chunk_size:
                chunks.append(line[:chunk_size])
                line = line[chunk_size:]
            current += line
        if current:
            chunks.append(current)
        return chunks

    @staticmethod
    def _reindent_lines(redacted_text):
        """Split ``redacted_text`` on ``Ċ`` and rebuild it one line per row.

        Every line is stripped and prefixed with a single space, except lines
        starting with ``#`` which are prefixed with one space per current
        nesting level.  The level goes up after a line containing ``{`` or
        ending with ``:`` and down (never below zero) after a line containing
        ``}``.

        NOTE(review): the level never decreases for colon-opened blocks, so
        comment lines in brace-less code drift to the right; this heuristic is
        kept as it was and the result is handed to autopep8 afterwards.
        """
        rebuilt = []
        indentation = 0
        for raw_line in redacted_text.split(code_detect_ner._LINE_MARK):
            line = raw_line.strip()
            prefix = " " * indentation if line.startswith("#") else " "
            rebuilt.append(prefix + line + "\n")
            if "{" in line:
                indentation += 1
            elif "}" in line:
                indentation = max(0, indentation - 1)
            if line.endswith(":"):
                indentation += 1
        return "".join(rebuilt)

    @staticmethod
    def textner(text):
        """Return ``text`` with every detected entity replaced by a placeholder.

        The model and tokenizer are loaded from ``_MODEL_DIR`` on first use
        and reused afterwards.  Whitespace markers in the tokens are turned
        back into whitespace, so line breaks and runs of spaces in the input
        survive in the output.  The raw input is never logged; only its
        length and the predicted label names are, at DEBUG level.

        Args:
            text: The string to redact.

        Returns:
            The redacted string (not stripped).  An empty or ``None`` input
            returns ``""`` without calling the model.
        """
        if not text:
            return ""
        cls = code_detect_ner
        model, tokenizer = cls._load_local_model()
        cls._log.debug("textner: received %d characters", len(text))
        tokens, labels = cls._predict_labels(text, model, tokenizer)
        cls._log.debug("textner: labels %s", labels)
        redacted_text, _ = cls._redact_tokens(tokens, labels)
        redacted_text = redacted_text.translate(cls._TEXT_MARKERS)
        cls._log.debug("Redacted Text: %s", redacted_text.strip())
        return redacted_text

    @staticmethod
    def filener(code_content, filename, model, tokenizer):
        """Redact entities in a source file and write a ``*_redacted`` copy.

        The content is decoded once, split into line-aligned chunks of at
        most ``_CHUNK_SIZE`` characters, and each chunk is labelled by
        ``model``.  The redacted token stream is rebuilt line by line,
        formatted with ``autopep8.fix_code`` and written next to ``filename``
        with ``_redacted`` inserted before the extension.

        Args:
            code_content: File content as UTF-8 ``bytes`` (a ``str`` is also
                accepted).
            filename: Path used only to derive the output file name.
            model: Token-classification model; called as ``model(**inputs)``
                and read for ``config.id2label``.
            tokenizer: Tokenizer matching ``model``.

        Returns:
            ``(formatted_code_bytes, output_code_file)`` where the first item
            is the UTF-8 encoded text that was written to the second.

        Raises:
            UnicodeDecodeError: If ``code_content`` is bytes that are not
                valid UTF-8.
        """
        cls = code_detect_ner
        # Decode before chunking: slicing the raw bytes could cut a multi-byte
        # character in half and make the per-chunk decode fail.
        if isinstance(code_content, (bytes, bytearray)):
            source_text = code_content.decode("utf-8")
        else:
            source_text = code_content

        redacted_parts = []
        current_entity = None  # carried across chunks, as an entity may span two
        for chunk in cls._split_into_chunks(source_text, cls._CHUNK_SIZE):
            tokens, labels = cls._predict_labels(chunk, model, tokenizer)
            part, current_entity = cls._redact_tokens(tokens, labels, current_entity)
            redacted_parts.append(part)
        redacted_text = "".join(redacted_parts)

        formatted_redacted_text = cls._reindent_lines(redacted_text)
        formatted_redacted_text = formatted_redacted_text.translate(cls._FILE_MARKERS)
        cls._log.debug("filener: rebuilt %d characters", len(formatted_redacted_text))

        # Generate the output file name based on the input file name
        root, extension = os.path.splitext(filename)
        output_code_file = root + "_redacted" + extension

        formatted_code = autopep8.fix_code(
            formatted_redacted_text.strip(),
            options={
                'aggressive': 1,
                'max_line_length': 120,  # Adjust this based on your desired line length
            }
        )
        with open(output_code_file, "w", encoding="utf-8") as file:
            file.write(formatted_code)

        # Return the redacted text and the output code file name
        return formatted_code.encode("utf-8"), output_code_file
## FOR FILE WORKING
# from transformers import AutoModelForTokenClassification, AutoTokenizer
# import torch
# # Load the model and tokenizer from the local directory
# local_model_directory = "./nermodel"
# model = AutoModelForTokenClassification.from_pretrained(local_model_directory)
# tokenizer = AutoTokenizer.from_pretrained(local_model_directory,model_max_length=10000)
# # Specify the input code file
# input_code_file = "input_code.java"
# #input_code_file = "input.py"
# # Read the code from the file
# with open(input_code_file, "r", encoding="utf-8") as file:
# code = file.read()
# # Prepare input text
# inputs = tokenizer(code, return_tensors="pt")
# # print("INPUT IDS",inputs["input_ids"].shape)
# # print("MODEL CONFIG",model.config)
# # print("TOKENIZER",tokenizer)
# # Perform inference
# with torch.no_grad():
# outputs = model(**inputs)
# # Get the predicted labels
# predicted_labels = torch.argmax(outputs.logits, dim=2)
# labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]
# # Define a mapping of entity types to placeholders
# entity_mapping = {
# "NAME": "<NAME>",
# "EMAIL": "<EMAIL>",
# "IP_ADDRESS": "<IP_ADDRESS>",
# }
# # Initialize variables
# redacted_text = ""
# current_entity = None
# last_token_was_special = False
# # Redact entities in the original text
# for token, label in zip(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]), labels):
# if token.startswith("Ġ"):
# last_token_was_special = True
# token = token[1:] # Remove the leading "Ġ" character
# else:
# last_token_was_special = False
# # Add space if the last token was a special token and the current token does not start with "<"
# if last_token_was_special and not token.startswith("<"):
# redacted_text += " "
# if label.startswith("B-"):
# current_entity = label[2:]
# redacted_text += f"{entity_mapping.get(current_entity, current_entity)}"
# elif label.startswith("I-") and current_entity is not None:
# pass # Skip intermediate tokens of the entity
# else:
# current_entity = None
# redacted_text += token
# # Split the redacted text into lines and add indentation
# redacted_lines = redacted_text.split("Ċ")
# formatted_redacted_text = ""
# indentation = 0
# for line in redacted_lines:
# if "{" in line:
# formatted_redacted_text += " " * indentation + line + "\n"
# indentation += 1
# elif "}" in line:
# indentation -= 1
# formatted_redacted_text += " " * indentation + line + "\n"
# else:
# formatted_redacted_text += " " * indentation + line + "\n"
# # Remove any remaining special characters
# formatted_redacted_text = formatted_redacted_text.replace("Ġ", "")
# # Write the redacted code back to the file using UTF-8 encoding
# output_code_file = "redacted_code.java"
# #output_code_file = "x.py"
# with open(output_code_file, "w", encoding="utf-8") as file:
# file.write(formatted_redacted_text.strip())
# # Print the redacted text
# print("Redacted Text:", formatted_redacted_text.strip())