Spaces:

Kiran5
/

Privacy

Build error

App Files Files Community

Privacy / src /privacy /util /code_detect /ner /pii_inference /netcustom.py

Kiran5

Track large files and images with Git LFS

54fa0c8 6 months ago

raw

history blame contribute delete

30.5 kB

	# from transformers import pipeline

	# classifier = pipeline("token-classification", model = "bigcode/starpii", aggregation_strategy="simple")
	# classifier("Hello I'm John and my IP address is 196.780.89.78")

	# from transformers import AutoModelForTokenClassification, AutoTokenizer
	# import torch

	# # Load the pre-trained model and tokenizer
	# model_name = "bigcode/starpii"
	# model = AutoModelForTokenClassification.from_pretrained(model_name)
	# tokenizer = AutoTokenizer.from_pretrained(model_name)

	# # Prepare input text
	# text = "from transformers import AutoModelForTokenClassification, AutoTokenizer import torch secretkey= cmVnrGtuOjAxOjE3MjEyODUwMjg6M0RrNjVMVGZEaGd6T0RiZ09FR3M5MEV5Tk0z ipadress= 10.83.73.87.84 email= user@example.com"
	# inputs = tokenizer(text, return_tensors="pt")

	# # Perform inference
	# with torch.no_grad():
	# outputs = model(**inputs)

	# # Get the predicted labels
	# predicted_labels = torch.argmax(outputs.logits, dim=2)
	# labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]

	# # Print the labels
	# print(labels)

	# from transformers import AutoModelForTokenClassification, AutoTokenizer
	# import torch

	# # Load the pre-trained model and tokenizer
	# model_name = "bigcode/starpii"
	# model = AutoModelForTokenClassification.from_pretrained(model_name)
	# tokenizer = AutoTokenizer.from_pretrained(model_name)

	# # Prepare input text
	# text = "from transformers import AutoModelForTokenClassification, AutoTokenizer import torch secretkey= cmVnrGtuOjAxOjE3MjEyODUwMjg6M0RrNjVMVGZEaGd6T0RiZ09FR3M5MEV5Tk0z ipadress= 10.83.73.87.84 email= user@example.com"
	# inputs = tokenizer(text, return_tensors="pt")

	# # Perform inference
	# with torch.no_grad():
	# outputs = model(**inputs)

	# # Get the predicted labels
	# predicted_labels = torch.argmax(outputs.logits, dim=2)
	# labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]

	# # Replace IP address with the label or "IP_ADDRESS"
	# output_text = text
	# current_ip = ""
	# for token, label in zip(inputs["input_ids"][0], labels):
	# token_text = tokenizer.decode(token).strip()
	# if label == "B-EMAIL":
	# current_ip += token_text
	# if label == "I-EMAIL":
	# current_ip += token_text
	# elif current_ip:
	# output_text = output_text.replace(current_ip, "EMAILID")
	# current_ip = ""

	# print("output text",output_text)


	## THE CODE BELOW WAS USED TO SAVE THE HUGGING FACE MODEL LOCALLY.
	## THE LOCALLY SAVED MODEL IS THEN LOADED BY THE CODE FURTHER DOWN
	## FOR TEXT AS WELL AS FILE DETECTION.
	# from transformers import AutoModelForTokenClassification, AutoTokenizer

	# # Load the pre-trained model and tokenizer
	# model_name = "bigcode/starpii"
	# model = AutoModelForTokenClassification.from_pretrained(model_name)
	# tokenizer = AutoTokenizer.from_pretrained(model_name)

	# # Specify the directory where you want to save the model
	# local_model_directory = "./nermodel"

	# # Save the model and tokenizer to the local directory
	# model.save_pretrained(local_model_directory)
	# tokenizer.save_pretrained(local_model_directory)

	# print(f"Model and tokenizer saved to {local_model_directory}")

	## ABOVE COMMENTED CODE IS FOR REMOVAL!!!

	# NER MODEL DETECTION FOR TEXT
	from transformers import AutoModelForTokenClassification, AutoTokenizer
	import torch
	import os
	import autopep8
	import re
	class code_detect_ner:
	    """Redact PII entities in text and source files with a token-classification model.

	    Both entry points are plain functions grouped on the class and are meant
	    to be called as ``code_detect_ner.textner(...)`` /
	    ``code_detect_ner.filener(...)``; no instance state is used.
	    """

	    # Directory holding the locally saved model and tokenizer used by ``textner``.
	    _LOCAL_MODEL_DIRECTORY = "privacy/util/code_detect/ner/pii_inference/nermodel"

	    # Placeholder written in place of each entity type. Types that are not
	    # listed here fall back to "<TYPE>" (see ``_placeholder_for``).
	    _ENTITY_PLACEHOLDERS = {
	        "USERNAME": "<USERNAME>",
	        "EMAIL": "<EMAIL>",
	        "IP_ADDRESS": "<IP_ADDRESS>",
	        "KEY": "<KEY>",
	        "NAME": "<NAME>",
	    }

	    # (model, tokenizer) pair for ``textner``; filled in on first use so the
	    # weights are read from disk once instead of on every call.
	    _local_model = None

	    @classmethod
	    def _load_local_model(cls):
	        """Return the (model, tokenizer) pair from the local directory, loading it once."""
	        if cls._local_model is None:
	            model = AutoModelForTokenClassification.from_pretrained(cls._LOCAL_MODEL_DIRECTORY)
	            tokenizer = AutoTokenizer.from_pretrained(cls._LOCAL_MODEL_DIRECTORY)
	            cls._local_model = (model, tokenizer)
	        return cls._local_model

	    @classmethod
	    def _placeholder_for(cls, entity_type):
	        """Return the placeholder for *entity_type* (a label with its B-/I- prefix removed)."""
	        return cls._ENTITY_PLACEHOLDERS.get(entity_type, f"<{entity_type}>")

	    @staticmethod
	    def textner(text):
	        """Return *text* with every detected entity replaced by a ``<TYPE>`` placeholder.

	        The model and tokenizer are loaded from the local model directory on
	        the first call and reused afterwards.

	        Args:
	            text: The text to scan.

	        Returns:
	            The redacted text. It is not stripped, and the tokenizer markers
	            "Ġ", "č" and "Ċ" are removed from it.
	        """
	        model, tokenizer = code_detect_ner._load_local_model()

	        inputs = tokenizer(text, return_tensors="pt")
	        with torch.no_grad():
	            outputs = model(**inputs)

	        # One predicted label per token of the single input sequence.
	        predicted_labels = torch.argmax(outputs.logits, dim=2)
	        labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]
	        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

	        pieces = []
	        current_entity = None
	        for token, label in zip(tokens, labels):
	            # A leading "Ġ" marks a token that was preceded by a space.
	            had_leading_space = token.startswith("Ġ")
	            if had_leading_space:
	                token = token[1:]

	            if label.startswith("I-") and current_entity is not None:
	                # Continuation of an entity whose placeholder is already written.
	                continue

	            if label.startswith(("B-", "I-")):
	                # Start of an entity. An "I-" label with no open entity is
	                # treated the same way, so the token is never emitted as-is.
	                current_entity = label[2:]
	                if had_leading_space:
	                    # Keep the placeholder separated from the previous word.
	                    pieces.append(" ")
	                pieces.append(code_detect_ner._placeholder_for(current_entity))
	            else:
	                current_entity = None
	                if had_leading_space and not token.startswith("Ġ"):
	                    pieces.append(" ")
	                pieces.append(token)

	        redacted_text = "".join(pieces)
	        # Drop the remaining tokenizer markers. "Ċ" is the line-break marker
	        # (``filener`` splits lines on it); "č" is presumably the
	        # carriage-return marker -- TODO confirm against the tokenizer.
	        # NOTE(review): removing "Ċ" joins adjacent lines without a separator.
	        for marker in ("Ġ", "č", "Ċ"):
	            redacted_text = redacted_text.replace(marker, "")
	        return redacted_text

	    @staticmethod
	    def filener(code_content, filename, model, tokenizer):
	        """Redact entities in a source file and write the result next to *filename*.

	        The content is processed in fixed-size chunks, re-assembled line by
	        line, formatted with autopep8 and written to
	        ``<name>_redacted<ext>``.

	        Args:
	            code_content: File content as UTF-8 encoded bytes, or as ``str``.
	            filename: Name of the input file; only used to derive the output
	                file name.
	            model: Token-classification model, called as ``model(**inputs)``.
	            tokenizer: Tokenizer matching *model*.

	        Returns:
	            A tuple ``(formatted_code_bytes, output_code_file)``: the
	            formatted, redacted code encoded as UTF-8 and the path it was
	            written to.

	        Raises:
	            UnicodeDecodeError: If *code_content* is bytes that are not valid
	                UTF-8.
	        """
	        # Number of characters sent to the model per call (adjust as needed).
	        chunk_size = 1000

	        # Decode before chunking: slicing the raw bytes could cut a
	        # multi-byte UTF-8 character in half and make the decode fail.
	        if isinstance(code_content, (bytes, bytearray)):
	            source_text = code_content.decode("utf-8")
	        else:
	            source_text = code_content

	        pieces = []
	        # Not reset between chunks, so "I-" tokens at the start of a chunk
	        # continue an entity opened at the end of the previous one.
	        current_entity = None

	        for start in range(0, len(source_text), chunk_size):
	            chunk_str = source_text[start:start + chunk_size]
	            inputs = tokenizer(chunk_str, return_tensors="pt")

	            with torch.no_grad():
	                outputs = model(**inputs)

	            predicted_labels = torch.argmax(outputs.logits, dim=2)
	            labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]
	            tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

	            for token, label in zip(tokens, labels):
	                # A leading "Ġ" marks a token that was preceded by a space.
	                had_leading_space = token.startswith("Ġ")
	                if had_leading_space:
	                    token = token[1:]

	                if label.startswith("I-") and current_entity is not None:
	                    # Continuation of an already redacted entity: emit
	                    # nothing, not even the separating space.
	                    continue

	                # NOTE(review): the space is dropped when the token itself
	                # starts with "<" (e.g. a "<" operator) -- confirm intent.
	                if had_leading_space and not token.startswith("<"):
	                    pieces.append(" ")

	                if label.startswith(("B-", "I-")):
	                    # Start of an entity; an "I-" label with no open entity
	                    # is redacted as well instead of being emitted as-is.
	                    current_entity = label[2:]
	                    pieces.append(code_detect_ner._placeholder_for(current_entity))
	                else:
	                    current_entity = None
	                    pieces.append(token)

	        redacted_text = "".join(pieces)

	        # Rebuild the text line by line; "Ċ" is the tokenizer's line-break marker.
	        formatted_lines = []
	        indentation = 0
	        for line in redacted_text.split("Ċ"):
	            line = line.strip()

	            if line.startswith("#"):
	                # Comment lines are indented by the tracked block depth.
	                formatted_lines.append(" " * indentation + line + "\n")
	            else:
	                # Every other line gets a single leading space; any remaining
	                # leading "Ġ"/"ĉ" markers become spaces further down.
	                formatted_lines.append(" " + line + "\n")

	            # Track block depth from braces ...
	            if "{" in line:
	                indentation += 1
	            elif "}" in line:
	                indentation = max(0, indentation - 1)

	            # ... and from a trailing colon, which opens a block.
	            if line.endswith(":"):
	                indentation += 1

	        formatted_redacted_text = "".join(formatted_lines)
	        # Turn the remaining tokenizer markers into spaces. "č" and "ĉ" are
	        # presumably the carriage-return and tab markers -- TODO confirm.
	        for marker in ("Ġ", "č", "ĉ"):
	            formatted_redacted_text = formatted_redacted_text.replace(marker, " ")

	        # Derive the output file name from the input file name.
	        root, extension = os.path.splitext(filename)
	        output_code_file = root + "_redacted" + extension

	        # NOTE(review): autopep8 is applied whatever the file extension is --
	        # confirm this is intended for non-Python sources.
	        formatted_code = autopep8.fix_code(
	            formatted_redacted_text.strip(),
	            options={
	                'aggressive': 1,
	                'max_line_length': 120,  # Adjust this based on your desired line length
	            },
	        )

	        with open(output_code_file, "w", encoding="utf-8") as file:
	            file.write(formatted_code)

	        return formatted_code.encode("utf-8"), output_code_file




	## FOR FILE WORKING
	# from transformers import AutoModelForTokenClassification, AutoTokenizer
	# import torch

	# # Load the model and tokenizer from the local directory
	# local_model_directory = "./nermodel"
	# model = AutoModelForTokenClassification.from_pretrained(local_model_directory)
	# tokenizer = AutoTokenizer.from_pretrained(local_model_directory,model_max_length=10000)

	# # Specify the input code file
	# input_code_file = "input_code.java"
	# #input_code_file = "input.py"
	# # Read the code from the file
	# with open(input_code_file, "r", encoding="utf-8") as file:
	# code = file.read()

	# # Prepare input text
	# inputs = tokenizer(code, return_tensors="pt")
	# # print("INPUT IDS",inputs["input_ids"].shape)
	# # print("MODEL CONFIG",model.config)
	# # print("TOKENIZER",tokenizer)
	# # Perform inference
	# with torch.no_grad():
	# outputs = model(**inputs)

	# # Get the predicted labels
	# predicted_labels = torch.argmax(outputs.logits, dim=2)
	# labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]

	# # Define a mapping of entity types to placeholders
	# entity_mapping = {
	# "NAME": "<NAME>",
	# "EMAIL": "<EMAIL>",
	# "IP_ADDRESS": "<IP_ADDRESS>",
	# }

	# # Initialize variables
	# redacted_text = ""
	# current_entity = None
	# last_token_was_special = False

	# # Redact entities in the original text
	# for token, label in zip(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]), labels):
	# if token.startswith("Ġ"):
	# last_token_was_special = True
	# token = token[1:] # Remove the leading "Ġ" character
	# else:
	# last_token_was_special = False

	# # Add space if the last token was a special token and the current token does not start with "<"
	# if last_token_was_special and not token.startswith("<"):
	# redacted_text += " "

	# if label.startswith("B-"):
	# current_entity = label[2:]
	# redacted_text += f"{entity_mapping.get(current_entity, current_entity)}"
	# elif label.startswith("I-") and current_entity is not None:
	# pass # Skip intermediate tokens of the entity
	# else:
	# current_entity = None
	# redacted_text += token


	# # Split the redacted text into lines and add indentation
	# redacted_lines = redacted_text.split("Ċ")
	# formatted_redacted_text = ""
	# indentation = 0

	# for line in redacted_lines:
	# if "{" in line:
	# formatted_redacted_text += " " * indentation + line + "\n"
	# indentation += 1
	# elif "}" in line:
	# indentation -= 1
	# formatted_redacted_text += " " * indentation + line + "\n"
	# else:
	# formatted_redacted_text += " " * indentation + line + "\n"

	# # Remove any remaining special characters
	# formatted_redacted_text = formatted_redacted_text.replace("Ġ", "")

	# # Write the redacted code back to the file using UTF-8 encoding
	# output_code_file = "redacted_code.java"
	# #output_code_file = "x.py"
	# with open(output_code_file, "w", encoding="utf-8") as file:
	# file.write(formatted_redacted_text.strip())

	# # Print the redacted text
	# print("Redacted Text:", formatted_redacted_text.strip())