# Spaces: Kiran5 / Privacy
# Build error
# File size: 30,526 Bytes
# 54fa0c8

# from transformers import pipeline

# classifier = pipeline("token-classification", model = "bigcode/starpii", aggregation_strategy="simple")
# classifier("Hello I'm John and my IP address is 196.780.89.78")

# from transformers import AutoModelForTokenClassification, AutoTokenizer
# import torch

# # Load the pre-trained model and tokenizer
# model_name = "bigcode/starpii"
# model = AutoModelForTokenClassification.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# # Prepare input text
# text = "from transformers import AutoModelForTokenClassification, AutoTokenizer import torch secretkey= cmVnrGtuOjAxOjE3MjEyODUwMjg6M0RrNjVMVGZEaGd6T0RiZ09FR3M5MEV5Tk0z ipadress= 10.83.73.87.84 email= [email protected]"
# inputs = tokenizer(text, return_tensors="pt")

# # Perform inference
# with torch.no_grad():
#     outputs = model(**inputs)

# # Get the predicted labels
# predicted_labels = torch.argmax(outputs.logits, dim=2)
# labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]

# # Print the labels
# print(labels)

# from transformers import AutoModelForTokenClassification, AutoTokenizer
# import torch

# # Load the pre-trained model and tokenizer
# model_name = "bigcode/starpii"
# model = AutoModelForTokenClassification.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# # Prepare input text
# text = "from transformers import AutoModelForTokenClassification, AutoTokenizer import torch secretkey= cmVnrGtuOjAxOjE3MjEyODUwMjg6M0RrNjVMVGZEaGd6T0RiZ09FR3M5MEV5Tk0z ipadress= 10.83.73.87.84 email= [email protected]"
# inputs = tokenizer(text, return_tensors="pt")

# # Perform inference
# with torch.no_grad():
#     outputs = model(**inputs)

# # Get the predicted labels
# predicted_labels = torch.argmax(outputs.logits, dim=2)
# labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]

# # Replace IP address with the label or "IP_ADDRESS"
# output_text = text
# current_ip = ""
# for token, label in zip(inputs["input_ids"][0], labels):
#     token_text = tokenizer.decode(token).strip()
#     if label == "B-EMAIL":
#         current_ip += token_text
#     if label == "I-EMAIL":
#         current_ip += token_text
#     elif current_ip:
#         output_text = output_text.replace(current_ip, "EMAILID")
#         current_ip = ""

# print("output text",output_text)


## SAVED THE MODEL LOCALLY USING THIS CODE
## USING THIS CODE THE HUGGINGFACE MODEL IS SAVED LOCALLY AND USED IN THE CODE BELOW
## FOR TEXT AS WELL AS FILE DETECTION
# from transformers import AutoModelForTokenClassification, AutoTokenizer

# # Load the pre-trained model and tokenizer
# model_name = "bigcode/starpii"
# model = AutoModelForTokenClassification.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# # Specify the directory where you want to save the model
# local_model_directory = "./nermodel"

# # Save the model and tokenizer to the local directory
# model.save_pretrained(local_model_directory)
# tokenizer.save_pretrained(local_model_directory)

# print(f"Model and tokenizer saved to {local_model_directory}")

## ABOVE COMMENTED CODE IS FOR REMOVAL!!!

# NER MODEL DETECTION FOR TEXT
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch
import os
import autopep8
import re
class code_detect_ner:
    # def textner(text):
    #     # Load the model and tokenizer from the local directory
    #     local_model_directory = "privacy/util/code_detect/ner/pii_inference/nermodel"
    #     model = AutoModelForTokenClassification.from_pretrained(local_model_directory)
    #     tokenizer = AutoTokenizer.from_pretrained(local_model_directory)

    #     # Prepare input text
    #     inputs = tokenizer(text, return_tensors="pt")

    #     # Perform inference
    #     with torch.no_grad():
    #         outputs = model(**inputs)

    #     # Get the predicted labels
    #     predicted_labels = torch.argmax(outputs.logits, dim=2)
    #     labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]

    #     # Define a mapping of entity types to placeholders
    #     entity_mapping = {
    #         "USERNAME": "<USERNAME>",
    #         "EMAIL": "<EMAIL>",
    #         "IP_ADDRESS": "<IP_ADDRESS>",
    #         "KEY": "<KEY>",
    #     }

    #     # Initialize variables
    #     redacted_text = ""
    #     current_entity = None
    #     last_token_was_special = False

    #     # Redact entities in the original text
    #     for token, label in zip(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]), labels):
    #         if token.startswith("Ġ"):
    #             last_token_was_special = True
    #             token = token[1:]  # Remove the leading "Ġ" character if present
    #         else:
    #             last_token_was_special = False

    #         if label.startswith("B-"):
    #             current_entity = label[2:]
    #             redacted_text += f" {entity_mapping.get(current_entity, current_entity)}"
    #         elif label.startswith("I-") and current_entity is not None:
    #             pass  # Skip intermediate tokens of the entity
    #         else:
    #             current_entity = None
    #             if last_token_was_special and not token.startswith("Ġ"):
    #                 redacted_text += " "
    #             redacted_text += token

    #     redacted_text = redacted_text.replace("Ġ", "")
    #     redacted_text = redacted_text.replace("č", "")
    #     redacted_text = redacted_text.replace("Ċ", "")
    #     # redacted_text = redacted_text.replace("Ċ", "")
        
        

    #     # Print the redacted text
    #     print("Redacted Text:", redacted_text.strip())
    #     return redacted_text.strip()


    def textner(text):
        """Redact PII entities found in ``text`` and return the redacted string.

        The function has no ``self`` parameter, so call it on the class:
        ``code_detect_ner.textner(text)``.

        The model and tokenizer are loaded from the local ``nermodel``
        directory on every call. Each token is labelled by taking the argmax
        of the model logits; a run of entity tokens is replaced by a single
        ``<ENTITY>`` placeholder (for example ``<EMAIL>``).

        Args:
            text: The text to scan.

        Returns:
            The redacted text (not stripped). The byte-level BPE markers
            ``Ġ`` (space), ``č`` (carriage return) and ``Ċ`` (newline) that
            survive reconstruction are removed, so line breaks of the input
            are not preserved.
        """
        # Load the model and tokenizer from the local directory
        local_model_directory = "privacy/util/code_detect/ner/pii_inference/nermodel"
        model = AutoModelForTokenClassification.from_pretrained(local_model_directory)
        tokenizer = AutoTokenizer.from_pretrained(local_model_directory)
        # NOTE: the raw input is intentionally NOT printed here; it is the
        # unredacted text and may contain the very PII this function removes.

        # Prepare input text
        inputs = tokenizer(text, return_tensors="pt")

        # Perform inference
        with torch.no_grad():
            outputs = model(**inputs)

        # Get the predicted labels (one label id per token of the single input)
        predicted_labels = torch.argmax(outputs.logits, dim=2)
        labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]
        print(predicted_labels,"predicted_labels")
        print("labels",labels)

        # Map bare entity types (the label without its "B-"/"I-" prefix) to
        # placeholders. Types missing from the table fall back to "<TYPE>".
        entity_mapping = {
            "USERNAME": "<USERNAME>",
            "EMAIL": "<EMAIL>",
            "IP_ADDRESS": "<IP_ADDRESS>",
            "KEY": "<KEY>",
        }

        pieces = []
        current_entity = None

        # Redact entities while rebuilding the text from the tokens
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        for token, label in zip(tokens, labels):
            starts_with_space = token.startswith("Ġ")
            if starts_with_space:
                token = token[1:]  # Remove the leading "Ġ" character
            # Same rule the plain-token path has always used: restore one
            # space unless the remainder is itself another "Ġ" marker.
            needs_space = starts_with_space and not token.startswith("Ġ")

            is_begin = label.startswith("B-")
            is_inside = label.startswith("I-")

            if is_begin or (is_inside and current_entity is None):
                # Start of an entity. An "I-" label with no open entity is
                # treated as a start as well, so a token the model flagged
                # as PII is never written out verbatim.
                current_entity = label[2:]
                if needs_space:
                    # Keep the separator that preceded the entity.
                    pieces.append(" ")
                pieces.append(entity_mapping.get(current_entity, f"<{current_entity}>"))
            elif is_inside:
                # Continuation of the open entity: already covered by the
                # placeholder emitted at its start.
                continue
            else:
                current_entity = None
                if needs_space:
                    pieces.append(" ")
                pieces.append(token)

        # Drop the remaining byte-level markers in one pass
        redacted_text = "".join(pieces).translate(str.maketrans("", "", "ĠčĊ"))

        # Print the redacted text
        print("Redacted Text:", redacted_text.strip())
        return redacted_text

    # def filener(input_code_file):
    #     ## NER DETECTION FROM FILE BUT FOR BIG CODE!!!!!!!!!!!!!!
    #     from transformers import AutoModelForTokenClassification, AutoTokenizer
    #     import torch

    #     # Load the model and tokenizer from the local directory
    #     local_model_directory = "privacy/util/code_detect/ner/pii_inference/nermodel"
    #     model = AutoModelForTokenClassification.from_pretrained(local_model_directory)
    #     tokenizer = AutoTokenizer.from_pretrained(local_model_directory, model_max_length=10000)

    #     # Specify the input code file
    #     #input_code_file = "input_code.java"
    #     # input_code_file = "input.py"

    #     # Read the code from the file
    #     with open(input_code_file, "r", encoding="utf-8") as file:
    #         code = file.read()
    #     #code = input_code_file.file.read()

    #     # Define a chunk size (adjust as needed)
    #     chunk_size = 1000

    #     # Initialize the redacted text
    #     redacted_text = ""
    #     current_entity = None
    #     last_token_was_special = False

    #     # Split the code into chunks
    #     code_chunks = [code[i:i + chunk_size] for i in range(0, len(code), chunk_size)]

    #     # Process each chunk
    #     for i, chunk in enumerate(code_chunks):
    #         # Prepare input text
    #         inputs = tokenizer(chunk, return_tensors="pt")

    #         # Perform inference
    #         with torch.no_grad():
    #             outputs = model(**inputs)

    #         # Get the predicted labels
    #         predicted_labels = torch.argmax(outputs.logits, dim=2)
    #         labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]

    #         # Define a mapping of entity types to placeholders
    #         entity_mapping = {
    #             "NAME": "<NAME>",
    #             "EMAIL": "<EMAIL>",
    #             "IP_ADDRESS": "<IP_ADDRESS>",
    #         }

    #         # Redact entities in the original text
    #         for token, label in zip(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]), labels):
    #             if token.startswith("Ġ"):
    #                 last_token_was_special = True
    #                 token = token[1:]  # Remove the leading "Ġ" character
    #             else:
    #                 last_token_was_special = False

    #             # Add space if the last token was a special token and the current token does not start with "<"
    #             if last_token_was_special and not token.startswith("<"):
    #                 redacted_text += " "

    #             if label.startswith("B-"):
    #                 current_entity = label[2:]
    #                 redacted_text += f"{entity_mapping.get(current_entity, current_entity)}"
    #             elif label.startswith("I-") and current_entity is not None:
    #                 pass  # Skip intermediate tokens of the entity
    #             else:
    #                 current_entity = None
    #                 redacted_text += token

    #     # Split the redacted text into lines and add indentation
    #     redacted_lines = redacted_text.split("Ċ")
    #     formatted_redacted_text = ""
    #     indentation = 0

    #     for line in redacted_lines:
    #         if "{" in line:
    #             formatted_redacted_text += "    " * indentation + line + "\n"
    #             indentation += 1
    #         elif "}" in line:
    #             indentation -= 1
    #             formatted_redacted_text += "    " * indentation + line + "\n"
    #         else:
    #             formatted_redacted_text += "    " * indentation + line + "\n"

    #     # Remove any remaining special characters
    #     formatted_redacted_text = formatted_redacted_text.replace("Ġ", "")

    #     # # Write the redacted code back to the file using UTF-8 encoding
    #     # output_code_file = "redacted_code.java"
    #     # with open(output_code_file, "a", encoding="utf-8") as file:
    #     #     file.write(formatted_redacted_text.strip())
    #     # Generate the output file name based on the input file name
    #     output_code_file = os.path.splitext(input_code_file)[0] + "_redacted" + os.path.splitext(input_code_file)[1]

    #     # Write the redacted code back to the file using UTF-8 encoding
    #     with open(output_code_file, "w", encoding="utf-8") as file:
    #         file.write(formatted_redacted_text.strip())
    #     # Delete the temporary input code file
    #     os.remove(input_code_file)
    #     # Print the final redacted text
    #     print("Redacted Text:", formatted_redacted_text.strip())
    #     return output_code_file





    # def filener(code_content, filename):
    #     # Load the model and tokenizer from the local directory
    #     local_model_directory = "privacy/util/code_detect/ner/pii_inference/nermodel"
    #     model = AutoModelForTokenClassification.from_pretrained(local_model_directory)
    #     tokenizer = AutoTokenizer.from_pretrained(local_model_directory, model_max_length=10000)

    #     # Define a chunk size (adjust as needed)
    #     chunk_size = 1000

    #     # Initialize the redacted text
    #     redacted_text = ""
    #     current_entity = None
    #     last_token_was_special = False

    #     # Split the code into chunks
    #     code_chunks = [code_content[i:i + chunk_size] for i in range(0, len(code_content), chunk_size)]

    #     # Process each chunk
    #     for i, chunk in enumerate(code_chunks):
    #         # Prepare input text
    #         chunk_str = chunk.decode("utf-8")
    #         inputs = tokenizer(chunk_str, return_tensors="pt")

    #         # Perform inference
    #         with torch.no_grad():
    #             outputs = model(**inputs)

    #         # Get the predicted labels
    #         predicted_labels = torch.argmax(outputs.logits, dim=2)
    #         labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]

    #         # Define a mapping of entity types to placeholders
    #         entity_mapping = {
    #             "NAME": "<NAME>",
    #             "EMAIL": "<EMAIL>",
    #             "IP_ADDRESS": "<IP_ADDRESS>",
    #         }

    #         # Redact entities in the original text
    #         for token, label in zip(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]), labels):
    #             if token.startswith("Ġ"):
    #                 last_token_was_special = True
    #                 token = token[1:]  # Remove the leading "Ġ" character
    #             else:
    #                 last_token_was_special = False

    #             # Add space if the last token was a special token and the current token does not start with "<"
    #             if last_token_was_special and not token.startswith("<"):
    #                 redacted_text += " "

    #             if label.startswith("B-"):
    #                 current_entity = label[2:]
    #                 redacted_text += f"{entity_mapping.get(current_entity, current_entity)}"
    #             elif label.startswith("I-") and current_entity is not None:
    #                 pass  # Skip intermediate tokens of the entity
    #             else:
    #                 current_entity = None
    #                 redacted_text += token

    #     # Split the redacted text into lines and add indentation
    #     redacted_lines = redacted_text.split("Ċ")
    #     formatted_redacted_text = ""
    #     indentation = 0
        
    #     for line in redacted_lines:
    #         if "{" in line:
    #             formatted_redacted_text += "    " * indentation + line + "\n"
    #             indentation += 1
    #         elif "}" in line:
    #             indentation -= 1
    #             formatted_redacted_text += "    " * indentation + line + "\n"
    #         else:
    #             formatted_redacted_text += "    " * indentation + line + "\n"

    #     # Remove any remaining special characters
    #     formatted_redacted_text = formatted_redacted_text.replace("Ġ", "")
    #     formatted_redacted_text = formatted_redacted_text.replace("č", "")
    #     print("formatted_redacted_text",formatted_redacted_text)
    #     # Generate the output file name based on the input file name
    #     output_code_file = os.path.splitext(filename)[0] + "_redacted" + os.path.splitext(filename)[1]

    #     # Write the redacted code back to the file using UTF-8 encoding
    #     with open(output_code_file, "w", encoding="utf-8") as file:
    #         file.write(formatted_redacted_text.strip())

    #     # Return the redacted text and the output code file name
    #     return formatted_redacted_text.strip().encode("utf-8"), output_code_file
    

    

    # def filener(code_content, filename):
    #     # Load the model and tokenizer from the local directory
    #     local_model_directory = "privacy/util/code_detect/ner/pii_inference/nermodel"
    #     model = AutoModelForTokenClassification.from_pretrained(local_model_directory)
    #     tokenizer = AutoTokenizer.from_pretrained(local_model_directory, model_max_length=10000)

    #     # Define a chunk size (adjust as needed)
    #     chunk_size = 1000

    #     # Initialize the redacted text
    #     redacted_text = ""
    #     current_entity = None
    #     last_token_was_special = False

    #     # Split the code into chunks
    #     code_chunks = [code_content[i:i + chunk_size] for i in range(0, len(code_content), chunk_size)]

    #     # Process each chunk
    #     for i, chunk in enumerate(code_chunks):
    #         # Prepare input text
    #         chunk_str = chunk.decode("utf-8")
    #         inputs = tokenizer(chunk_str, return_tensors="pt")

    #         # Perform inference
    #         with torch.no_grad():
    #             outputs = model(**inputs)

    #         # Get the predicted labels
    #         predicted_labels = torch.argmax(outputs.logits, dim=2)
    #         labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]

    #         # Define a mapping of entity types to placeholders
    #         entity_mapping = {
    #             "NAME": "<NAME>",
    #             "EMAIL": "<EMAIL>",
    #             "IP_ADDRESS": "<IP_ADDRESS>"
    #         }

    #         # Redact entities in the original text
    #         for token, label in zip(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]), labels):
    #             if token.startswith("Ġ"):
    #                 last_token_was_special = True
    #                 token = token[1:]  # Remove the leading "Ġ" character
    #             else:
    #                 last_token_was_special = False

    #             # Add space if the last token was a special token and the current token does not start with "<"
    #             if last_token_was_special and not token.startswith("<"):
    #                 redacted_text += " "

    #             if label.startswith("B-"):
    #                 current_entity = label[2:]
    #                 redacted_text += f"{entity_mapping.get(current_entity, current_entity)}"
    #             elif label.startswith("I-") and current_entity is not None:
    #                 pass  # Skip intermediate tokens of the entity
    #             else:
    #                 current_entity = None
    #                 redacted_text += token

    #     # Split the redacted text into lines and add indentation
    #     redacted_lines = redacted_text.split("Ċ")
    #     formatted_redacted_text = ""
    #     indentation = 0

    #     for line in redacted_lines:
    #         line = line.strip()

    #         if line.startswith(" "):
    #             formatted_line = "    " * indentation + line + "\n"
    #         elif line.startswith("#"):
    #             formatted_line = "    " * indentation + line + "\n"
    #         else:
    #             formatted_line = line + "\n"

    #         # Adjust indentation based on braces
    #         if "{" in line:
    #             indentation += 1
    #         elif "}" in line:
    #             indentation = max(0, indentation - 1)

    #         formatted_redacted_text += formatted_line

    #     # Remove any remaining special characters
    #     formatted_redacted_text = formatted_redacted_text.replace("Ġ", "")
    #     formatted_redacted_text = formatted_redacted_text.replace("č", "")

    #     # Generate the output file name based on the input file name
    #     output_code_file = os.path.splitext(filename)[0] + "_redacted" + os.path.splitext(filename)[1]

    #     # Write the formatted redacted code back to the file using UTF-8 encoding
    #     with open(output_code_file, "w", encoding="utf-8") as file:
    #         file.write(formatted_redacted_text.strip())

    #     # Use autopep8 to format the code in-place
    #     with open(output_code_file, "r", encoding="utf-8") as file:
    #         code_content = file.read()

    #     formatted_code = autopep8.fix_code(
    #         code_content,
    #         options={
    #             'aggressive': 1,
    #             'max_line_length': 120,  # Adjust this based on your desired line length
    #         }
    #     )

    #     # Write the formatted code back
    #     with open(output_code_file, "w", encoding="utf-8") as file:
    #         file.write(formatted_code)
            
    #     print("FORMCODE","\n",formatted_code)
    #     # Return the redacted text and the output code file name
    #     return formatted_code.encode("utf-8"), output_code_file
    
    def filener(code_content, filename, model, tokenizer):
        """Redact PII from source code and write the result next to ``filename``.

        The function has no ``self`` parameter, so call it on the class:
        ``code_detect_ner.filener(...)``.

        The content is processed in chunks of 1000 characters. Each token is
        labelled with the argmax of the model logits and every run of entity
        tokens is replaced by one ``<ENTITY>`` placeholder. The rebuilt text
        is split into lines on the ``Ċ`` marker, lightly re-indented, run
        through ``autopep8`` and written to ``<name>_redacted<ext>``.

        Args:
            code_content: File content as UTF-8 ``bytes`` (``str`` is also
                accepted).
            filename: Name of the input file; only used to derive the output
                file name.
            model: Token-classification model; must return ``.logits`` and
                expose ``config.id2label``.
            tokenizer: Tokenizer matching ``model``; must be callable and
                provide ``convert_ids_to_tokens``.

        Returns:
            A tuple ``(formatted_code_bytes, output_code_file)``: the final
            code encoded as UTF-8 and the path it was written to.

        Raises:
            UnicodeDecodeError: If ``code_content`` is not valid UTF-8.
        """
        # Define a chunk size (adjust as needed)
        chunk_size = 1000

        # Decode once, before chunking. Slicing the raw bytes first and
        # decoding each slice fails whenever a multi-byte character straddles
        # a chunk boundary.
        if isinstance(code_content, str):
            source_text = code_content
        else:
            source_text = code_content.decode("utf-8")

        # Split the code into chunks
        # NOTE(review): an entity that straddles a chunk boundary is seen by
        # the model in two halves and may be missed.
        code_chunks = [source_text[i:i + chunk_size] for i in range(0, len(source_text), chunk_size)]

        # Map entity types to placeholders. Types missing from the table fall
        # back to "<TYPE>" so the output is always bracketed.
        entity_mapping = {
            "USERNAME": "<USERNAME>",
            "EMAIL": "<EMAIL>",
            "IP_ADDRESS": "<IP_ADDRESS>",
            "KEY": "<KEY>",
            "NAME": "<NAME>"
        }

        pieces = []
        current_entity = None  # carried across chunks on purpose

        # Process each chunk
        for chunk_str in code_chunks:
            # Prepare input text
            inputs = tokenizer(chunk_str, return_tensors="pt")

            # Perform inference
            with torch.no_grad():
                outputs = model(**inputs)

            # Get the predicted labels
            predicted_labels = torch.argmax(outputs.logits, dim=2)
            labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]

            # Redact entities while rebuilding the text from the tokens
            tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
            for token, label in zip(tokens, labels):
                starts_with_space = token.startswith("Ġ")
                if starts_with_space:
                    token = token[1:]  # Remove the leading "Ġ" character
                # Restore the space unless the remaining token starts with "<"
                needs_space = starts_with_space and not token.startswith("<")

                is_begin = label.startswith("B-")
                is_inside = label.startswith("I-")

                if is_begin or (is_inside and current_entity is None):
                    # Start of an entity. An "I-" label with no open entity
                    # is treated as a start too, so flagged text never leaks.
                    current_entity = label[2:]
                    if needs_space:
                        pieces.append(" ")
                    pieces.append(entity_mapping.get(current_entity, f"<{current_entity}>"))
                elif is_inside:
                    # Continuation of the open entity: emit nothing, not even
                    # its leading space, to avoid runs of stray blanks after
                    # the placeholder.
                    continue
                else:
                    current_entity = None
                    if needs_space:
                        pieces.append(" ")
                    pieces.append(token)

        # Split the redacted text into lines ("Ċ" marks a newline) and add indentation
        formatted_lines = []
        indentation = 0

        for line in "".join(pieces).split("Ċ"):
            print("line--",line +"\n")
            line = line.strip()

            if line.startswith('ĉ'):
                # Line begins with a tab marker
                formatted_line = "        " + line + "\n"
            elif line.startswith("#"):
                # Comment lines follow the tracked block depth
                formatted_line = "    " * indentation + line + "\n"
            else:
                formatted_line = " " + line + "\n"
            print("--formatted line--",formatted_line)

            # Adjust indentation based on braces
            if "{" in line:
                indentation += 1
            elif "}" in line:
                indentation = max(0, indentation - 1)

            # Check if the line ends with a colon, indicating the start of a block
            if line.endswith(":"):
                indentation += 1

            formatted_lines.append(formatted_line)

        # Remove any remaining special characters
        formatted_redacted_text = "".join(formatted_lines).replace("Ġ", " ")
        print("to be removed chars--",formatted_redacted_text)

        formatted_redacted_text = formatted_redacted_text.replace("č", " ")
        formatted_redacted_text = formatted_redacted_text.replace("ĉ", " ")
        # No "Ċ" can remain at this point: the text was split on it above.

        # Generate the output file name based on the input file name
        root, extension = os.path.splitext(filename)
        output_code_file = root + "_redacted" + extension

        # Format with autopep8 in memory; the text is written to disk once,
        # instead of being written, read back and written again.
        formatted_code = autopep8.fix_code(
            formatted_redacted_text.strip(),
            options={
                'aggressive': 1,
                'max_line_length': 120,  # Adjust this based on your desired line length
            }
        )

        # Write the formatted redacted code to the file using UTF-8 encoding
        with open(output_code_file, "w", encoding="utf-8") as file:
            file.write(formatted_code)

        # Return the redacted text and the output code file name
        return formatted_code.encode("utf-8"), output_code_file




## FOR FILE WORKING
# from transformers import AutoModelForTokenClassification, AutoTokenizer
# import torch

# # Load the model and tokenizer from the local directory
# local_model_directory = "./nermodel"
# model = AutoModelForTokenClassification.from_pretrained(local_model_directory)
# tokenizer = AutoTokenizer.from_pretrained(local_model_directory,model_max_length=10000)

# # Specify the input code file
# input_code_file = "input_code.java"
# #input_code_file = "input.py"
# # Read the code from the file
# with open(input_code_file, "r", encoding="utf-8") as file:
#     code = file.read()

# # Prepare input text
# inputs = tokenizer(code, return_tensors="pt")
# # print("INPUT IDS",inputs["input_ids"].shape)
# # print("MODEL CONFIG",model.config)
# # print("TOKENIZER",tokenizer)
# # Perform inference
# with torch.no_grad():
#     outputs = model(**inputs)

# # Get the predicted labels
# predicted_labels = torch.argmax(outputs.logits, dim=2)
# labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]

# # Define a mapping of entity types to placeholders
# entity_mapping = {
#     "NAME": "<NAME>",
#     "EMAIL": "<EMAIL>",
#     "IP_ADDRESS": "<IP_ADDRESS>",
# }

# # Initialize variables
# redacted_text = ""
# current_entity = None
# last_token_was_special = False

# # Redact entities in the original text
# for token, label in zip(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]), labels):
#     if token.startswith("Ġ"):
#         last_token_was_special = True
#         token = token[1:]  # Remove the leading "Ġ" character
#     else:
#         last_token_was_special = False

#     # Add space if the last token was a special token and the current token does not start with "<"
#     if last_token_was_special and not token.startswith("<"):
#         redacted_text += " "

#     if label.startswith("B-"):
#         current_entity = label[2:]
#         redacted_text += f"{entity_mapping.get(current_entity, current_entity)}"
#     elif label.startswith("I-") and current_entity is not None:
#         pass  # Skip intermediate tokens of the entity
#     else:
#         current_entity = None
#         redacted_text += token


# # Split the redacted text into lines and add indentation
# redacted_lines = redacted_text.split("Ċ")
# formatted_redacted_text = ""
# indentation = 0

# for line in redacted_lines:
#     if "{" in line:
#         formatted_redacted_text += "    " * indentation + line + "\n"
#         indentation += 1
#     elif "}" in line:
#         indentation -= 1
#         formatted_redacted_text += "    " * indentation + line + "\n"
#     else:
#         formatted_redacted_text += "    " * indentation + line + "\n"

# # Remove any remaining special characters
# formatted_redacted_text = formatted_redacted_text.replace("Ġ", "")

# # Write the redacted code back to the file using UTF-8 encoding
# output_code_file = "redacted_code.java"
# #output_code_file = "x.py"
# with open(output_code_file, "w", encoding="utf-8") as file:
#     file.write(formatted_redacted_text.strip())

# # Print the redacted text
# print("Redacted Text:", formatted_redacted_text.strip())