# from transformers import pipeline | |
# classifier = pipeline("token-classification", model = "bigcode/starpii", aggregation_strategy="simple") | |
# classifier("Hello I'm John and my IP address is 196.780.89.78") | |
# from transformers import AutoModelForTokenClassification, AutoTokenizer | |
# import torch | |
# # Load the pre-trained model and tokenizer | |
# model_name = "bigcode/starpii" | |
# model = AutoModelForTokenClassification.from_pretrained(model_name) | |
# tokenizer = AutoTokenizer.from_pretrained(model_name) | |
# # Prepare input text | |
# text = "from transformers import AutoModelForTokenClassification, AutoTokenizer import torch secretkey= cmVnrGtuOjAxOjE3MjEyODUwMjg6M0RrNjVMVGZEaGd6T0RiZ09FR3M5MEV5Tk0z ipadress= 10.83.73.87.84 email= [email protected]" | |
# inputs = tokenizer(text, return_tensors="pt") | |
# # Perform inference | |
# with torch.no_grad(): | |
# outputs = model(**inputs) | |
# # Get the predicted labels | |
# predicted_labels = torch.argmax(outputs.logits, dim=2) | |
# labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()] | |
# # Print the labels | |
# print(labels) | |
# from transformers import AutoModelForTokenClassification, AutoTokenizer | |
# import torch | |
# # Load the pre-trained model and tokenizer | |
# model_name = "bigcode/starpii" | |
# model = AutoModelForTokenClassification.from_pretrained(model_name) | |
# tokenizer = AutoTokenizer.from_pretrained(model_name) | |
# # Prepare input text | |
# text = "from transformers import AutoModelForTokenClassification, AutoTokenizer import torch secretkey= cmVnrGtuOjAxOjE3MjEyODUwMjg6M0RrNjVMVGZEaGd6T0RiZ09FR3M5MEV5Tk0z ipadress= 10.83.73.87.84 email= [email protected]" | |
# inputs = tokenizer(text, return_tensors="pt") | |
# # Perform inference | |
# with torch.no_grad(): | |
# outputs = model(**inputs) | |
# # Get the predicted labels | |
# predicted_labels = torch.argmax(outputs.logits, dim=2) | |
# labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()] | |
# # Replace IP address with the label or "IP_ADDRESS" | |
# output_text = text | |
# current_ip = "" | |
# for token, label in zip(inputs["input_ids"][0], labels): | |
# token_text = tokenizer.decode(token).strip() | |
# if label == "B-EMAIL": | |
# current_ip += token_text | |
# if label == "I-EMAIL": | |
# current_ip += token_text | |
# elif current_ip: | |
# output_text = output_text.replace(current_ip, "EMAILID") | |
# current_ip = "" | |
# print("output text",output_text) | |
## SAVED THE MODEL LOCALLY USING THIS CODE | |
## USING THIS CODE, THE HUGGINGFACE MODEL IS SAVED LOCALLY AND USED IN THE CODE BELOW
## FOR TEXT AS WELL AS FILE DETECTION | |
# from transformers import AutoModelForTokenClassification, AutoTokenizer | |
# # Load the pre-trained model and tokenizer | |
# model_name = "bigcode/starpii" | |
# model = AutoModelForTokenClassification.from_pretrained(model_name) | |
# tokenizer = AutoTokenizer.from_pretrained(model_name) | |
# # Specify the directory where you want to save the model | |
# local_model_directory = "./nermodel" | |
# # Save the model and tokenizer to the local directory | |
# model.save_pretrained(local_model_directory) | |
# tokenizer.save_pretrained(local_model_directory) | |
# print(f"Model and tokenizer saved to {local_model_directory}") | |
## ABOVE COMMENTED CODE IS FOR REMOVAL!!! | |
# NER MODEL DETECTION FOR TEXT | |
import os
import re

import autopep8
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
class code_detect_ner:
    """Redact PII from text and source code with a token-classification model.

    The helpers are called on the class itself, e.g.
    ``code_detect_ner.textner(text)``; no instance state is used.

    The tokens handled here are expected to carry the markers "Ġ", "Ċ",
    "č" and "ĉ" — presumably the byte-level BPE stand-ins for space,
    newline, carriage return and tab; confirm against the tokenizer in use.
    """

    # Directory the model and tokenizer are loaded from when ``textner`` is
    # called without an explicit model/tokenizer.
    _LOCAL_MODEL_DIRECTORY = "privacy/util/code_detect/ner/pii_inference/nermodel"

    # Number of characters of source handed to the model per inference call.
    _CHUNK_SIZE = 1000

    @staticmethod
    def _predict_labels(text, model, tokenizer):
        """Tokenize *text*, run *model* on it and label every token.

        Returns:
            A ``(tokens, labels, predicted_labels)`` tuple: the token strings
            of the first (only) sequence, the label name predicted for each
            token (looked up in ``model.config.id2label``), and the raw
            argmax tensor the labels were derived from.
        """
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        # Highest-scoring label id per token along the last (label) axis.
        predicted_labels = torch.argmax(outputs.logits, dim=2)
        labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()]
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        return tokens, labels, predicted_labels

    @staticmethod
    def textner(text, model=None, tokenizer=None):
        """Return *text* with every detected entity replaced by ``<TYPE>``.

        Args:
            text: The text to redact.
            model: Optional preloaded token-classification model. When
                omitted it is loaded from the local model directory, which
                is slow; pass a cached instance for repeated calls.
            tokenizer: Optional preloaded tokenizer matching *model*. When
                omitted it is loaded from the local model directory.

        Returns:
            The redacted text (not stripped). Newline and carriage-return
            markers are removed, so the result is a single line.
        """
        if model is None:
            model = AutoModelForTokenClassification.from_pretrained(
                code_detect_ner._LOCAL_MODEL_DIRECTORY
            )
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(code_detect_ner._LOCAL_MODEL_DIRECTORY)

        # The raw input is deliberately NOT printed: it still contains the
        # PII this function exists to remove.
        tokens, labels, predicted_labels = code_detect_ner._predict_labels(text, model, tokenizer)
        print(predicted_labels, "predicted_labels")
        print("labels", labels)

        pieces = []
        current_entity = None
        for token, label in zip(tokens, labels):
            had_space_marker = token.startswith("Ġ")
            if had_space_marker:
                token = token[1:]
            # A token made of several space markers still starts with "Ġ"
            # after stripping one; such runs never contribute a space and
            # are dropped by the marker clean-up below.
            needs_space = had_space_marker and not token.startswith("Ġ")

            if label.startswith("B-"):
                # Start of an entity: emit one placeholder for the whole
                # span, keeping the space that preceded it in the input.
                current_entity = label[2:]
                if needs_space:
                    pieces.append(" ")
                pieces.append(f"<{current_entity}>")
            elif label.startswith("I-") and current_entity is not None:
                # Continuation of the entity already replaced above.
                continue
            else:
                current_entity = None
                if needs_space:
                    pieces.append(" ")
                pieces.append(token)

        redacted_text = "".join(pieces)
        for marker in ("Ġ", "č", "Ċ"):
            redacted_text = redacted_text.replace(marker, "")
        print("Redacted Text:", redacted_text.strip())
        return redacted_text

    @staticmethod
    def filener(code_content, filename, model, tokenizer):
        """Redact PII from source code and write the result next to *filename*.

        Args:
            code_content: The source to redact, as UTF-8 ``bytes`` (or an
                already decoded ``str``).
            filename: Name of the input file; the output is written to
                ``<stem>_redacted<ext>``.
            model: Preloaded token-classification model.
            tokenizer: Preloaded tokenizer matching *model*.

        Returns:
            A ``(redacted_bytes, output_path)`` tuple: the formatted,
            redacted code encoded as UTF-8 and the path it was written to.

        Raises:
            UnicodeDecodeError: If *code_content* is not valid UTF-8.
        """
        # Decode once, then chunk by characters. Decoding fixed-size byte
        # slices fails whenever a multi-byte character straddles a boundary.
        if isinstance(code_content, (bytes, bytearray)):
            code_text = code_content.decode("utf-8")
        else:
            code_text = code_content

        chunk_size = code_detect_ner._CHUNK_SIZE
        code_chunks = [code_text[i:i + chunk_size] for i in range(0, len(code_text), chunk_size)]

        # NOTE(review): an entity that straddles a chunk boundary is seen by
        # the model in two halves and may be missed or only partly redacted.
        pieces = []
        current_entity = None
        for chunk in code_chunks:
            tokens, labels, _ = code_detect_ner._predict_labels(chunk, model, tokenizer)
            for token, label in zip(tokens, labels):
                had_space_marker = token.startswith("Ġ")
                if had_space_marker:
                    token = token[1:]
                # Re-insert the space the marker stood for, except in front
                # of tokens that start with "<".
                if had_space_marker and not token.startswith("<"):
                    pieces.append(" ")

                if label.startswith("B-"):
                    # Every entity type becomes "<TYPE>", as in textner.
                    current_entity = label[2:]
                    pieces.append(f"<{current_entity}>")
                elif label.startswith("I-") and current_entity is not None:
                    # Continuation of the entity already replaced above.
                    continue
                else:
                    current_entity = None
                    pieces.append(token)
        redacted_text = "".join(pieces)

        # Rebuild physical lines from the newline marker.
        formatted_redacted_text = ""
        indentation = 0
        for line in redacted_text.split("Ċ"):
            print("line--", line + "\n")
            line = line.strip()
            if line.startswith("#"):
                # Only comment lines follow the tracked indentation level.
                formatted_line = " " * indentation + line + "\n"
            else:
                formatted_line = " " + line + "\n"
            print("--formatted line--", formatted_line)

            # Track nesting from braces and from block-opening colons.
            if "{" in line:
                indentation += 1
            elif "}" in line:
                indentation = max(0, indentation - 1)
            if line.endswith(":"):
                indentation += 1
            formatted_redacted_text += formatted_line

        # Turn the remaining whitespace markers into plain spaces.
        formatted_redacted_text = formatted_redacted_text.replace("Ġ", " ")
        print("to be removed chars--", formatted_redacted_text)
        formatted_redacted_text = formatted_redacted_text.replace("č", " ")
        formatted_redacted_text = formatted_redacted_text.replace("ĉ", " ")

        stem, extension = os.path.splitext(filename)
        output_code_file = stem + "_redacted" + extension

        # NOTE(review): autopep8 is a Python formatter but runs here
        # regardless of the file extension — confirm this is intended for
        # non-Python sources.
        formatted_code = autopep8.fix_code(
            formatted_redacted_text.strip(),
            options={
                'aggressive': 1,
                'max_line_length': 120,  # Adjust to the desired line length
            },
        )

        with open(output_code_file, "w", encoding="utf-8") as file:
            file.write(formatted_code)

        return formatted_code.encode("utf-8"), output_code_file
## FOR FILE WORKING | |
# from transformers import AutoModelForTokenClassification, AutoTokenizer | |
# import torch | |
# # Load the model and tokenizer from the local directory | |
# local_model_directory = "./nermodel" | |
# model = AutoModelForTokenClassification.from_pretrained(local_model_directory) | |
# tokenizer = AutoTokenizer.from_pretrained(local_model_directory,model_max_length=10000) | |
# # Specify the input code file | |
# input_code_file = "input_code.java" | |
# #input_code_file = "input.py" | |
# # Read the code from the file | |
# with open(input_code_file, "r", encoding="utf-8") as file: | |
# code = file.read() | |
# # Prepare input text | |
# inputs = tokenizer(code, return_tensors="pt") | |
# # print("INPUT IDS",inputs["input_ids"].shape) | |
# # print("MODEL CONFIG",model.config) | |
# # print("TOKENIZER",tokenizer) | |
# # Perform inference | |
# with torch.no_grad(): | |
# outputs = model(**inputs) | |
# # Get the predicted labels | |
# predicted_labels = torch.argmax(outputs.logits, dim=2) | |
# labels = [model.config.id2label[label_id] for label_id in predicted_labels[0].tolist()] | |
# # Define a mapping of entity types to placeholders | |
# entity_mapping = { | |
# "NAME": "<NAME>", | |
# "EMAIL": "<EMAIL>", | |
# "IP_ADDRESS": "<IP_ADDRESS>", | |
# } | |
# # Initialize variables | |
# redacted_text = "" | |
# current_entity = None | |
# last_token_was_special = False | |
# # Redact entities in the original text | |
# for token, label in zip(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]), labels): | |
# if token.startswith("Ġ"): | |
# last_token_was_special = True | |
# token = token[1:] # Remove the leading "Ġ" character | |
# else: | |
# last_token_was_special = False | |
# # Add space if the last token was a special token and the current token does not start with "<" | |
# if last_token_was_special and not token.startswith("<"): | |
# redacted_text += " " | |
# if label.startswith("B-"): | |
# current_entity = label[2:] | |
# redacted_text += f"{entity_mapping.get(current_entity, current_entity)}" | |
# elif label.startswith("I-") and current_entity is not None: | |
# pass # Skip intermediate tokens of the entity | |
# else: | |
# current_entity = None | |
# redacted_text += token | |
# # Split the redacted text into lines and add indentation | |
# redacted_lines = redacted_text.split("Ċ") | |
# formatted_redacted_text = "" | |
# indentation = 0 | |
# for line in redacted_lines: | |
# if "{" in line: | |
# formatted_redacted_text += " " * indentation + line + "\n" | |
# indentation += 1 | |
# elif "}" in line: | |
# indentation -= 1 | |
# formatted_redacted_text += " " * indentation + line + "\n" | |
# else: | |
# formatted_redacted_text += " " * indentation + line + "\n" | |
# # Remove any remaining special characters | |
# formatted_redacted_text = formatted_redacted_text.replace("Ġ", "") | |
# # Write the redacted code back to the file using UTF-8 encoding | |
# output_code_file = "redacted_code.java" | |
# #output_code_file = "x.py" | |
# with open(output_code_file, "w", encoding="utf-8") as file: | |
# file.write(formatted_redacted_text.strip()) | |
# # Print the redacted text | |
# print("Redacted Text:", formatted_redacted_text.strip()) | |