import os
import time
from dataclasses import dataclass, field
from typing import Optional

import datasets
import pandas as pd
import torch
from accelerate import Accelerator
from datasets import Dataset, load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForTokenClassification, AutoTokenizer, HfArgumentParser

from utils import PiiNERPipeline


@dataclass
class PipelineArgs:
    model_name: Optional[str] = field(default="./", metadata={"help": "the model name"})
    process_batch_size: int = field(default=10_000, metadata={"help": "files per worker"})
    batch_size: Optional[int] = field(default=1024, metadata={"help": "batch size"})
    dataset: Optional[str] = field(default="./", metadata={"help": "dataset"})
    subset: Optional[str] = field(default="data/python/", metadata={"help": "dataset subdirectory"})
    out_path: Optional[str] = field(default="./results/", metadata={"help": "path for output"})


def main():
    """Launch code:
    >>>> accelerate config
    >>>> accelerate launch ner_inference.py --process_batch_size=8 --out_path=processed_dataset
    """
    parser = HfArgumentParser(PipelineArgs)
    args = parser.parse_args()

    accelerator = Accelerator()
    # Name the output directory after the last path component of the subset.
    out_dir = f"{args.out_path}{args.subset.strip('/').split('/')[-1]}"
    if accelerator.is_main_process:
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

    dataset = load_dataset(args.dataset, data_dir=args.subset, use_auth_token=True, split="train", num_proc=12)
    # Add a sequential string id and replace missing star counts with 0.
    dataset = dataset.map(
        lambda example, idx: {
            "id": f"{idx}",
            "max_stars_count": example["max_stars_count"] if example["max_stars_count"] is not None else 0,
        },
        with_indices=True,
        num_proc=12,
    )

    # Pick the per-process batch size from the shard size (dataset split across 8 processes).
    shard_size = len(dataset) / 8
    if shard_size > 1_000_000:
        process_batch_size = 200_000
    elif shard_size > 100_000:
        process_batch_size = 100_000
    else:
        process_batch_size = 10_000

    model = AutoModelForTokenClassification.from_pretrained(args.model_name, use_auth_token=True)
    id_to_label = model.config.id2label
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_auth_token=True)

    columns = dataset.column_names
    dataset = dataset.remove_columns(
        [col for col in columns if col not in ["content", "id", "max_stars_repo_name", "max_stars_repo_path", "max_stars_count"]]
    )
    dataloader = DataLoader(dataset, batch_size=process_batch_size, shuffle=False, num_workers=4)

    model, dataloader = accelerator.prepare(model, dataloader)

    pipeline = PiiNERPipeline(
        model,
        tokenizer=tokenizer,
        batch_size=args.batch_size,
        window_size=512,
        device=accelerator.local_process_index,
        num_workers=1,
        use_auth_token=True,
        id_to_label=id_to_label,
        window_overlap=False,
        bf16=True,
    )

    num_samples = 0
    for i, batch in enumerate(tqdm(dataloader)):
        # Last batches are filled to equal length - remove the filling.
        if i == len(dataloader) - 1 and int(batch["id"][0]) > int(batch["id"][-1]):
            for j in range(len(batch["id"]) - 1):
                if int(batch["id"][j]) > int(batch["id"][j + 1]):
                    stop_index = j + 1
            for key in batch:
                batch[key] = batch[key][:stop_index]
        result = list(pipeline(datasets.Dataset.from_dict(batch)))
        # Add the original columns back to each processed element.
        for k, element in enumerate(result):
            for key in batch:
                element[key] = batch[key][k]
        processed_dataset = Dataset.from_dict(pd.DataFrame(result))
        processed_dataset.to_parquet(f"{out_dir}/job_{accelerator.process_index}_{i}.parquet")


if __name__ == "__main__":
    main()
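

# ---------------------------------------------------------------------------
# The standalone script below is a separate, single-file example: it runs the
# same PiiNERPipeline locally on one source file (no accelerate, no dataset
# sharding) and then redacts the detected PII spans using the replacement map
# from pii_redaction.utils.
# ---------------------------------------------------------------------------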
import torch
from datasets import Dataset
from transformers import AutoModelForTokenClassification, AutoTokenizer

from privacy.util.code_detect.ner.ner_inference import PiiNERPipeline
from privacy.util.code_detect.ner.pii_redaction.utils import get_replacements, redact_pii_batch


def main():
    # Specify the path to your local model and input code file
    model_path = "pii_inference/nermodel"
    code_file_path = "input_code.java"

    # Load the model and tokenizer
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Create the NER pipeline
    pipeline = PiiNERPipeline(
        model,
        tokenizer=tokenizer,
        batch_size=1024,
        window_size=512,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        num_workers=1,
        id_to_label=model.config.id2label,
        window_overlap=False,
        bf16=True,
    )

    # Read the input code file
    with open(code_file_path, "r") as file:
        code = file.read()

    # Split the code into sentences
    sentences = code.split(". ")
    print(sentences, "SENTENCES")

    # Create an id list
    ids = list(range(len(sentences)))

    # Create a Dataset object from the sentences
    dataset = Dataset.from_dict({"content": sentences, "id": ids})

    # Process the sentences with the NER pipeline
    result = pipeline(dataset)
    replacements = get_replacements()

    # Convert the generator to a list and print the results
    results = list(result)
    print(results, "RESULT")

    # Redact the PII from the results
    redacted_results = redact_pii_batch(results, replacements)
    print(redacted_results, "redacted_results")


if __name__ == "__main__":
    main()
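
# A possible follow-up, left as a commented-out sketch because the shape of
# redact_pii_batch's return value is an assumption here (the "new_content"
# key is not confirmed by this file): if it returns a batch dict with a
# redacted-text column, the redacted code could be written back out like so:
#
#     redacted_code = ". ".join(redacted_results["new_content"])
#     with open("input_code_redacted.java", "w") as out_file:
#         out_file.write(redacted_code)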