"""Distributed PII NER inference over a code dataset with accelerate.

Each process runs PiiNERPipeline on its shard of the dataloader and writes
the annotated batches to parquet files under the output directory.
"""
import os
import time
from dataclasses import dataclass, field
from typing import Optional

import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

import datasets
from datasets import load_dataset, Dataset
from accelerate import Accelerator
from transformers import AutoModelForTokenClassification, AutoTokenizer, HfArgumentParser

from utils import PiiNERPipeline

@dataclass
class PipelineArgs:
    model_name: Optional[str] = field(default="./", metadata={"help": "the model name"})
    process_batch_size: int = field(default=10_000, metadata={"help": "number of files each worker processes per iteration"})
    batch_size: Optional[int] = field(default=1024, metadata={"help": "inference batch size for the NER pipeline"})
    dataset: Optional[str] = field(default="./", metadata={"help": "dataset to process"})
    subset: Optional[str] = field(default="data/python/", metadata={"help": "dataset subdirectory"})
    out_path: Optional[str] = field(default="./results/", metadata={"help": "path for the output parquet files"})
    email = "[email protected]"

def main():
    """Launch with:
    >>> accelerate config
    >>> accelerate launch ner_inference.py --process_batch_size=8 --out_path=processed_dataset
    """
    parser = HfArgumentParser(PipelineArgs)
    args = parser.parse_args_into_dataclasses()[0]
    accelerator = Accelerator()
    # Name the output directory after the subset's last path component,
    # e.g. "data/python/" -> "python".
    out_dir = os.path.join(args.out_path, args.subset.strip("/").split("/")[-1])
    if accelerator.is_main_process:
        os.makedirs(out_dir, exist_ok=True)
    dataset = load_dataset(args.dataset, data_dir=args.subset, use_auth_token=True, split="train", num_proc=12)
    # Attach a string id to every example and replace missing star counts with 0.
    dataset = dataset.map(
        lambda example, idx: {
            "id": f"{idx}",
            "max_stars_count": example["max_stars_count"] if example["max_stars_count"] is not None else 0,
        },
        with_indices=True,
        num_proc=12,
    )
    # Choose a process batch size from the per-process shard size; note that
    # this heuristic overrides the --process_batch_size CLI argument.
    shard_size = len(dataset) // accelerator.num_processes
    if shard_size > 1_000_000:
        process_batch_size = 200_000
    elif shard_size > 100_000:
        process_batch_size = 100_000
    else:
        process_batch_size = 10_000
    model = AutoModelForTokenClassification.from_pretrained(args.model_name, use_auth_token=True)
    id_to_label = model.config.id2label
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_auth_token=True)

    # Keep only the columns the pipeline and the output files need.
    columns = dataset.column_names
    dataset = dataset.remove_columns(
        [col for col in columns if col not in ["content", "id", "max_stars_repo_name", "max_stars_repo_path", "max_stars_count"]]
    )
    dataloader = DataLoader(dataset, batch_size=process_batch_size, shuffle=False, num_workers=4)
    # prepare() shards the dataloader so each process only sees its slice.
    model, dataloader = accelerator.prepare(model, dataloader)
    pipeline = PiiNERPipeline(
        model,
        tokenizer=tokenizer,
        batch_size=args.batch_size,
        window_size=512,
        device=accelerator.local_process_index,
        num_workers=1,
        use_auth_token=True,
        id_to_label=id_to_label,
        window_overlap=False,
        bf16=True,
    )
    for i, batch in enumerate(tqdm(dataloader)):
        # Accelerate pads the last batch so all processes get equal batches;
        # the padding wraps around to the first samples, so ids stop being
        # monotonic. Find the wrap point and drop the duplicated tail.
        if i == len(dataloader) - 1 and int(batch["id"][0]) > int(batch["id"][-1]):
            for j in range(len(batch["id"]) - 1):
                if int(batch["id"][j]) > int(batch["id"][j + 1]):
                    stop_index = j + 1
                    for key in batch:
                        batch[key] = batch[key][:stop_index]
                    break
        result = list(pipeline(datasets.Dataset.from_dict(batch)))
        # Copy the original columns back onto each pipeline output.
        for k, element in enumerate(result):
            for key in batch:
                element[key] = batch[key][k]
        processed_dataset = Dataset.from_pandas(pd.DataFrame(result))
        processed_dataset.to_parquet(f"{out_dir}/job_{accelerator.process_index}_{i}.parquet")


if __name__ == "__main__":
    main()
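

# ---------------------------------------------------------------------------
# Standalone demo: run PiiNERPipeline on a single local code file and redact
# the detected PII with the replacement table from pii_redaction.utils.
# ---------------------------------------------------------------------------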
import torch
from datasets import Dataset
from transformers import AutoModelForTokenClassification, AutoTokenizer

from privacy.util.code_detect.ner.ner_inference import PiiNERPipeline
from privacy.util.code_detect.ner.pii_redaction.utils import get_replacements, redact_pii_batch

def main():
    # Paths to the local model and the input code file
    model_path = "pii_inference/nermodel"
    code_file_path = "input_code.java"

    # Load the model and tokenizer
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Create the NER pipeline
    pipeline = PiiNERPipeline(
        model,
        tokenizer=tokenizer,
        batch_size=1024,
        window_size=512,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        num_workers=1,
        id_to_label=model.config.id2label,
        window_overlap=False,
        bf16=True,
    )
    # Read the input code file
    with open(code_file_path, "r") as file:
        code = file.read()

    # Split the code into chunks on ". " (a crude sentence-style split; most
    # source files contain few such delimiters and pass through as one chunk)
    sentences = code.split(". ")
    print(sentences, "SENTENCES")

    # Give each chunk an id and build a Dataset the pipeline can consume
    ids = list(range(len(sentences)))
    dataset = Dataset.from_dict({"content": sentences, "id": ids})

    # Process the chunks with the NER pipeline (returns a generator)
    result = pipeline(dataset)
    # Materialize the generator and print the raw entity predictions
    results = list(result)
    print(results, "RESULT")

    # Redact the detected PII using the replacement table
    replacements = get_replacements()
    redacted_results = redact_pii_batch(results, replacements)
    print(redacted_results, "redacted_results")


if __name__ == "__main__":
    main()
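
# A minimal sketch of persisting the redacted output, for reference only.
# The structure returned by redact_pii_batch is not shown in this file, so
# the "new_content" field below is an assumption about its output, not a
# confirmed part of the API:
#
#     with open("redacted_" + code_file_path, "w") as f:
#         f.write("\n".join(redacted_results["new_content"]))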