|
import argparse |
|
import json |
|
import re |
|
import uuid |
|
from pathlib import Path |
|
import gensim |
|
from concrete.ml.common.serialization.loaders import load |
|
from transformers import AutoTokenizer, AutoModel |
|
from utils_demo import get_batch_text_representation |
|
|
|
def load_models():
    """Load the embedding model, its tokenizer, and the NER classifier.

    The tokenizer and the embedding model are both obtained through
    ``from_pretrained`` with the ``obi/deid_roberta_i2b2`` identifier. The
    classifier is deserialized from ``models/cml_logreg.model``, resolved
    relative to the directory containing this script.

    Returns:
        A tuple ``(embeddings_model, tokenizer, fhe_ner_detection)``.
    """
    checkpoint = "obi/deid_roberta_i2b2"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    embeddings_model = AutoModel.from_pretrained(checkpoint)

    # The serialized classifier lives next to this script, not in the CWD.
    classifier_path = Path(__file__).parent / "models" / "cml_logreg.model"
    with open(classifier_path, "r") as serialized_model:
        fhe_ner_detection = load(file=serialized_model)

    return embeddings_model, tokenizer, fhe_ner_detection
|
|
|
def anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection):
    """Replace tokens flagged by the classifier with short random identifiers.

    The text is split into word-like tokens, runs of whitespace/common
    punctuation, and single leftover characters. Each token that starts with
    a word character is converted with ``get_batch_text_representation`` and
    scored with ``fhe_ner_detection.predict_proba``; when the value at
    ``[0][1]`` is at least 0.5 the token is replaced by the first 8
    characters of a ``uuid4`` string. The same token always receives the
    same replacement within one call. All other tokens are copied verbatim,
    so concatenating the output tokens preserves every character that was
    not replaced.

    Args:
        text: The input text to anonymize.
        embeddings_model: Model forwarded to ``get_batch_text_representation``.
        tokenizer: Tokenizer forwarded to ``get_batch_text_representation``.
        fhe_ner_detection: Classifier exposing ``predict_proba``.

    Returns:
        A tuple ``(anonymized_text, uuid_map)`` where ``uuid_map`` maps each
        replaced original token to its identifier.
    """
    # The final ``.`` alternative is a one-character catch-all. Without it,
    # ``re.findall`` silently skips characters such as ``(``, ``#`` or ``$``
    # that match neither of the first two alternatives, and they would vanish
    # from the output. Being last, it never changes how the first two
    # alternatives tokenize the text.
    token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+|.)"
    tokens = re.findall(token_pattern, text, flags=re.DOTALL)

    uuid_map = {}
    processed_tokens = []

    for token in tokens:
        # Only tokens that begin with a word character are classified;
        # whitespace and punctuation pass through untouched.
        if not (token.strip() and re.match(r"\w+", token)):
            processed_tokens.append(token)
            continue

        x = get_batch_text_representation([token], embeddings_model, tokenizer)
        prediction_proba = fhe_ner_detection.predict_proba(x)
        # NOTE(review): column 1 is presumably the "is an entity" class;
        # confirm against how the classifier was trained.
        probability = prediction_proba[0][1]

        if probability >= 0.5:
            # Reuse the identifier so repeated mentions stay linkable.
            if token not in uuid_map:
                uuid_map[token] = str(uuid.uuid4())[:8]
            processed_tokens.append(uuid_map[token])
        else:
            processed_tokens.append(token)

    anonymized_text = ''.join(processed_tokens)
    return anonymized_text, uuid_map
|
|
|
def main():
    """Command-line entry point: anonymize a text file and save the results.

    Reads the file given as the ``file_path`` argument, then writes:

    * a copy of the input to ``files/original_document.txt`` next to this
      script,
    * the anonymized text to ``files/anonymized_document.txt`` next to this
      script,
    * the token-to-identifier mapping to ``<input stem>_uuid_mapping.json``
      in the current working directory.

    The ``files`` directory is created if it does not exist yet.
    """
    parser = argparse.ArgumentParser(
        description="Anonymize named entities in a text file and save the mapping to a JSON file."
    )
    parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
    args = parser.parse_args()

    embeddings_model, tokenizer, fhe_ner_detection = load_models()

    with open(args.file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Make sure the output directory exists; otherwise the writes below fail
    # with FileNotFoundError on a fresh checkout.
    output_dir = Path(__file__).parent / "files"
    output_dir.mkdir(parents=True, exist_ok=True)

    original_file_path = output_dir / "original_document.txt"
    with open(original_file_path, 'w', encoding='utf-8') as original_file:
        original_file.write(text)

    anonymized_text, uuid_map = anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection)

    anonymized_file_path = output_dir / "anonymized_document.txt"
    with open(anonymized_file_path, 'w', encoding='utf-8') as anonymized_file:
        anonymized_file.write(anonymized_text)

    # A bare file name: the mapping is written relative to the current
    # working directory, not next to the input file or this script.
    mapping_path = Path(args.file_path).stem + "_uuid_mapping.json"
    with open(mapping_path, 'w', encoding='utf-8') as mapping_file:
        json.dump(uuid_map, mapping_file, indent=4, sort_keys=True)

    print(f"Original text saved to {original_file_path}")
    print(f"Anonymized text saved to {anonymized_file_path}")
    print(f"UUID mapping saved to {mapping_path}")
|
|
|
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|