File size: 3,210 Bytes
df6182e d0b1031 df6182e 628fe8f d0b1031 df6182e d0b1031 df6182e d0b1031 df6182e d0b1031 df6182e 628fe8f df6182e d0b1031 df6182e 628fe8f df6182e d0b1031 628fe8f df6182e 628fe8f df6182e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import argparse
import json
import re
import uuid
from pathlib import Path
import gensim
from concrete.ml.common.serialization.loaders import load
from transformers import AutoTokenizer, AutoModel
from utils_demo import get_batch_text_representation
def load_models():
    """Load everything needed to detect sensitive tokens.

    Returns:
        A tuple ``(embeddings_model, tokenizer, fhe_ner_detection)`` holding
        the ``obi/deid_roberta_i2b2`` transformer, its tokenizer, and the
        classifier deserialized from ``models/cml_logreg.model`` located next
        to this script.
    """
    checkpoint = "obi/deid_roberta_i2b2"
    classifier_path = Path(__file__).parent / "models" / "cml_logreg.model"

    # The tokenizer and the embedding model come from the same checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    embeddings_model = AutoModel.from_pretrained(checkpoint)

    # The serialized classifier is read through a text-mode handle.
    with open(classifier_path, "r") as serialized:
        fhe_ner_detection = load(file=serialized)

    return embeddings_model, tokenizer, fhe_ner_detection
def anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection):
    """Replace tokens flagged by the classifier with short random identifiers.

    The text is split into word-like tokens and separator runs. Every
    word-like token is embedded with ``get_batch_text_representation`` and
    scored with ``fhe_ner_detection.predict_proba``. A token whose score in
    column 1 is at least 0.5 is replaced by an 8-character identifier; the
    same token always maps to the same identifier within one call. All other
    characters are copied through unchanged.

    Args:
        text: The text to anonymize.
        embeddings_model: Model forwarded to ``get_batch_text_representation``.
        tokenizer: Tokenizer forwarded to ``get_batch_text_representation``.
        fhe_ner_detection: Classifier exposing ``predict_proba``.

    Returns:
        A tuple ``(anonymized_text, uuid_map)`` where ``uuid_map`` maps each
        replaced token to its identifier.
    """
    # The trailing "." alternative keeps any character the first two
    # alternatives do not match (brackets, "#", "$", "%", ...). Without it
    # ``re.findall`` skips those characters and they vanish from the output.
    # It is tried last, so it never changes how words and separators split.
    token_re = re.compile(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+|.)", re.DOTALL)
    word_re = re.compile(r"\w+")

    uuid_map = {}
    processed_tokens = []

    for token in token_re.findall(text):
        # Separators and stray symbols are copied through untouched.
        if not (token.strip() and word_re.match(token)):
            processed_tokens.append(token)
            continue

        x = get_batch_text_representation([token], embeddings_model, tokenizer)
        prediction_proba = fhe_ner_detection.predict_proba(x)
        # Column 1 is presumably the "sensitive" class -- confirm against the
        # trained model's label order.
        probability = prediction_proba[0][1]

        if probability >= 0.5:
            if token not in uuid_map:
                # NOTE(review): truncating to 8 hex characters makes
                # collisions between different tokens possible, if unlikely.
                uuid_map[token] = str(uuid.uuid4())[:8]
            processed_tokens.append(uuid_map[token])
        else:
            processed_tokens.append(token)

    anonymized_text = ''.join(processed_tokens)
    return anonymized_text, uuid_map
def main():
    """Anonymize a text file from the command line.

    Reads the file given as the ``file_path`` argument, then writes three
    outputs: a copy of the original text and the anonymized text into the
    ``files`` directory next to this script, and the token-to-identifier
    mapping as ``<input stem>_uuid_mapping.json`` in the current working
    directory.
    """
    parser = argparse.ArgumentParser(description="Anonymize named entities in a text file and save the mapping to a JSON file.")
    parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
    args = parser.parse_args()

    embeddings_model, tokenizer, fhe_ner_detection = load_models()

    # Read the input file
    with open(args.file_path, 'r', encoding='utf-8') as input_file:
        text = input_file.read()

    # Both documents go to the same directory; create it if it is missing so
    # a fresh checkout does not fail with FileNotFoundError.
    output_dir = Path(__file__).parent / "files"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save the original text to its specified file
    original_file_path = output_dir / "original_document.txt"
    with open(original_file_path, 'w', encoding='utf-8') as original_file:
        original_file.write(text)

    # Anonymize the text
    anonymized_text, uuid_map = anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection)

    # Save the anonymized text to its specified file
    anonymized_file_path = output_dir / "anonymized_document.txt"
    with open(anonymized_file_path, 'w', encoding='utf-8') as anonymized_file:
        anonymized_file.write(anonymized_text)

    # Save the UUID mapping to a JSON file (relative to the working directory)
    mapping_path = Path(args.file_path).stem + "_uuid_mapping.json"
    with open(mapping_path, 'w', encoding='utf-8') as mapping_file:
        json.dump(uuid_map, mapping_file, indent=4, sort_keys=True)

    print(f"Original text saved to {original_file_path}")
    print(f"Anonymized text saved to {anonymized_file_path}")
    print(f"UUID mapping saved to {mapping_path}")
# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|