awacke1 commited on
Commit
aa29d50
·
verified ·
1 Parent(s): c7cbf62

Create presidio_helpers.py

Browse files
Files changed (1) hide show
  1. presidio_helpers.py +164 -0
presidio_helpers.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Helper methods for the Presidio Streamlit app
3
+ """
4
+ from typing import List, Optional, Tuple
5
+ import logging
6
+ import streamlit as st
7
+ from presidio_analyzer import (
8
+ AnalyzerEngine,
9
+ RecognizerResult,
10
+ RecognizerRegistry,
11
+ PatternRecognizer,
12
+ Pattern,
13
+ )
14
+ from presidio_analyzer.nlp_engine import NlpEngine
15
+ from presidio_anonymizer import AnonymizerEngine
16
+ from presidio_anonymizer.entities import OperatorConfig
17
+
18
# Module-level logger shared by all helpers in this file.
logger = logging.getLogger("presidio-streamlit")
19
+
20
@st.cache_resource
def nlp_engine_and_registry(
    model_family: str,
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """Load the NER model and build a recognizer registry for it.

    Args:
        model_family: One of ``"spacy"``, ``"flair"`` or ``"huggingface"``
            (case-insensitive).
        model_path: Model name or path understood by the chosen framework.

    Returns:
        A ``(model, registry)`` tuple.
        NOTE(review): the first element is the framework-native object
        (spaCy ``Language``, Flair ``SequenceTagger``, or a transformers
        pipeline), not a presidio ``NlpEngine`` as the annotation claims —
        confirm how callers use it before tightening the type.

    Raises:
        ValueError: If ``model_family`` is not one of the supported values.
        Exception: Framework load errors are logged and re-raised.
    """
    family = model_family.lower()

    # Per-family configuration. The framework import is deferred into the
    # selected branch so only the chosen backend needs to be installed.
    if family == "spacy":
        import spacy

        loader = spacy.load
        recognizer_name = "spacy_recognizer"
        package, display_name = "spacy", "spaCy"
        entities = ["PERSON", "LOCATION", "ORGANIZATION", "DATE_TIME", "NRP"]
    elif family == "flair":
        from flair.models import SequenceTagger

        loader = SequenceTagger.load
        recognizer_name = "flair_recognizer"
        package, display_name = "flair", "Flair"
        entities = ["PERSON", "LOCATION", "ORGANIZATION"]
    elif family == "huggingface":
        from transformers import pipeline

        def loader(path):
            # Token-classification ("ner") pipeline for the given checkpoint.
            return pipeline("ner", model=path, tokenizer=path)

        recognizer_name = "huggingface_recognizer"
        package, display_name = "transformers", "HuggingFace"
        entities = ["PERSON", "LOCATION", "ORGANIZATION", "DATE_TIME"]
    else:
        raise ValueError(f"Model family {model_family} not supported")

    registry = RecognizerRegistry()
    try:
        model = loader(model_path)
        registry.load_predefined_recognizers()
        registry.add_recognizer_from_dict({
            "name": recognizer_name,
            "supported_language": "en",
            "supported_entities": entities,
            "model": model_path,
            "package": package,
        })
        return model, registry
    except Exception as e:
        # Lazy %-style args avoid formatting cost when the level is disabled.
        logger.error(
            "Failed to load %s model %s: %s", display_name, model_path, e
        )
        raise
80
+
81
@st.cache_resource
def analyzer_engine(
    model_family: str,
    model_path: str,
) -> AnalyzerEngine:
    """Return an ``AnalyzerEngine`` configured for the requested model.

    Delegates model/registry construction to ``nlp_engine_and_registry``
    and wraps the resulting registry in a fresh analyzer.
    """
    _nlp_model, recognizer_registry = nlp_engine_and_registry(
        model_family, model_path
    )
    # NOTE(review): the loaded NLP model itself is discarded here — only the
    # registry reaches the AnalyzerEngine. Confirm this is intentional.
    return AnalyzerEngine(registry=recognizer_registry)
90
+
91
@st.cache_data
def get_supported_entities(model_family: str, model_path: str) -> List[str]:
    """Return the entity types advertised for the selected model family.

    Unknown families fall back to the minimal common entity set.
    ``model_path`` is accepted for cache-key purposes but does not affect
    the result.
    """
    entities_by_family = {
        "spacy": ["PERSON", "LOCATION", "ORGANIZATION", "DATE_TIME", "NRP"],
        "huggingface": ["PERSON", "LOCATION", "ORGANIZATION", "DATE_TIME"],
        "flair": ["PERSON", "LOCATION", "ORGANIZATION"],
    }
    return entities_by_family.get(
        model_family.lower(), ["PERSON", "LOCATION", "ORGANIZATION"]
    )
101
+
102
def analyze(
    analyzer: AnalyzerEngine,
    text: str,
    entities: List[str],
    language: str,
    score_threshold: float,
    return_decision_process: bool,
    allow_list: List[str],
    deny_list: List[str],
) -> List[RecognizerResult]:
    """Run PII analysis on ``text`` and post-filter hits with allow/deny lists.

    A detection whose matched text contains any allow-listed word (as a
    case-insensitive substring) is dropped. When ``deny_list`` is non-empty,
    only detections containing a deny-listed word are kept.

    NOTE(review): with a non-empty deny list, every detection that does NOT
    contain a deny-listed substring is discarded — confirm this inversion of
    the usual deny-list semantics is intended.
    """
    raw_results = analyzer.analyze(
        text=text,
        entities=entities,
        language=language,
        score_threshold=score_threshold,
        return_decision_process=return_decision_process,
    )

    # Lowercase both lists once instead of per detection.
    allowed_words = [word.lower() for word in allow_list]
    denied_words = [word.lower() for word in deny_list]

    kept: List[RecognizerResult] = []
    for hit in raw_results:
        snippet = text[hit.start:hit.end].lower()
        if any(word in snippet for word in allowed_words):
            continue  # allow-listed: never report
        if any(word in snippet for word in denied_words):
            kept.append(hit)
        elif not denied_words:
            kept.append(hit)
    return kept
131
+
132
def anonymize(
    text: str,
    operator: str,
    analyze_results: List[RecognizerResult],
    mask_char: str = "*",
    number_of_chars: int = 15,
) -> dict:
    """Anonymize detected PHI entities in ``text``.

    Args:
        text: The original text to transform.
        operator: Presidio anonymizer operator name (e.g. "replace", "mask").
        analyze_results: Detections to anonymize.
        mask_char: Character used when ``operator == "mask"``.
        number_of_chars: How many characters to mask when masking.

    Returns:
        The anonymizer engine's result.
        NOTE(review): annotated ``dict`` but presumably a presidio result
        object — confirm against the anonymizer API.
    """
    # Only the "mask" operator takes parameters here; all others get an
    # empty parameter dict.
    if operator == "mask":
        params = {
            "masking_char": mask_char,
            "chars_to_mask": number_of_chars,
        }
    else:
        params = {}

    operators = {"DEFAULT": OperatorConfig(operator, params)}
    engine = AnonymizerEngine()
    return engine.anonymize(
        text=text,
        analyzer_results=analyze_results,
        operators=operators,
    )
154
+
155
def create_ad_hoc_deny_list_recognizer(
    deny_list: Optional[List[str]] = None,
) -> Optional[PatternRecognizer]:
    """Build a recognizer that flags deny-listed terms as GENERIC_PII.

    Returns ``None`` when ``deny_list`` is empty or not provided.
    """
    if not deny_list:
        return None
    return PatternRecognizer(
        supported_entity="GENERIC_PII", deny_list=deny_list
    )