Create presidio_helpers.py
presidio_helpers.py
ADDED
@@ -0,0 +1,164 @@
"""
Helper methods for the Presidio Streamlit app
"""
from typing import List, Optional, Tuple
import logging
import streamlit as st
from presidio_analyzer import (
    AnalyzerEngine,
    RecognizerResult,
    RecognizerRegistry,
    PatternRecognizer,
)
from presidio_analyzer.nlp_engine import NlpEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

logger = logging.getLogger("presidio-streamlit")

@st.cache_resource
def nlp_engine_and_registry(
    model_family: str,
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """Load the requested model and build a recognizer registry for it."""
    registry = RecognizerRegistry()

    if model_family.lower() == "spacy":
        import spacy

        try:
            nlp = spacy.load(model_path)
            registry.load_predefined_recognizers()
            # NOTE: add_recognizer_from_dict is assumed here; on Presidio
            # versions without it, wrap the model in an EntityRecognizer
            # subclass and register it via registry.add_recognizer().
            registry.add_recognizer_from_dict({
                "name": "spacy_recognizer",
                "supported_language": "en",
                "supported_entities": ["PERSON", "LOCATION", "ORGANIZATION", "DATE_TIME", "NRP"],
                "model": model_path,
                "package": "spacy",
            })
            return nlp, registry
        except Exception as e:
            logger.error(f"Failed to load spaCy model {model_path}: {e}")
            raise
    elif model_family.lower() == "flair":
        from flair.models import SequenceTagger

        try:
            tagger = SequenceTagger.load(model_path)
            registry.load_predefined_recognizers()
            registry.add_recognizer_from_dict({
                "name": "flair_recognizer",
                "supported_language": "en",
                "supported_entities": ["PERSON", "LOCATION", "ORGANIZATION"],
                "model": model_path,
                "package": "flair",
            })
            return tagger, registry
        except Exception as e:
            logger.error(f"Failed to load Flair model {model_path}: {e}")
            raise
    elif model_family.lower() == "huggingface":
        from transformers import pipeline

        try:
            nlp = pipeline("ner", model=model_path, tokenizer=model_path)
            registry.load_predefined_recognizers()
            registry.add_recognizer_from_dict({
                "name": "huggingface_recognizer",
                "supported_language": "en",
                "supported_entities": ["PERSON", "LOCATION", "ORGANIZATION", "DATE_TIME"],
                "model": model_path,
                "package": "transformers",
            })
            return nlp, registry
        except Exception as e:
            logger.error(f"Failed to load HuggingFace model {model_path}: {e}")
            raise
    else:
        raise ValueError(f"Model family {model_family} is not supported")

@st.cache_resource
def analyzer_engine(
    model_family: str,
    model_path: str,
) -> AnalyzerEngine:
    """Create the Analyzer Engine instance based on the requested model."""
    nlp_engine, registry = nlp_engine_and_registry(model_family, model_path)
    # The registry carries the model-specific recognizers; AnalyzerEngine
    # uses its default (spaCy-based) NLP engine for tokenization, since the
    # object returned above is the raw model rather than an NlpEngine.
    analyzer = AnalyzerEngine(registry=registry)
    return analyzer

@st.cache_data
def get_supported_entities(model_family: str, model_path: str) -> List[str]:
    """Return the entity types supported by the selected model."""
    if model_family.lower() == "spacy":
        return ["PERSON", "LOCATION", "ORGANIZATION", "DATE_TIME", "NRP"]
    elif model_family.lower() == "huggingface":
        return ["PERSON", "LOCATION", "ORGANIZATION", "DATE_TIME"]
    elif model_family.lower() == "flair":
        return ["PERSON", "LOCATION", "ORGANIZATION"]
    # Fallback for unrecognized model families
    return ["PERSON", "LOCATION", "ORGANIZATION"]

def analyze(
    analyzer: AnalyzerEngine,
    text: str,
    entities: List[str],
    language: str,
    score_threshold: float,
    return_decision_process: bool,
    allow_list: List[str],
    deny_list: List[str],
) -> List[RecognizerResult]:
    """Analyze text for PHI entities, honoring the allow and deny lists."""
    # Deny-list terms are flagged through an ad-hoc recognizer so they are
    # detected in addition to, not instead of, the model's own results.
    # (Include "GENERIC_PII" in `entities` for those matches to surface.)
    ad_hoc_recognizer = create_ad_hoc_deny_list_recognizer(deny_list)
    results = analyzer.analyze(
        text=text,
        entities=entities,
        language=language,
        score_threshold=score_threshold,
        return_decision_process=return_decision_process,
        ad_hoc_recognizers=[ad_hoc_recognizer] if ad_hoc_recognizer else None,
    )
    # Apply the allow list: skip any detection whose text contains an allowed term
    filtered_results = []
    for result in results:
        text_snippet = text[result.start : result.end].lower()
        if any(word.lower() in text_snippet for word in allow_list):
            continue
        filtered_results.append(result)
    return filtered_results

def anonymize(
    text: str,
    operator: str,
    analyze_results: List[RecognizerResult],
    mask_char: str = "*",
    number_of_chars: int = 15,
):
    """Anonymize detected PHI entities in the text; returns the engine result."""
    anonymizer = AnonymizerEngine()
    if operator == "mask":
        # The mask operator requires its masking parameters to be spelled out
        operator_config = {
            "DEFAULT": OperatorConfig(operator, {
                "masking_char": mask_char,
                "chars_to_mask": number_of_chars,
                "from_end": False,
            })
        }
    else:
        operator_config = {"DEFAULT": OperatorConfig(operator, {})}
    return anonymizer.anonymize(
        text=text,
        analyzer_results=analyze_results,
        operators=operator_config,
    )

def create_ad_hoc_deny_list_recognizer(
    deny_list: Optional[List[str]] = None,
) -> Optional[PatternRecognizer]:
    """Create a recognizer which flags every term on the deny list."""
    if not deny_list:
        return None
    deny_list_recognizer = PatternRecognizer(
        supported_entity="GENERIC_PII", deny_list=deny_list
    )
    return deny_list_recognizer
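
For context, here is a minimal sketch of how these helpers chain together (analyzer_engine -> analyze -> anonymize). It assumes presidio-analyzer, presidio-anonymizer, and streamlit are installed; the en_core_web_lg model name and the sample text are illustrative assumptions, not part of this commit.

# Usage sketch (illustrative only; names below are assumptions)
from presidio_helpers import analyze, analyzer_engine, anonymize

analyzer = analyzer_engine("spacy", "en_core_web_lg")

text = "John Smith visited Seattle on January 5th."
results = analyze(
    analyzer=analyzer,
    text=text,
    entities=["PERSON", "LOCATION", "DATE_TIME", "GENERIC_PII"],
    language="en",
    score_threshold=0.35,
    return_decision_process=False,
    allow_list=["Seattle"],    # never flag this term
    deny_list=["John Smith"],  # always flag this term
)

anonymized = anonymize(
    text=text,
    operator="replace",  # or "mask", "redact", "hash"
    analyze_results=results,
)
print(anonymized.text)  # e.g. "<PERSON> visited Seattle on <DATE_TIME>."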