Spaces:
Sleeping
Sleeping
langdonholmes
committed on
Commit
·
9637704
1
Parent(s):
0e58a18
custom anonymizer as class
Browse files
- .gitignore +0 -1
- app.py +4 -6
- piilo/engines/analyzer.py +21 -19
- piilo/engines/anonymizer.py +2 -1
- piilo/main.py +4 -4
.gitignore
CHANGED
@@ -1,3 +1,2 @@
|
|
1 |
-
__pycache__/*
|
2 |
.ipynb_checkpoints
|
3 |
__pycache__
|
|
|
|
|
1 |
.ipynb_checkpoints
|
2 |
__pycache__
|
app.py
CHANGED
@@ -9,8 +9,8 @@ import pandas as pd
|
|
9 |
import streamlit as st
|
10 |
from annotated_text import annotated_text
|
11 |
|
12 |
-
from piilo.engines.analyzer import
|
13 |
-
from piilo.engines.anonymizer import
|
14 |
|
15 |
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
16 |
warnings.filterwarnings('ignore')
|
@@ -26,14 +26,12 @@ def analyzer_engine():
|
|
26 |
{'lang_code': 'en', 'model_name': 'en_student_name_detector'}],
|
27 |
}
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
return analyzer
|
32 |
|
33 |
@st.cache(allow_output_mutation=True)
|
34 |
def anonymizer_engine():
|
35 |
'''Return generate surrogate anonymizer.'''
|
36 |
-
return
|
37 |
|
38 |
def annotate(text, st_analyze_results, st_entities):
|
39 |
tokens = []
|
|
|
9 |
import streamlit as st
|
10 |
from annotated_text import annotated_text
|
11 |
|
12 |
+
from piilo.engines.analyzer import CustomAnalyzer
|
13 |
+
from piilo.engines.anonymizer import SurrogateAnonymizer
|
14 |
|
15 |
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
16 |
warnings.filterwarnings('ignore')
|
|
|
26 |
{'lang_code': 'en', 'model_name': 'en_student_name_detector'}],
|
27 |
}
|
28 |
|
29 |
+
return CustomAnalyzer(configuration=configuration)
|
|
|
|
|
30 |
|
31 |
@st.cache(allow_output_mutation=True)
|
32 |
def anonymizer_engine():
|
33 |
'''Return generate surrogate anonymizer.'''
|
34 |
+
return SurrogateAnonymizer()
|
35 |
|
36 |
def annotate(text, st_analyze_results, st_entities):
|
37 |
tokens = []
|
piilo/engines/analyzer.py
CHANGED
@@ -117,25 +117,27 @@ class CustomSpacyRecognizer(LocalRecognizer):
|
|
117 |
[entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
|
118 |
)
|
119 |
|
120 |
-
|
121 |
-
'''
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
registry = RecognizerRegistry()
|
131 |
-
registry.load_predefined_recognizers(nlp_engine=nlp_engine)
|
132 |
-
registry.add_recognizer(spacy_recognizer)
|
133 |
|
134 |
-
|
135 |
-
|
|
|
|
|
136 |
|
137 |
-
|
138 |
-
|
139 |
-
supported_languages=['en'])
|
140 |
|
141 |
-
|
|
|
|
|
|
|
|
|
|
117 |
[entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
|
118 |
)
|
119 |
|
120 |
+
class CustomAnalyzer(AnalyzerEngine):
|
121 |
+
'''Custom Analyzer Engine for Presidio.'''
|
122 |
+
|
123 |
+
def __init__(self, configuration):
|
124 |
+
|
125 |
+
spacy_recognizer = CustomSpacyRecognizer()
|
126 |
+
|
127 |
+
# Create NLP engine based on configuration
|
128 |
+
provider = NlpEngineProvider(nlp_configuration=configuration)
|
129 |
+
nlp_engine = provider.create_engine()
|
|
|
|
|
|
|
130 |
|
131 |
+
# add rule-based recognizers
|
132 |
+
registry = RecognizerRegistry()
|
133 |
+
registry.load_predefined_recognizers(nlp_engine=nlp_engine)
|
134 |
+
registry.add_recognizer(spacy_recognizer)
|
135 |
|
136 |
+
# remove the nlp engine we passed, to use custom label mappings
|
137 |
+
registry.remove_recognizer('SpacyRecognizer')
|
|
|
138 |
|
139 |
+
super().__init__(
|
140 |
+
nlp_engine=nlp_engine,
|
141 |
+
registry=registry,
|
142 |
+
supported_languages=['en']
|
143 |
+
)
|
piilo/engines/anonymizer.py
CHANGED
@@ -14,6 +14,7 @@ data = Path(__file__).parent.parent.parent / 'data'
|
|
14 |
name_table = data / 'ascii_names.parquet'
|
15 |
|
16 |
logger = logging.getLogger('anonymizer')
|
|
|
17 |
class NameDatabase(NameDataset):
|
18 |
'''A wrapper around the names_dataset.NameDataset class.
|
19 |
'''
|
@@ -45,7 +46,7 @@ class NameDatabase(NameDataset):
|
|
45 |
country = NameWrapper(self.search(last_names)).country
|
46 |
return country if country else None
|
47 |
|
48 |
-
class
|
49 |
'''A wrapper around the presidio_anonymizer.AnonymizerEngine class.
|
50 |
'''
|
51 |
|
|
|
14 |
name_table = data / 'ascii_names.parquet'
|
15 |
|
16 |
logger = logging.getLogger('anonymizer')
|
17 |
+
|
18 |
class NameDatabase(NameDataset):
|
19 |
'''A wrapper around the names_dataset.NameDataset class.
|
20 |
'''
|
|
|
46 |
country = NameWrapper(self.search(last_names)).country
|
47 |
return country if country else None
|
48 |
|
49 |
+
class SurrogateAnonymizer(AnonymizerEngine):
|
50 |
'''A wrapper around the presidio_anonymizer.AnonymizerEngine class.
|
51 |
'''
|
52 |
|
piilo/main.py
CHANGED
@@ -5,8 +5,8 @@ import logging
|
|
5 |
from fastapi import FastAPI
|
6 |
from fastapi.middleware.cors import CORSMiddleware
|
7 |
|
8 |
-
from engines.analyzer import
|
9 |
-
from engines.anonymizer import
|
10 |
from models.anonymize import AnonymizeRequest, AnonymizeResponse
|
11 |
|
12 |
configuration = {
|
@@ -19,8 +19,8 @@ logger = logging.getLogger('api')
|
|
19 |
logging.basicConfig(level=logging.INFO)
|
20 |
|
21 |
logger.info("Loading Custom Presidio Analyzer and Anonymizer...")
|
22 |
-
analyzer =
|
23 |
-
anonymizer =
|
24 |
logger.info("Loading Successful!")
|
25 |
|
26 |
app = FastAPI()
|
|
|
5 |
from fastapi import FastAPI
|
6 |
from fastapi.middleware.cors import CORSMiddleware
|
7 |
|
8 |
+
from engines.analyzer import CustomAnalyzer
|
9 |
+
from engines.anonymizer import SurrogateAnonymizer
|
10 |
from models.anonymize import AnonymizeRequest, AnonymizeResponse
|
11 |
|
12 |
configuration = {
|
|
|
19 |
logging.basicConfig(level=logging.INFO)
|
20 |
|
21 |
logger.info("Loading Custom Presidio Analyzer and Anonymizer...")
|
22 |
+
analyzer = CustomAnalyzer(configuration)
|
23 |
+
anonymizer = SurrogateAnonymizer()
|
24 |
logger.info("Loading Successful!")
|
25 |
|
26 |
app = FastAPI()
|