langdonholmes committed on
Commit
b97a311
·
1 Parent(s): e3f8caf

moved some functions to anonymize.py

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. anonymize.py +44 -0
  3. app.py +2 -25
  4. spacy_recognizer.py +2 -21
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
 
2
  __pycache__/spacy_recognizer.cpython-310.pyc
 
 
1
 
2
  __pycache__/spacy_recognizer.cpython-310.pyc
3
+ __pycache__/anonymize.cpython-310.pyc
anonymize.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from spacy_recognizer import CustomSpacyRecognizer
2
+ from presidio_analyzer.nlp_engine import NlpEngineProvider
3
+ from presidio_anonymizer import AnonymizerEngine
4
+ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
5
+ from presidio_anonymizer.entities import OperatorConfig
6
+ import pandas as pd
7
+ from json import JSONEncoder
8
+ import json
9
+ import warnings
10
+ import os
11
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
12
+ warnings.filterwarnings('ignore')
13
+
14
+ def prepare_analyzer(configuration):
15
+ """Return AnalyzerEngine."""
16
+
17
+ spacy_recognizer = CustomSpacyRecognizer()
18
+
19
+ print('Hallej')
20
+
21
+ # Create NLP engine based on configuration
22
+ provider = NlpEngineProvider(nlp_configuration=configuration)
23
+ nlp_engine = provider.create_engine()
24
+
25
+ # add rule-based recognizers
26
+ registry = RecognizerRegistry()
27
+ registry.load_predefined_recognizers(nlp_engine=nlp_engine)
28
+ registry.add_recognizer(spacy_recognizer)
29
+
30
+ # remove the nlp engine we passed, to use custom label mappings
31
+ registry.remove_recognizer("SpacyRecognizer")
32
+
33
+ analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
34
+ registry=registry,
35
+ supported_languages=["en"])
36
+
37
+ return analyzer
38
+
39
+ def generate_surrogate(name):
40
+ """Return appropriate surrogate name from text string"""
41
+ if "John" in name:
42
+ return "Jill"
43
+ else:
44
+ return "SURROGATE_NAME"
app.py CHANGED
@@ -1,10 +1,8 @@
1
 
2
  """Streamlit app for Student Name Detection models."""
3
 
4
- from spacy_recognizer import CustomSpacyRecognizer
5
- from presidio_analyzer.nlp_engine import NlpEngineProvider
6
  from presidio_anonymizer import AnonymizerEngine
7
- from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
8
  from presidio_anonymizer.entities import OperatorConfig
9
  import pandas as pd
10
  from annotated_text import annotated_text
@@ -21,27 +19,13 @@ warnings.filterwarnings('ignore')
21
  def analyzer_engine():
22
  """Return AnalyzerEngine."""
23
 
24
- spacy_recognizer = CustomSpacyRecognizer()
25
-
26
  configuration = {
27
  "nlp_engine_name": "spacy",
28
  "models": [
29
  {"lang_code": "en", "model_name": "en_student_name_detector"}],
30
  }
31
 
32
- # Create NLP engine based on configuration
33
- provider = NlpEngineProvider(nlp_configuration=configuration)
34
- nlp_engine = provider.create_engine()
35
-
36
- registry = RecognizerRegistry()
37
- # add rule-based recognizers
38
- registry.load_predefined_recognizers(nlp_engine=nlp_engine)
39
- registry.add_recognizer(spacy_recognizer)
40
- # remove the nlp engine we passed, to use custom label mappings
41
- registry.remove_recognizer("SpacyRecognizer")
42
-
43
- analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
44
- registry=registry, supported_languages=["en"])
45
 
46
  return analyzer
47
 
@@ -60,13 +44,6 @@ def analyze(**kwargs):
60
  kwargs["entities"] = None
61
  return analyzer_engine().analyze(**kwargs)
62
 
63
- def generate_surrogate(name):
64
- """Return appropriate surrogate name from text string"""
65
- if "John" in name:
66
- return "Jill"
67
- else:
68
- return "SURROGATE_NAME"
69
-
70
  def anonymize(text, analyze_results):
71
  """Anonymize identified input using Presidio Anonymizer."""
72
  if not text:
 
1
 
2
  """Streamlit app for Student Name Detection models."""
3
 
4
+ from anonymize import prepare_analyzer, generate_surrogate
 
5
  from presidio_anonymizer import AnonymizerEngine
 
6
  from presidio_anonymizer.entities import OperatorConfig
7
  import pandas as pd
8
  from annotated_text import annotated_text
 
19
  def analyzer_engine():
20
  """Return AnalyzerEngine."""
21
 
 
 
22
  configuration = {
23
  "nlp_engine_name": "spacy",
24
  "models": [
25
  {"lang_code": "en", "model_name": "en_student_name_detector"}],
26
  }
27
 
28
+ analyzer = prepare_analyzer(configuration)
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  return analyzer
31
 
 
44
  kwargs["entities"] = None
45
  return analyzer_engine().analyze(**kwargs)
46
 
 
 
 
 
 
 
 
47
  def anonymize(text, analyze_results):
48
  """Anonymize identified input using Presidio Anonymizer."""
49
  if not text:
spacy_recognizer.py CHANGED
@@ -11,41 +11,22 @@ from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecog
11
 
12
  logger = logging.getLogger("presidio-analyzer")
13
 
14
-
15
  class CustomSpacyRecognizer(LocalRecognizer):
16
-
17
  ENTITIES = [
18
- "LOCATION",
19
- "PERSON",
20
  "STUDENT",
21
- "NRP",
22
- "ORGANIZATION",
23
- "DATE_TIME",
24
  ]
25
 
26
- DEFAULT_EXPLANATION = "Identified as {} by Spacy's Named Entity Recognition"
27
 
28
  CHECK_LABEL_GROUPS = [
29
- ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
30
- ({"PERSON"}, {"PER", "PERSON"}),
31
  ({"STUDENT"}, {"STUDENT"}),
32
- ({"NRP"}, {"NORP", "NRP"}),
33
- ({"ORGANIZATION"}, {"ORG"}),
34
- ({"DATE_TIME"}, {"DATE_TIME"}),
35
  ]
36
 
37
  MODEL_LANGUAGES = {
38
  "en": "langdonholmes/en_student_name_detector",
39
  }
40
 
41
- PRESIDIO_EQUIVALENCES = {
42
- "PER": "PERSON",
43
- "LOC": "LOCATION",
44
- "ORG": "ORGANIZATION",
45
- "NROP": "NRP",
46
- "DATE_TIME": "DATE_TIME",
47
- }
48
-
49
  def __init__(
50
  self,
51
  supported_language: str = "en",
 
11
 
12
  logger = logging.getLogger("presidio-analyzer")
13
 
 
14
  class CustomSpacyRecognizer(LocalRecognizer):
15
+
16
  ENTITIES = [
 
 
17
  "STUDENT",
 
 
 
18
  ]
19
 
20
+ DEFAULT_EXPLANATION = "Identified as {} by a Student Name Detection Model"
21
 
22
  CHECK_LABEL_GROUPS = [
 
 
23
  ({"STUDENT"}, {"STUDENT"}),
 
 
 
24
  ]
25
 
26
  MODEL_LANGUAGES = {
27
  "en": "langdonholmes/en_student_name_detector",
28
  }
29
 
 
 
 
 
 
 
 
 
30
  def __init__(
31
  self,
32
  supported_language: str = "en",