Spaces:

langdonholmes
/

piilo

Running

App Files Files Community

langdonholmes committed on Feb 5, 2023

Commit

b97a311

1 Parent(s): e3f8caf

moved some functions to anonymize.py

Browse files

Files changed (4) hide show

.gitignore +1 -0
anonymize.py +44 -0
app.py +2 -25
spacy_recognizer.py +2 -21

.gitignore CHANGED Viewed

	@@ -1,2 +1,3 @@
1
2	__pycache__/spacy_recognizer.cpython-310.pyc


1
2	__pycache__/spacy_recognizer.cpython-310.pyc
3	+ __pycache__/anonymize.cpython-310.pyc

anonymize.py ADDED Viewed

	@@ -0,0 +1,44 @@

+from spacy_recognizer import CustomSpacyRecognizer
+from presidio_analyzer.nlp_engine import NlpEngineProvider
+from presidio_anonymizer import AnonymizerEngine
+from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+from presidio_anonymizer.entities import OperatorConfig
+import pandas as pd
+from json import JSONEncoder
+import json
+import warnings
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+warnings.filterwarnings('ignore')
def prepare_analyzer(configuration: dict) -> "AnalyzerEngine":
    """Build an ``AnalyzerEngine`` that uses ``CustomSpacyRecognizer``.

    Args:
        configuration: NLP engine configuration, passed unchanged to
            ``NlpEngineProvider(nlp_configuration=...)``.

    Returns:
        An ``AnalyzerEngine`` for English (``supported_languages=["en"]``)
        whose registry holds the predefined recognizers plus
        ``CustomSpacyRecognizer``, with the recognizer named
        ``"SpacyRecognizer"`` removed.
    """
    spacy_recognizer = CustomSpacyRecognizer()

    # Create the NLP engine described by the configuration.
    provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = provider.create_engine()

    # Load the predefined recognizers, then add the custom one.
    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
    registry.add_recognizer(spacy_recognizer)

    # Remove the recognizer registered as "SpacyRecognizer" so the custom
    # recognizer's label mappings are used instead.
    registry.remove_recognizer("SpacyRecognizer")

    return AnalyzerEngine(
        nlp_engine=nlp_engine,
        registry=registry,
        supported_languages=["en"],
    )
def generate_surrogate(name):
    """Return the surrogate to substitute for the given name string.

    Current rule: a ``name`` containing the substring ``"John"`` maps to
    ``"Jill"``; any other value maps to the literal ``"SURROGATE_NAME"``.
    """
    surrogate = "Jill" if "John" in name else "SURROGATE_NAME"
    return surrogate

app.py CHANGED Viewed

@@ -1,10 +1,8 @@
 """Streamlit app for Student Name Detection models."""
-from spacy_recognizer import CustomSpacyRecognizer
-from presidio_analyzer.nlp_engine import NlpEngineProvider
 from presidio_anonymizer import AnonymizerEngine
-from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
 from presidio_anonymizer.entities import OperatorConfig
 import pandas as pd
 from annotated_text import annotated_text
@@ -21,27 +19,13 @@ warnings.filterwarnings('ignore')
 def analyzer_engine():
     """Return AnalyzerEngine."""
-    spacy_recognizer = CustomSpacyRecognizer()
     configuration = {
         "nlp_engine_name": "spacy",
         "models": [
             {"lang_code": "en", "model_name": "en_student_name_detector"}],
     }
-    # Create NLP engine based on configuration
-    provider = NlpEngineProvider(nlp_configuration=configuration)
-    nlp_engine = provider.create_engine()
-    registry = RecognizerRegistry()
-    # add rule-based recognizers
-    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
-    registry.add_recognizer(spacy_recognizer)
-    # remove the nlp engine we passed, to use custom label mappings
-    registry.remove_recognizer("SpacyRecognizer")
-    analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
-                              registry=registry, supported_languages=["en"])
     return analyzer
@@ -60,13 +44,6 @@ def analyze(**kwargs):
         kwargs["entities"] = None
     return analyzer_engine().analyze(**kwargs)
-def generate_surrogate(name):
-    """Return appropriate surrogate name from text string"""
-    if "John" in name:
-        return "Jill"
-    else:
-        return "SURROGATE_NAME"
 def anonymize(text, analyze_results):
     """Anonymize identified input using Presidio Anonymizer."""
     if not text:

 """Streamlit app for Student Name Detection models."""
+from anonymize import prepare_analyzer, generate_surrogate
 from presidio_anonymizer import AnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig
 import pandas as pd
 from annotated_text import annotated_text
 def analyzer_engine():
     """Return AnalyzerEngine."""
     configuration = {
         "nlp_engine_name": "spacy",
         "models": [
             {"lang_code": "en", "model_name": "en_student_name_detector"}],
     }
+    analyzer = prepare_analyzer(configuration)
     return analyzer
         kwargs["entities"] = None
     return analyzer_engine().analyze(**kwargs)
 def anonymize(text, analyze_results):
     """Anonymize identified input using Presidio Anonymizer."""
     if not text:

spacy_recognizer.py CHANGED Viewed

@@ -11,41 +11,22 @@ from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecog
 logger = logging.getLogger("presidio-analyzer")
 class CustomSpacyRecognizer(LocalRecognizer):
     ENTITIES = [
-        "LOCATION",
-        "PERSON",
         "STUDENT",
-        "NRP",
-        "ORGANIZATION",
-        "DATE_TIME",
     ]
-    DEFAULT_EXPLANATION = "Identified as {} by Spacy's Named Entity Recognition"
     CHECK_LABEL_GROUPS = [
-        ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
-        ({"PERSON"}, {"PER", "PERSON"}),
         ({"STUDENT"}, {"STUDENT"}),
-        ({"NRP"}, {"NORP", "NRP"}),
-        ({"ORGANIZATION"}, {"ORG"}),
-        ({"DATE_TIME"}, {"DATE_TIME"}),
     ]
     MODEL_LANGUAGES = {
         "en": "langdonholmes/en_student_name_detector",
     }
-    PRESIDIO_EQUIVALENCES = {
-        "PER": "PERSON",
-        "LOC": "LOCATION",
-        "ORG": "ORGANIZATION",
-        "NROP": "NRP",
-        "DATE_TIME": "DATE_TIME",
-    }
     def __init__(
         self,
         supported_language: str = "en",

 logger = logging.getLogger("presidio-analyzer")
 class CustomSpacyRecognizer(LocalRecognizer):
     ENTITIES = [
         "STUDENT",
     ]
+    DEFAULT_EXPLANATION = "Identified as {} by a Student Name Detection Model"
     CHECK_LABEL_GROUPS = [
         ({"STUDENT"}, {"STUDENT"}),
     ]
     MODEL_LANGUAGES = {
         "en": "langdonholmes/en_student_name_detector",
     }
     def __init__(
         self,
         supported_language: str = "en",