Spaces:

omri374
/

presidio

Build error

App Files Files Community

omri374 committed on May 26, 2023

Commit

28a039d

1 Parent(s): 89035f8

Upload 10 files

Browse files

Files changed (10) hide show

Dockerfile +32 -0
demo_text.txt +12 -0
flair_recognizer.py +198 -0
index.md +26 -0
openai_fake_data_generator.py +55 -0
presidio_helpers.py +230 -0
presidio_nlp_engine_config.py +135 -0
presidio_streamlit.py +280 -0
requirements.txt +10 -4
text_analytics_wrapper.py +121 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,32 @@

+# Image for the Presidio Streamlit demo, served on port 7860.
+FROM python:3.9-slim
+# System packages; curl is used by the HEALTHCHECK at the bottom of this file.
+# The apt package lists are deleted in the same layer.
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    software-properties-common \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /code
+# Only requirements.txt is copied before the install step.
+COPY ./requirements.txt /code/requirements.txt
+RUN pip3 install -r requirements.txt
+# spaCy English models (small and large) installed directly from wheel URLs.
+RUN pip3 install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
+RUN pip3 install https://huggingface.co/spacy/en_core_web_lg/resolve/main/en_core_web_lg-any-py3-none-any.whl
+EXPOSE 7860
+# NOTE(review): the sources are copied again into $HOME/app below and the app
+# runs from there, so this copy into /code looks redundant - confirm.
+COPY . /code
+# Switch to a non-root user (uid 1000) for running the app.
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+	PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+COPY --chown=user . $HOME/app
+# Streamlit health endpoint on the served port.
+HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health
+CMD python -m streamlit run presidio_streamlit.py --server.port=7860 --server.address=0.0.0.0

demo_text.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+Here are a few example sentences we currently support:
+Hello, my name is David Johnson and I live in Maine.
+My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
+On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.
+My passport: 191280342 and my phone number: (212) 555-1234.
+This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
+Kate's social security number is 078-05-1126.  Her driver license? it is 1234567A.

flair_recognizer.py ADDED Viewed

	@@ -0,0 +1,198 @@

+## Taken from https://github.com/microsoft/presidio/blob/main/docs/samples/python/flair_recognizer.py
+import logging
+from typing import Optional, List, Tuple, Set
+from presidio_analyzer import (
+    RecognizerResult,
+    EntityRecognizer,
+    AnalysisExplanation,
+)
+from presidio_analyzer.nlp_engine import NlpArtifacts
+from flair.data import Sentence
+from flair.models import SequenceTagger
+logger = logging.getLogger("presidio-analyzer")
+class FlairRecognizer(EntityRecognizer):
+    """
+    Wrapper for a flair model, if needed to be used within Presidio Analyzer.
+    :example:
+    >from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+    >flair_recognizer = FlairRecognizer()
+    >registry = RecognizerRegistry()
+    >registry.add_recognizer(flair_recognizer)
+    >analyzer = AnalyzerEngine(registry=registry)
+    >results = analyzer.analyze(
+    >    "My name is Christopher and I live in Irbid.",
+    >    language="en",
+    >    return_decision_process=True,
+    >)
+    >for result in results:
+    >    print(result)
+    >    print(result.analysis_explanation)
+    """
+    ENTITIES = [
+        "LOCATION",
+        "PERSON",
+        "ORGANIZATION",
+        # "MISCELLANEOUS"   # - There are no direct correlation with Presidio entities.
+    ]
+    DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"
+    CHECK_LABEL_GROUPS = [
+        ({"LOCATION"}, {"LOC", "LOCATION"}),
+        ({"PERSON"}, {"PER", "PERSON"}),
+        ({"ORGANIZATION"}, {"ORG"}),
+        # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
+    ]
+    MODEL_LANGUAGES = {
+        "en": "flair/ner-english-large"
+    }
+    PRESIDIO_EQUIVALENCES = {
+        "PER": "PERSON",
+        "LOC": "LOCATION",
+        "ORG": "ORGANIZATION",
+        # 'MISC': 'MISCELLANEOUS'   # - Probably not PII
+    }
+    def __init__(
+        self,
+        supported_language: str = "en",
+        supported_entities: Optional[List[str]] = None,
+        check_label_groups: Optional[Tuple[Set, Set]] = None,
+        model: SequenceTagger = None,
+        model_path: Optional[str] = None
+    ):
+        self.check_label_groups = (
+            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
+        )
+        supported_entities = supported_entities if supported_entities else self.ENTITIES
+        if model and model_path:
+            raise ValueError("Only one of model or model_path should be provided.")
+        elif model and not model_path:
+            self.model = model
+        elif not model and model_path:
+            print(f"Loading model from {model_path}")
+            self.model = SequenceTagger.load(model_path)
+        else:
+            print(f"Loading model for language {supported_language}")
+            self.model = SequenceTagger.load(self.MODEL_LANGUAGES.get(supported_language))
+        super().__init__(
+            supported_entities=supported_entities,
+            supported_language=supported_language,
+            name="Flair Analytics",
+        )
+    def load(self) -> None:
+        """Load the model, not used. Model is loaded during initialization."""
+        pass
+    def get_supported_entities(self) -> List[str]:
+        """
+        Return supported entities by this model.
+        :return: List of the supported entities.
+        """
+        return self.supported_entities
+    # Class to use Flair with Presidio as an external recognizer.
+    def analyze(
+        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None
+    ) -> List[RecognizerResult]:
+        """
+        Analyze text using Text Analytics.
+        :param text: The text for analysis.
+        :param entities: Not working properly for this recognizer.
+        :param nlp_artifacts: Not used by this recognizer.
+        :param language: Text language. Supported languages in MODEL_LANGUAGES
+        :return: The list of Presidio RecognizerResult constructed from the recognized
+            Flair detections.
+        """
+        results = []
+        sentences = Sentence(text)
+        self.model.predict(sentences)
+        # If there are no specific list of entities, we will look for all of it.
+        if not entities:
+            entities = self.supported_entities
+        for entity in entities:
+            if entity not in self.supported_entities:
+                continue
+            for ent in sentences.get_spans("ner"):
+                if not self.__check_label(
+                    entity, ent.labels[0].value, self.check_label_groups
+                ):
+                    continue
+                textual_explanation = self.DEFAULT_EXPLANATION.format(
+                    ent.labels[0].value
+                )
+                explanation = self.build_flair_explanation(
+                    round(ent.score, 2), textual_explanation
+                )
+                flair_result = self._convert_to_recognizer_result(ent, explanation)
+                results.append(flair_result)
+        return results
+    def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
+        entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
+        flair_score = round(entity.score, 2)
+        flair_results = RecognizerResult(
+            entity_type=entity_type,
+            start=entity.start_position,
+            end=entity.end_position,
+            score=flair_score,
+            analysis_explanation=explanation,
+        )
+        return flair_results
+    def build_flair_explanation(
+        self, original_score: float, explanation: str
+    ) -> AnalysisExplanation:
+        """
+        Create explanation for why this result was detected.
+        :param original_score: Score given by this recognizer
+        :param explanation: Explanation string
+        :return:
+        """
+        explanation = AnalysisExplanation(
+            recognizer=self.__class__.__name__,
+            original_score=original_score,
+            textual_explanation=explanation,
+        )
+        return explanation
+    @staticmethod
+    def __check_label(
+        entity: str, label: str, check_label_groups: Tuple[Set, Set]
+    ) -> bool:
+        return any(
+            [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
+        )

index.md ADDED Viewed

	@@ -0,0 +1,26 @@

+# Simple demo website for Presidio
+Here's a simple app, written in pure Python, to create a demo website for Presidio.
+The app is based on the [streamlit](https://streamlit.io/) package.
+A live version can be found here: https://huggingface.co/spaces/presidio/presidio_demo
+## Requirements
+1. Clone the repo and move to the `docs/samples/python/streamlit` folder
+2. Install dependencies (preferably in a virtual environment)
+```sh
+pip install -r requirements.txt
+```
+> Note: This would install additional packages such as `transformers` and `flair` which are not mandatory for using Presidio.
+3. *Optional*: Update the `analyzer_engine` and `anonymizer_engine` functions for your specific implementation (in `presidio_helpers.py`).
+4. Start the app:
+```sh
+streamlit run presidio_streamlit.py
+```
+## Output
+Output should be similar to this screenshot:
+![image](https://user-images.githubusercontent.com/3776619/232289541-d59992e1-52a4-44c1-b904-b22c72c02a5b.png)

openai_fake_data_generator.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import openai
+def set_openai_key(openai_key: str):
+    """Set the OpenAI API key.
+    :param openai_key: the open AI key (https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key)
+    """
+    openai.api_key = openai_key
+def call_completion_model(
+    prompt: str, model: str = "text-davinci-003", max_tokens: int = 512
+) -> str:
+    """Creates a request for the OpenAI Completion service and returns the response.
+    :param prompt: The prompt for the completion model
+    :param model: OpenAI model name
+    :param max_tokens: Model's max_tokens parameter
+    """
+    response = openai.Completion.create(
+        model=model, prompt=prompt, max_tokens=max_tokens
+    )
+    return response["choices"][0].text
+def create_prompt(anonymized_text: str) -> str:
+    """
+    Create the prompt with instructions to GPT-3.
+    :param anonymized_text: Text with placeholders instead of PII values, e.g. My name is <PERSON>.
+    """
+    prompt = f"""
+    Your role is to create synthetic text based on de-identified text with placeholders instead of Personally Identifiable Information (PII).
+    Replace the placeholders (e.g. ,<PERSON>, {{DATE}}, {{ip_address}}) with fake values.
+    Instructions:
+    a. Use completely random numbers, so every digit is drawn between 0 and 9.
+    b. Use realistic names that come from diverse genders, ethnicities and countries.
+    c. If there are no placeholders, return the text as is and provide an answer.
+    d. Keep the formatting as close to the original as possible.
+    e. If PII exists in the input, replace it with fake values in the output.
+    input: How do I change the limit on my credit card {{credit_card_number}}?
+    output: How do I change the limit on my credit card 2539 3519 2345 1555?
+    input: <PERSON> was the chief science officer at <ORGANIZATION>.
+    output: Katherine Buckjov was the chief science officer at NASA.
+    input: Cameroon lives in <LOCATION>.
+    output: Vladimir lives in Moscow.
+    input: {anonymized_text}
+    output:
+    """
+    return prompt

presidio_helpers.py ADDED Viewed

	@@ -0,0 +1,230 @@

+"""
+Helper methods for the Presidio Streamlit app
+"""
+from typing import List, Optional, Tuple
+import streamlit as st
+from presidio_analyzer import (
+    AnalyzerEngine,
+    RecognizerResult,
+    RecognizerRegistry,
+    PatternRecognizer,
+)
+from presidio_analyzer.nlp_engine import NlpEngine
+from presidio_anonymizer import AnonymizerEngine
+from presidio_anonymizer.entities import OperatorConfig
+from openai_fake_data_generator import (
+    set_openai_key,
+    call_completion_model,
+    create_prompt,
+)
+from presidio_nlp_engine_config import (
+    create_nlp_engine_with_spacy,
+    create_nlp_engine_with_flair,
+    create_nlp_engine_with_transformers,
+    create_nlp_engine_with_azure_text_analytics,
+)
+@st.cache_resource
+def nlp_engine_and_registry(
+    model_family: str,
+    model_path: str,
+    ta_key: Optional[str] = None,
+    ta_endpoint: Optional[str] = None,
+) -> Tuple[NlpEngine, RecognizerRegistry]:
+    """Create the NLP Engine instance based on the requested model.
+    :param model_family: Which model package to use for NER.
+    :param model_path: Which model to use for NER. E.g.,
+        "StanfordAIMI/stanford-deidentifier-base",
+        "obi/deid_roberta_i2b2",
+        "en_core_web_lg"
+    :param ta_key: Key to the Text Analytics endpoint (only if model_path = "Azure Text Analytics")
+    :param ta_endpoint: Endpoint of the Text Analytics instance (only if model_path = "Azure Text Analytics")
+    """
+    # Set up NLP Engine according to the model of choice
+    if "spaCy" in model_family:
+        return create_nlp_engine_with_spacy(model_path)
+    elif "flair" in model_family:
+        return create_nlp_engine_with_flair(model_path)
+    elif "HuggingFace" in model_family:
+        return create_nlp_engine_with_transformers(model_path)
+    elif "Azure Text Analytics" in model_family:
+        return create_nlp_engine_with_azure_text_analytics(ta_key, ta_endpoint)
+    else:
+        raise ValueError(f"Model family {model_family} not supported")
+@st.cache_resource
+def analyzer_engine(
+    model_family: str,
+    model_path: str,
+    ta_key: Optional[str] = None,
+    ta_endpoint: Optional[str] = None,
+) -> AnalyzerEngine:
+    """Create the NLP Engine instance based on the requested model.
+    :param model_family: Which model package to use for NER.
+    :param model_path: Which model to use for NER:
+        "StanfordAIMI/stanford-deidentifier-base",
+        "obi/deid_roberta_i2b2",
+        "en_core_web_lg"
+    :param ta_key: Key to the Text Analytics endpoint (only if model_path = "Azure Text Analytics")
+    :param ta_endpoint: Endpoint of the Text Analytics instance (only if model_path = "Azure Text Analytics")
+    """
+    nlp_engine, registry = nlp_engine_and_registry(
+        model_family, model_path, ta_key, ta_endpoint
+    )
+    analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
+    return analyzer
+@st.cache_resource
+def anonymizer_engine():
+    """Return AnonymizerEngine."""
+    return AnonymizerEngine()
+@st.cache_data
+def get_supported_entities(
+    model_family: str, model_path: str, ta_key: str, ta_endpoint: str
+):
+    """Return supported entities from the Analyzer Engine."""
+    return analyzer_engine(
+        model_family, model_path, ta_key, ta_endpoint
+    ).get_supported_entities() + ["GENERIC_PII"]
+@st.cache_data
+def analyze(
+    model_family: str, model_path: str, ta_key: str, ta_endpoint: str, **kwargs
+):
+    """Analyze input using Analyzer engine and input arguments (kwargs)."""
+    if "entities" not in kwargs or "All" in kwargs["entities"]:
+        kwargs["entities"] = None
+    if "deny_list" in kwargs and kwargs["deny_list"] is not None:
+        ad_hoc_recognizer = create_ad_hoc_deny_list_recognizer(kwargs["deny_list"])
+        kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
+        del kwargs["deny_list"]
+    return analyzer_engine(model_family, model_path, ta_key, ta_endpoint).analyze(
+        **kwargs
+    )
+def anonymize(
+    text: str,
+    operator: str,
+    analyze_results: List[RecognizerResult],
+    mask_char: Optional[str] = None,
+    number_of_chars: Optional[str] = None,
+    encrypt_key: Optional[str] = None,
+):
+    """Anonymize identified input using Presidio Anonymizer.
+    :param text: Full text
+    :param operator: Operator name
+    :param mask_char: Mask char (for mask operator)
+    :param number_of_chars: Number of characters to mask (for mask operator)
+    :param encrypt_key: Encryption key (for encrypt operator)
+    :param analyze_results: list of results from presidio analyzer engine
+    """
+    if operator == "mask":
+        operator_config = {
+            "type": "mask",
+            "masking_char": mask_char,
+            "chars_to_mask": number_of_chars,
+            "from_end": False,
+        }
+    # Define operator config
+    elif operator == "encrypt":
+        operator_config = {"key": encrypt_key}
+    elif operator == "highlight":
+        operator_config = {"lambda": lambda x: x}
+    else:
+        operator_config = None
+    # Change operator if needed as intermediate step
+    if operator == "highlight":
+        operator = "custom"
+    elif operator == "synthesize":
+        operator = "replace"
+    else:
+        operator = operator
+    res = anonymizer_engine().anonymize(
+        text,
+        analyze_results,
+        operators={"DEFAULT": OperatorConfig(operator, operator_config)},
+    )
+    return res
+def annotate(text: str, analyze_results: List[RecognizerResult]):
+    """Highlight the identified PII entities on the original text
+    :param text: Full text
+    :param analyze_results: list of results from presidio analyzer engine
+    """
+    tokens = []
+    # Use the anonymizer to resolve overlaps
+    results = anonymize(
+        text=text,
+        operator="highlight",
+        analyze_results=analyze_results,
+    )
+    # sort by start index
+    results = sorted(results.items, key=lambda x: x.start)
+    for i, res in enumerate(results):
+        if i == 0:
+            tokens.append(text[: res.start])
+        # append entity text and entity type
+        tokens.append((text[res.start : res.end], res.entity_type))
+        # if another entity coming i.e. we're not at the last results element, add text up to next entity
+        if i != len(results) - 1:
+            tokens.append(text[res.end : results[i + 1].start])
+        # if no more entities coming, add all remaining text
+        else:
+            tokens.append(text[res.end :])
+    return tokens
+def create_fake_data(
+    text: str,
+    analyze_results: List[RecognizerResult],
+    openai_key: str,
+    openai_model_name: str,
+):
+    """Creates a synthetic version of the text using OpenAI APIs"""
+    if not openai_key:
+        return "Please provide your OpenAI key"
+    results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
+    set_openai_key(openai_key)
+    prompt = create_prompt(results.text)
+    fake = call_openai_api(prompt, openai_model_name)
+    return fake
+@st.cache_data
+def call_openai_api(prompt: str, openai_model_name: str) -> str:
+    fake_data = call_completion_model(prompt, model=openai_model_name)
+    return fake_data
+def create_ad_hoc_deny_list_recognizer(
+    deny_list=Optional[List[str]],
+) -> Optional[PatternRecognizer]:
+    if not deny_list:
+        return None
+    deny_list_recognizer = PatternRecognizer(supported_entity="GENERIC_PII", deny_list=deny_list)
+    print(deny_list_recognizer.patterns)
+    return deny_list_recognizer

presidio_nlp_engine_config.py ADDED Viewed

	@@ -0,0 +1,135 @@

+from typing import Tuple
+import spacy
+from presidio_analyzer import RecognizerRegistry
+from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
+def create_nlp_engine_with_spacy(
+    model_path: str,
+) -> Tuple[NlpEngine, RecognizerRegistry]:
+    """
+    Instantiate an NlpEngine with a spaCy model
+    :param model_path: spaCy model path.
+    """
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers()
+    if not spacy.util.is_package(model_path):
+        spacy.cli.download(model_path)
+    nlp_configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [{"lang_code": "en", "model_name": model_path}],
+    }
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    return nlp_engine, registry
+def create_nlp_engine_with_transformers(
+    model_path: str,
+) -> Tuple[NlpEngine, RecognizerRegistry]:
+    """
+    Instantiate an NlpEngine with a TransformersRecognizer and a small spaCy model.
+    The TransformersRecognizer would return results from Transformers models, the spaCy model
+    would return NlpArtifacts such as POS and lemmas.
+    :param model_path: HuggingFace model path.
+    """
+    from transformers_rec import (
+        STANFORD_COFIGURATION,
+        BERT_DEID_CONFIGURATION,
+        TransformersRecognizer,
+    )
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers()
+    if not spacy.util.is_package("en_core_web_sm"):
+        spacy.cli.download("en_core_web_sm")
+    # Using a small spaCy model + a HF NER model
+    transformers_recognizer = TransformersRecognizer(model_path=model_path)
+    if model_path == "StanfordAIMI/stanford-deidentifier-base":
+        transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
+    elif model_path == "obi/deid_roberta_i2b2":
+        transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
+    else:
+        print(f"Warning: Model has no configuration, loading default.")
+        transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
+    # Use small spaCy model, no need for both spacy and HF models
+    # The transformers model is used here as a recognizer, not as an NlpEngine
+    nlp_configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+    }
+    registry.add_recognizer(transformers_recognizer)
+    registry.remove_recognizer("SpacyRecognizer")
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    return nlp_engine, registry
+def create_nlp_engine_with_flair(
+    model_path: str,
+) -> Tuple[NlpEngine, RecognizerRegistry]:
+    """
+    Instantiate an NlpEngine with a FlairRecognizer and a small spaCy model.
+    The FlairRecognizer would return results from Flair models, the spaCy model
+    would return NlpArtifacts such as POS and lemmas.
+    :param model_path: Flair model path.
+    """
+    from flair_recognizer import FlairRecognizer
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers()
+    if not spacy.util.is_package("en_core_web_sm"):
+        spacy.cli.download("en_core_web_sm")
+    # Using a small spaCy model + a Flair NER model
+    flair_recognizer = FlairRecognizer(model_path=model_path)
+    nlp_configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+    }
+    registry.add_recognizer(flair_recognizer)
+    registry.remove_recognizer("SpacyRecognizer")
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    return nlp_engine, registry
+def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
+    """
+    Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
+    The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
+    would return NlpArtifacts such as POS and lemmas.
+    :param ta_key: Azure Text Analytics key.
+    :param ta_endpoint: Azure Text Analytics endpoint.
+    """
+    from text_analytics_wrapper import TextAnalyticsWrapper
+    if not ta_key or not ta_endpoint:
+        raise RuntimeError("Please fill in the Text Analytics endpoint details")
+    registry = RecognizerRegistry()
+    registry.load_predefined_recognizers()
+    ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key)
+    nlp_configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
+    }
+    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+    registry.add_recognizer(ta_recognizer)
+    registry.remove_recognizer("SpacyRecognizer")
+    return nlp_engine, registry

presidio_streamlit.py ADDED Viewed

	@@ -0,0 +1,280 @@

+"""Streamlit app for Presidio."""
+import os
+import pandas as pd
+import streamlit as st
+import streamlit.components.v1 as components
+from annotated_text import annotated_text
+from streamlit_tags import st_tags
+from presidio_helpers import (
+    get_supported_entities,
+    analyze,
+    anonymize,
+    annotate,
+    create_fake_data,
+    analyzer_engine,
+    nlp_engine_and_registry,
+)
+# Page title and wide layout for the whole app.
+st.set_page_config(page_title="Presidio demo", layout="wide")
+# Sidebar
+st.sidebar.header(
+    """
+PII De-Identification with Microsoft Presidio
+"""
+)
+# Short project description with links to the code and documentation.
+st.sidebar.info(
+    "Presidio is an open source customizable framework for PII detection and de-identification\n"
+    "[Code](https://aka.ms/presidio) | "
+    "[Tutorial](https://microsoft.github.io/presidio/tutorial/) | "
+    "[Installation](https://microsoft.github.io/presidio/installation/) | "
+    "[FAQ](https://microsoft.github.io/presidio/faq/)",
+    icon="ℹ️",
+)
+# Badges: PyPI downloads, license and GitHub stars.
+st.sidebar.markdown(
+    "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)"  # noqa
+    "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)"
+    "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)"
+)
+model_help_text = """
+    Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers.
+    Presidio supports multiple NER packages off-the-shelf, such as spaCy, Huggingface, Stanza and Flair,
+    as well as service such as Azure Text Analytics PII.
+    """
+st_ta_key = st_ta_endpoint = ""
+st_model = "en_core_web_lg"
+st_model_package = st.sidebar.selectbox(
+    "NER model package",
+    ["spaCy", "flair", "HuggingFace", "Azure Text Analytics"],
+    index=2,
+    help="Select the NLP package to use for PII detection",
+)
+if st_model_package == "spaCy":
+    st_model = st.sidebar.selectbox(
+        "NER model for PII detection",
+        ["en_core_web_lg", "en_core_web_trf", "Other"],
+        help=model_help_text,
+    )
+elif st_model_package == "HuggingFace":
+    st_model = st.sidebar.selectbox(
+        "NER model for PII detection",
+        ["obi/deid_roberta_i2b2", "StanfordAIMI/stanford-deidentifier-base", "Other"],
+        help=model_help_text,
+    )
+elif st_model_package == "flair":
+    st_model = st.sidebar.selectbox(
+        "NER model for PII detection",
+        ["flair/ner-english-large", "Other"],
+        help=model_help_text,
+    )
+elif st_model_package == "Azure Text Analytics":
+    st_model = st.sidebar.selectbox(
+        "NER model for PII detection",
+        ["Azure Text Analytics PII"],
+        help=model_help_text,
+    )
+    st_ta_key = st.sidebar.text_input("Text Analytics Key", type="password")
+    st_ta_endpoint = st.sidebar.text_input("Text Analytics Endpoint")
+if st_model == "Other":
+    st_model = st.sidebar.text_input(
+        f"NER model name for package {st_model_package}", value=""
+    )
+st.sidebar.warning("Note: Models might take some time to download. ")
+analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint)
+# De-identification operator; "replace" (index 1) is the initial selection.
+st_operator = st.sidebar.selectbox(
+    "De-identification approach",
+    ["redact", "replace", "synthesize", "highlight", "mask", "hash", "encrypt"],
+    index=1,
+    help="""
+    Select which manipulation to the text is requested after PII has been identified.\n
+    - Redact: Completely remove the PII text\n
+    - Replace: Replace the PII text with a constant, e.g. <PERSON>\n
+    - Synthesize: Replace with fake values (requires an OpenAI key)\n
+    - Highlight: Shows the original text with PII highlighted in colors\n
+    - Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n
+    - Hash: Replaces with the hash of the PII string\n
+    - Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
+         """,
+)
+# Defaults for the operator-specific settings; the widgets below override
+# them only for the selected operator.
+st_mask_char = "*"
+st_number_of_chars = 15
+# NOTE(review): hard-coded default AES key, pre-filled in the UI - presumably
+# for demo purposes only; confirm it is never used for real data.
+st_encrypt_key = "WmZq4t7w!z%C&F)J"
+st_openai_key = ""
+st_openai_model = "text-davinci-003"
+if st_operator == "mask":
+    st_number_of_chars = st.sidebar.number_input(
+        "number of chars", value=st_number_of_chars, min_value=0, max_value=100
+    )
+    st_mask_char = st.sidebar.text_input(
+        "Mask character", value=st_mask_char, max_chars=1
+    )
+elif st_operator == "encrypt":
+    st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
+elif st_operator == "synthesize":
+    # The key field is pre-filled from the OPENAI_KEY environment variable when set.
+    st_openai_key = st.sidebar.text_input(
+        "OPENAI_KEY",
+        value=os.getenv("OPENAI_KEY", default=""),
+        help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.",
+        type="password",
+    )
+    st_openai_model = st.sidebar.text_input(
+        "OpenAI model for text synthesis",
+        value=st_openai_model,
+        help="See more here: https://platform.openai.com/docs/models/",
+    )
+# Score threshold passed to the analyzer as score_threshold.
+# NOTE(review): the help text ends with "See more here: " and no link - confirm the intended URL.
+st_threshold = st.sidebar.slider(
+    label="Acceptance threshold",
+    min_value=0.0,
+    max_value=1.0,
+    value=0.35,
+    help="Define the threshold for accepting a detection as PII. See more here: ",
+)
+# When checked, explanations are requested from the analyzer and added to the findings table.
+st_return_decision_process = st.sidebar.checkbox(
+    "Add analysis explanations to findings",
+    value=False,
+    help="Add the decision process to the output table. "
+    "More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/",
+)
+# Allow and deny lists
+st_deny_allow_expander = st.sidebar.expander(
+    "Allow and deny lists",
+    expanded=False,
+)
+with st_deny_allow_expander:
+    st_allow_list = st_tags(label="Add words to the allow list", text="Enter word and press enter.")
+    st.caption('Allow lists contain words that are not considered PII, but are detected as such.')
+    st_deny_list = st_tags(label="Add words to the deny list", text="Enter word and press enter.")
+    st.caption("Deny lists contain words that are considered PII, but are not detected as such.")
+# Main panel
+analyzer_load_state = st.info("Starting Presidio analyzer...")
+nlp_engine, registry = nlp_engine_and_registry(*analyzer_params)
+analyzer = analyzer_engine(*analyzer_params)
+analyzer_load_state.empty()
+# Choose entities
+st_entities_expander = st.sidebar.expander("Choose entities to look for")
+st_entities = st_entities_expander.multiselect(
+    label="Which entities to look for?",
+    options=get_supported_entities(*analyzer_params),
+    default=list(get_supported_entities(*analyzer_params)),
+    help="Limit the list of PII entities detected. "
+    "This list is dynamic and based on the NER model and registered recognizers. "
+    "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/",
+)
+# Read default text
+with open("demo_text.txt") as f:
+    demo_text = f.readlines()
+# Create two columns for before and after
+col1, col2 = st.columns(2)
+# Before:
+col1.subheader("Input string:")
+st_text = col1.text_area(
+    label="Enter text",
+    value="".join(demo_text),
+    height=400,
+)
+st_analyze_results = analyze(
+    *analyzer_params,
+    text=st_text,
+    entities=st_entities,
+    language="en",
+    score_threshold=st_threshold,
+    return_decision_process=st_return_decision_process,
+    allow_list=st_allow_list,
+    deny_list=st_deny_list,
+)
+# After
+if st_operator not in ("highlight", "synthesize"):
+    with col2:
+        st.subheader(f"Output")
+        st_anonymize_results = anonymize(
+            text=st_text,
+            operator=st_operator,
+            mask_char=st_mask_char,
+            number_of_chars=st_number_of_chars,
+            encrypt_key=st_encrypt_key,
+            analyze_results=st_analyze_results,
+        )
+        st.text_area(label="De-identified", value=st_anonymize_results.text, height=400)
+elif st_operator == "synthesize":
+    with col2:
+        st.subheader(f"OpenAI Generated output")
+        fake_data = create_fake_data(
+            st_text,
+            st_analyze_results,
+            openai_key=st_openai_key,
+            openai_model_name=st_openai_model,
+        )
+        st.text_area(label="Synthetic data", value=fake_data, height=400)
+else:
+    st.subheader("Highlighted")
+    annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results)
+    # annotated_tokens
+    annotated_text(*annotated_tokens)
+# table result
+st.subheader(
+    "Findings" if not st_return_decision_process else "Findings with decision factors"
+)
+if st_analyze_results:
+    df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
+    df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
+    df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
+        {
+            "entity_type": "Entity type",
+            "text": "Text",
+            "start": "Start",
+            "end": "End",
+            "score": "Confidence",
+        },
+        axis=1,
+    )
+    df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results]
+    if st_return_decision_process:
+        analysis_explanation_df = pd.DataFrame.from_records(
+            [r.analysis_explanation.to_dict() for r in st_analyze_results]
+        )
+        df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
+    st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
+else:
+    st.text("No findings")
+# Embed an HTML component whose script loads a tag from https://www.clarity.ms/tag/<id>.
+# NOTE(review): presumably Microsoft Clarity usage analytics - confirm this is
+# intended for every deployment of the demo.
+components.html(
+    """
+    <script type="text/javascript">
+    (function(c,l,a,r,i,t,y){
+        c[a]=c[a]||function(){(c[a].q=c[a].q||[]).push(arguments)};
+        t=l.createElement(r);t.async=1;t.src="https://www.clarity.ms/tag/"+i;
+        y=l.getElementsByTagName(r)[0];y.parentNode.insertBefore(t,y);
+    })(window, document, "clarity", "script", "h7f8bp42n8");
+    </script>
+    """
+)

requirements.txt CHANGED Viewed

@@ -1,6 +1,12 @@
-pandas
-streamlit
-presidio-anonymizer
 presidio-analyzer
 torch
-transformers

 presidio-analyzer
+presidio-anonymizer
+streamlit
+streamlit-tags
+pandas
+st-annotated-text
 torch
+transformers
+flair
+openai
+spacy
+azure-ai-textanalytics

text_analytics_wrapper.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import os
+from typing import List, Optional
+import dotenv
+from azure.ai.textanalytics import TextAnalyticsClient
+from azure.core.credentials import AzureKeyCredential
+from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
+from presidio_analyzer.nlp_engine import NlpArtifacts
+class TextAnalyticsWrapper(EntityRecognizer):
+    from azure.ai.textanalytics._models import PiiEntityCategory
+    TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]
+    def __init__(
+        self,
+        supported_entities: Optional[List[str]] = None,
+        supported_language: str = "en",
+        ta_client: Optional[TextAnalyticsClient] = None,
+        ta_key: Optional[str] = None,
+        ta_endpoint: Optional[str] = None,
+    ):
+        """
+        Wrapper for the Azure Text Analytics client
+        :param ta_client: object of type TextAnalyticsClient
+        :param ta_key: Azure cognitive Services for Language key
+        :param ta_endpoint: Azure cognitive Services for Language endpoint
+        """
+        if not supported_entities:
+            supported_entities = self.TA_SUPPORTED_ENTITIES
+        super().__init__(
+            supported_entities=supported_entities,
+            supported_language=supported_language,
+            name="Azure Text Analytics PII",
+        )
+        self.ta_key = ta_key
+        self.ta_endpoint = ta_endpoint
+        if not ta_client:
+            ta_client = self.__authenticate_client(ta_key, ta_endpoint)
+        self.ta_client = ta_client
+    @staticmethod
+    def __authenticate_client(key: str, endpoint: str):
+        ta_credential = AzureKeyCredential(key)
+        text_analytics_client = TextAnalyticsClient(
+            endpoint=endpoint, credential=ta_credential
+        )
+        return text_analytics_client
+    def analyze(
+        self, text: str, entities: List[str] = None, nlp_artifacts: NlpArtifacts = None
+    ) -> List[RecognizerResult]:
+        if not entities:
+            entities = []
+        response = self.ta_client.recognize_pii_entities(
+            [text], language=self.supported_language
+        )
+        results = [doc for doc in response if not doc.is_error]
+        recognizer_results = []
+        for res in results:
+            for entity in res.entities:
+                if entity.category not in self.supported_entities:
+                    continue
+                analysis_explanation = TextAnalyticsWrapper._build_explanation(
+                    original_score=entity.confidence_score,
+                    entity_type=entity.category,
+                )
+                recognizer_results.append(
+                    RecognizerResult(
+                        entity_type=entity.category,
+                        start=entity.offset,
+                        end=entity.offset + len(entity.text),
+                        score=entity.confidence_score,
+                        analysis_explanation=analysis_explanation,
+                    )
+                )
+        return recognizer_results
+    @staticmethod
+    def _build_explanation(
+        original_score: float, entity_type: str
+    ) -> AnalysisExplanation:
+        explanation = AnalysisExplanation(
+            recognizer=TextAnalyticsWrapper.__class__.__name__,
+            original_score=original_score,
+            textual_explanation=f"Identified as {entity_type} by Text Analytics",
+        )
+        return explanation
+    def load(self) -> None:
+        pass
+if __name__ == "__main__":
+    import presidio_helpers
+    dotenv.load_dotenv()
+    text = """
+    Here are a few example sentences we currently support:
+    Hello, my name is David Johnson and I live in Maine.
+    My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
+    On September 18 I visited microsoft.com and sent an email to [email protected],  from the IP 192.168.0.1.
+    My passport: 191280342 and my phone number: (212) 555-1234.
+    This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
+    Kate's social security number is 078-05-1126.  Her driver license? it is 1234567A.
+    """
+    analyzer = presidio_helpers.analyzer_engine(
+        model_path="Azure Text Analytics PII",
+        ta_key=os.environ["TA_KEY"],
+        ta_endpoint=os.environ["TA_ENDPOINT"],
+    )
+    analyzer.analyze(text=text, language="en")