omri374 committed on
Commit
41e004f
·
1 Parent(s): b75d1f1

Upload 12 files

Browse files
azure_ai_language_wrapper.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List, Optional
3
+ import logging
4
+ import dotenv
5
+ from azure.ai.textanalytics import TextAnalyticsClient
6
+ from azure.core.credentials import AzureKeyCredential
7
+
8
+ from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
9
+ from presidio_analyzer.nlp_engine import NlpArtifacts
10
+
11
+ logger = logging.getLogger("presidio-streamlit")
12
+
13
+
14
class AzureAIServiceWrapper(EntityRecognizer):
    """
    Presidio recognizer backed by the Azure AI Language PII detection service.

    Each call to ``analyze`` sends the text to the service through a
    ``TextAnalyticsClient`` and converts the returned PII entities into
    ``RecognizerResult`` objects.
    """

    # Kept as a class-body import (so the enum stays reachable as a class
    # attribute), but taken from the public package namespace rather than the
    # private ``_models`` module.
    from azure.ai.textanalytics import PiiEntityCategory

    # Values of every PiiEntityCategory member; used as the default set of
    # supported entities when the caller does not provide one.
    TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]

    def __init__(
        self,
        supported_entities: Optional[List[str]] = None,
        supported_language: str = "en",
        ta_client: Optional[TextAnalyticsClient] = None,
        ta_key: Optional[str] = None,
        ta_endpoint: Optional[str] = None,
    ):
        """
        Wrapper for the Azure Text Analytics client

        :param supported_entities: entity categories this recognizer reports;
            falls back to ``TA_SUPPORTED_ENTITIES`` when None or empty
        :param supported_language: language code passed to the service
        :param ta_client: object of type TextAnalyticsClient; when omitted,
            a client is created from ``ta_key`` and ``ta_endpoint``
        :param ta_key: Azure cognitive Services for Language key
        :param ta_endpoint: Azure cognitive Services for Language endpoint
        """

        if not supported_entities:
            supported_entities = self.TA_SUPPORTED_ENTITIES

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Azure AI Language PII",
        )

        self.ta_key = ta_key
        self.ta_endpoint = ta_endpoint

        # Only build a client when the caller did not inject one.
        if not ta_client:
            ta_client = self.__authenticate_client(ta_key, ta_endpoint)
        self.ta_client = ta_client

    @staticmethod
    def __authenticate_client(key: str, endpoint: str):
        """
        Create a TextAnalyticsClient using key-based authentication.

        :param key: service key wrapped in an ``AzureKeyCredential``
        :param endpoint: service endpoint URL
        :return: the new ``TextAnalyticsClient``
        """
        ta_credential = AzureKeyCredential(key)
        text_analytics_client = TextAnalyticsClient(
            endpoint=endpoint, credential=ta_credential
        )
        return text_analytics_client

    def analyze(
        self, text: str, entities: List[str] = None, nlp_artifacts: NlpArtifacts = None
    ) -> List[RecognizerResult]:
        """
        Detect PII in ``text`` by calling the Azure service.

        :param text: text to analyze; sent as a single-document batch
        :param entities: accepted for interface compatibility; results are
            filtered by ``self.supported_entities``, not by this argument
        :param nlp_artifacts: accepted for interface compatibility; not used
        :return: one ``RecognizerResult`` per returned entity whose category
            is in ``self.supported_entities``
        """
        response = self.ta_client.recognize_pii_entities(
            [text], language=self.supported_language
        )

        recognizer_results = []
        for doc in response:
            if doc.is_error:
                # Errored documents carry no entities. Still skipped, but no
                # longer silently, so service failures are diagnosable.
                logger.warning(
                    "Azure AI Language returned an error for the document: %s",
                    doc.error,
                )
                continue
            for entity in doc.entities:
                if entity.category not in self.supported_entities:
                    continue
                analysis_explanation = AzureAIServiceWrapper._build_explanation(
                    original_score=entity.confidence_score,
                    entity_type=entity.category,
                )
                recognizer_results.append(
                    RecognizerResult(
                        entity_type=entity.category,
                        start=entity.offset,
                        end=entity.offset + len(entity.text),
                        score=entity.confidence_score,
                        analysis_explanation=analysis_explanation,
                    )
                )

        return recognizer_results

    @staticmethod
    def _build_explanation(
        original_score: float, entity_type: str
    ) -> AnalysisExplanation:
        """
        Build the explanation object attached to each result.

        :param original_score: confidence score reported by the service
        :param entity_type: entity category reported by the service
        :return: ``AnalysisExplanation`` naming this recognizer class
        """
        explanation = AnalysisExplanation(
            # ``__class__.__name__`` on the class object would yield the
            # metaclass name; ``__name__`` is the recognizer's own name.
            recognizer=AzureAIServiceWrapper.__name__,
            original_score=original_score,
            textual_explanation=f"Identified as {entity_type} by Text Analytics",
        )
        return explanation

    def load(self) -> None:
        """No-op: nothing is loaded locally, detection happens in the service."""
        pass
101
+
102
+
103
if __name__ == "__main__":
    # Manual smoke run: requires TA_KEY and TA_ENDPOINT in the environment
    # (or in a local .env file) and network access to the Azure service.
    import presidio_helpers

    dotenv.load_dotenv()
    text = """
    Here are a few example sentences we currently support:

    Hello, my name is David Johnson and I live in Maine.
    My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.

    On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.

    My passport: 191280342 and my phone number: (212) 555-1234.

    This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?

    Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
    """
    # Positional order (model family, model path, key, endpoint) mirrors how
    # the helpers are called elsewhere in the sample; the family must contain
    # "Azure AI Language" for the Azure recognizer to be selected.
    analyzer = presidio_helpers.analyzer_engine(
        "Azure AI Language",
        "Azure AI Language",
        os.environ["TA_KEY"],
        os.environ["TA_ENDPOINT"],
    )
    analyzer.analyze(text=text, language="en")
flair_recognizer.py CHANGED
@@ -59,9 +59,7 @@ class FlairRecognizer(EntityRecognizer):
59
  # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
60
  ]
61
 
62
- MODEL_LANGUAGES = {
63
- "en": "flair/ner-english-large"
64
- }
65
 
66
  PRESIDIO_EQUIVALENCES = {
67
  "PER": "PERSON",
@@ -76,7 +74,7 @@ class FlairRecognizer(EntityRecognizer):
76
  supported_entities: Optional[List[str]] = None,
77
  check_label_groups: Optional[Tuple[Set, Set]] = None,
78
  model: SequenceTagger = None,
79
- model_path: Optional[str] = None
80
  ):
81
  self.check_label_groups = (
82
  check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
@@ -93,7 +91,9 @@ class FlairRecognizer(EntityRecognizer):
93
  self.model = SequenceTagger.load(model_path)
94
  else:
95
  print(f"Loading model for language {supported_language}")
96
- self.model = SequenceTagger.load(self.MODEL_LANGUAGES.get(supported_language))
 
 
97
 
98
  super().__init__(
99
  supported_entities=supported_entities,
 
59
  # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
60
  ]
61
 
62
+ MODEL_LANGUAGES = {"en": "flair/ner-english-large"}
 
 
63
 
64
  PRESIDIO_EQUIVALENCES = {
65
  "PER": "PERSON",
 
74
  supported_entities: Optional[List[str]] = None,
75
  check_label_groups: Optional[Tuple[Set, Set]] = None,
76
  model: SequenceTagger = None,
77
+ model_path: Optional[str] = None,
78
  ):
79
  self.check_label_groups = (
80
  check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
 
91
  self.model = SequenceTagger.load(model_path)
92
  else:
93
  print(f"Loading model for language {supported_language}")
94
+ self.model = SequenceTagger.load(
95
+ self.MODEL_LANGUAGES.get(supported_language)
96
+ )
97
 
98
  super().__init__(
99
  supported_entities=supported_entities,
flair_test.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import generic wrappers
2
+ from transformers import AutoModel, AutoTokenizer
3
+
4
+
5
if __name__ == "__main__":
    # Stand-alone check that the Flair NER model loads and tags a sentence.
    from flair.data import Sentence
    from flair.models import SequenceTagger

    ner_tagger = SequenceTagger.load("flair/ner-english-large")

    # Tag a short example in place, then show the annotated sentence.
    example = Sentence("George Washington went to Washington")
    ner_tagger.predict(example)
    print(example)

    # List each predicted NER span on its own line.
    print("The following NER tags are found:")
    for span in example.get_spans("ner"):
        print(span)
index.md CHANGED
@@ -5,22 +5,32 @@ The app is based on the [streamlit](https://streamlit.io/) package.
5
  A live version can be found here: https://huggingface.co/spaces/presidio/presidio_demo
6
 
7
  ## Requirements
8
- 1. Clone the repo and move to the `docs/samples/python/streamlit ` folder
9
- 1. Install dependencies (preferably in a virtual environment)
10
 
11
  ```sh
12
  pip install -r requirements
13
  ```
14
  > Note: This would install additional packages such as `transformers` and `flair` which are not mandatory for using Presidio.
15
 
16
- 2.
17
  3. *Optional*: Update the `analyzer_engine` and `anonymizer_engine` functions for your specific implementation (in `presidio_helpers.py`).
18
- 3. Start the app:
19
 
20
  ```sh
21
  streamlit run presidio_streamlit.py
22
  ```
23
 
 
 
 
 
 
 
 
 
 
 
 
24
  ## Output
25
  Output should be similar to this screenshot:
26
- ![image](https://user-images.githubusercontent.com/3776619/232289541-d59992e1-52a4-44c1-b904-b22c72c02a5b.png)
 
5
  A live version can be found here: https://huggingface.co/spaces/presidio/presidio_demo
6
 
7
  ## Requirements
8
+ 1. Clone the repo and move to the `docs/samples/python/streamlit` folder
9
+ 2. Install dependencies (preferably in a virtual environment)
10
 
11
  ```sh
12
  pip install -r requirements
13
  ```
14
  > Note: This would install additional packages such as `transformers` and `flair` which are not mandatory for using Presidio.
15
 
 
16
  3. *Optional*: Update the `analyzer_engine` and `anonymizer_engine` functions for your specific implementation (in `presidio_helpers.py`).
17
+ 4. Start the app:
18
 
19
  ```sh
20
  streamlit run presidio_streamlit.py
21
  ```
22
 
23
+ 5. Consider adding an `.env` file with the following environment variables, for further customizability:
24
+ ```sh
25
+ TA_KEY=YOUR_TEXT_ANALYTICS_KEY
26
+ TA_ENDPOINT=YOUR_TEXT_ANALYTICS_ENDPOINT
27
+ OPENAI_TYPE="Azure" #or "openai"
28
+ OPENAI_KEY=YOUR_OPENAI_KEY
29
+ OPENAI_API_VERSION="2023-05-15"
30
+ AZURE_OPENAI_ENDPOINT=YOUR_AZURE_OPENAI_ENDPOINT
31
+ AZURE_OPENAI_DEPLOYMENT=text-davinci-003
32
+ ALLOW_OTHER_MODELS=true #true if the user could download new models
33
+ ```
34
  ## Output
35
  Output should be similar to this screenshot:
36
+ ![image](https://github.com/microsoft/presidio/assets/3776619/7d0eadf1-e750-4747-8b59-8203aa43cac8)
openai_fake_data_generator.py CHANGED
@@ -39,7 +39,10 @@ def call_completion_model(
39
  """
40
  if deployment_id:
41
  response = openai.Completion.create(
42
- deployment_id=deployment_id, model=model, prompt=prompt, max_tokens=max_tokens
 
 
 
43
  )
44
  else:
45
  response = openai.Completion.create(
@@ -64,17 +67,18 @@ def create_prompt(anonymized_text: str) -> str:
64
 
65
  a. Use completely random numbers, so every digit is drawn between 0 and 9.
66
  b. Use realistic names that come from diverse genders, ethnicities and countries.
67
- c. If there are no placeholders, return the text as is and provide an answer.
68
  d. Keep the formatting as close to the original as possible.
69
  e. If PII exists in the input, replace it with fake values in the output.
 
70
 
71
- input: How do I change the limit on my credit card {{credit_card_number}}?
72
  output: How do I change the limit on my credit card 2539 3519 2345 1555?
73
- input: <PERSON> was the chief science officer at <ORGANIZATION>.
74
  output: Katherine Buckjov was the chief science officer at NASA.
75
- input: Cameroon lives in <LOCATION>.
76
  output: Vladimir lives in Moscow.
77
- input: {anonymized_text}
78
- output:
79
- """
80
  return prompt
 
39
  """
40
  if deployment_id:
41
  response = openai.Completion.create(
42
+ deployment_id=deployment_id,
43
+ model=model,
44
+ prompt=prompt,
45
+ max_tokens=max_tokens,
46
  )
47
  else:
48
  response = openai.Completion.create(
 
67
 
68
  a. Use completely random numbers, so every digit is drawn between 0 and 9.
69
  b. Use realistic names that come from diverse genders, ethnicities and countries.
70
+ c. If there are no placeholders, return the text as is.
71
  d. Keep the formatting as close to the original as possible.
72
  e. If PII exists in the input, replace it with fake values in the output.
73
+ f. Remove whitespace before and after the generated text
74
 
75
+ input: [[TEXT STARTS]] How do I change the limit on my credit card {{credit_card_number}}?[[TEXT ENDS]]
76
  output: How do I change the limit on my credit card 2539 3519 2345 1555?
77
+ input: [[TEXT STARTS]]<PERSON> was the chief science officer at <ORGANIZATION>.[[TEXT ENDS]]
78
  output: Katherine Buckjov was the chief science officer at NASA.
79
+ input: [[TEXT STARTS]]Cameroon lives in <LOCATION>.[[TEXT ENDS]]
80
  output: Vladimir lives in Moscow.
81
+
82
+ input: [[TEXT STARTS]]{anonymized_text}[[TEXT ENDS]]
83
+ output:"""
84
  return prompt
presidio_helpers.py CHANGED
@@ -25,7 +25,8 @@ from presidio_nlp_engine_config import (
25
  create_nlp_engine_with_spacy,
26
  create_nlp_engine_with_flair,
27
  create_nlp_engine_with_transformers,
28
- create_nlp_engine_with_azure_text_analytics,
 
29
  )
30
 
31
  logger = logging.getLogger("presidio-streamlit")
@@ -49,14 +50,16 @@ def nlp_engine_and_registry(
49
  """
50
 
51
  # Set up NLP Engine according to the model of choice
52
- if "spaCy" in model_family:
53
  return create_nlp_engine_with_spacy(model_path)
54
- elif "flair" in model_family:
 
 
55
  return create_nlp_engine_with_flair(model_path)
56
- elif "HuggingFace" in model_family:
57
  return create_nlp_engine_with_transformers(model_path)
58
- elif "Azure Text Analytics" in model_family:
59
- return create_nlp_engine_with_azure_text_analytics(ta_key, ta_endpoint)
60
  else:
61
  raise ValueError(f"Model family {model_family} not supported")
62
 
 
25
  create_nlp_engine_with_spacy,
26
  create_nlp_engine_with_flair,
27
  create_nlp_engine_with_transformers,
28
+ create_nlp_engine_with_azure_ai_language,
29
+ create_nlp_engine_with_stanza,
30
  )
31
 
32
  logger = logging.getLogger("presidio-streamlit")
 
50
  """
51
 
52
  # Set up NLP Engine according to the model of choice
53
+ if "spacy" in model_family.lower():
54
  return create_nlp_engine_with_spacy(model_path)
55
+ if "stanza" in model_family.lower():
56
+ return create_nlp_engine_with_stanza(model_path)
57
+ elif "flair" in model_family.lower():
58
  return create_nlp_engine_with_flair(model_path)
59
+ elif "huggingface" in model_family.lower():
60
  return create_nlp_engine_with_transformers(model_path)
61
+ elif "azure ai language" in model_family.lower():
62
+ return create_nlp_engine_with_azure_ai_language(ta_key, ta_endpoint)
63
  else:
64
  raise ValueError(f"Model family {model_family} not supported")
65
 
presidio_nlp_engine_config.py CHANGED
@@ -1,8 +1,12 @@
1
- from typing import Tuple
2
  import logging
 
 
3
  import spacy
4
  from presidio_analyzer import RecognizerRegistry
5
- from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
 
 
 
6
 
7
  logger = logging.getLogger("presidio-streamlit")
8
 
@@ -12,21 +16,70 @@ def create_nlp_engine_with_spacy(
12
  ) -> Tuple[NlpEngine, RecognizerRegistry]:
13
  """
14
  Instantiate an NlpEngine with a spaCy model
15
- :param model_path: spaCy model path.
16
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  registry = RecognizerRegistry()
18
- registry.load_predefined_recognizers()
19
 
20
- if not spacy.util.is_package(model_path):
21
- spacy.cli.download(model_path)
22
 
 
 
 
 
 
 
 
 
23
  nlp_configuration = {
24
- "nlp_engine_name": "spacy",
25
  "models": [{"lang_code": "en", "model_name": model_path}],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  }
27
 
28
  nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
29
 
 
 
 
30
  return nlp_engine, registry
31
 
32
 
@@ -39,41 +92,62 @@ def create_nlp_engine_with_transformers(
39
  would return NlpArtifacts such as POS and lemmas.
40
  :param model_path: HuggingFace model path.
41
  """
 
42
 
43
- from transformers_rec import (
44
- STANFORD_COFIGURATION,
45
- BERT_DEID_CONFIGURATION,
46
- TransformersRecognizer,
47
- )
48
-
49
- registry = RecognizerRegistry()
50
- registry.load_predefined_recognizers()
51
-
52
- if not spacy.util.is_package("en_core_web_sm"):
53
- spacy.cli.download("en_core_web_sm")
54
- # Using a small spaCy model + a HF NER model
55
- transformers_recognizer = TransformersRecognizer(model_path=model_path)
56
-
57
- if model_path == "StanfordAIMI/stanford-deidentifier-base":
58
- transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
59
- elif model_path == "obi/deid_roberta_i2b2":
60
- transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
61
- else:
62
- print(f"Warning: Model has no configuration, loading default.")
63
- transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
64
-
65
- # Use small spaCy model, no need for both spacy and HF models
66
- # The transformers model is used here as a recognizer, not as an NlpEngine
67
  nlp_configuration = {
68
- "nlp_engine_name": "spacy",
69
- "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  }
71
 
72
- registry.add_recognizer(transformers_recognizer)
73
- registry.remove_recognizer("SpacyRecognizer")
74
-
75
  nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
76
 
 
 
 
77
  return nlp_engine, registry
78
 
79
 
@@ -91,6 +165,8 @@ def create_nlp_engine_with_flair(
91
  registry = RecognizerRegistry()
92
  registry.load_predefined_recognizers()
93
 
 
 
94
  if not spacy.util.is_package("en_core_web_sm"):
95
  spacy.cli.download("en_core_web_sm")
96
  # Using a small spaCy model + a Flair NER model
@@ -107,7 +183,7 @@ def create_nlp_engine_with_flair(
107
  return nlp_engine, registry
108
 
109
 
110
- def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
111
  """
112
  Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
113
  The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
@@ -115,7 +191,7 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
115
  :param ta_key: Azure Text Analytics key.
116
  :param ta_endpoint: Azure Text Analytics endpoint.
117
  """
118
- from text_analytics_wrapper import TextAnalyticsWrapper
119
 
120
  if not ta_key or not ta_endpoint:
121
  raise RuntimeError("Please fill in the Text Analytics endpoint details")
@@ -123,7 +199,9 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
123
  registry = RecognizerRegistry()
124
  registry.load_predefined_recognizers()
125
 
126
- ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key)
 
 
127
  nlp_configuration = {
128
  "nlp_engine_name": "spacy",
129
  "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
@@ -131,7 +209,7 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
131
 
132
  nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
133
 
134
- registry.add_recognizer(ta_recognizer)
135
  registry.remove_recognizer("SpacyRecognizer")
136
 
137
  return nlp_engine, registry
 
 
1
  import logging
2
+ from typing import Tuple
3
+
4
  import spacy
5
  from presidio_analyzer import RecognizerRegistry
6
+ from presidio_analyzer.nlp_engine import (
7
+ NlpEngine,
8
+ NlpEngineProvider,
9
+ )
10
 
11
  logger = logging.getLogger("presidio-streamlit")
12
 
 
16
  ) -> Tuple[NlpEngine, RecognizerRegistry]:
17
  """
18
  Instantiate an NlpEngine with a spaCy model
19
+ :param model_path: path to model / model name.
20
  """
21
+ nlp_configuration = {
22
+ "nlp_engine_name": "spacy",
23
+ "models": [{"lang_code": "en", "model_name": model_path}],
24
+ "ner_model_configuration": {
25
+ "model_to_presidio_entity_mapping": {
26
+ "PER": "PERSON",
27
+ "PERSON": "PERSON",
28
+ "NORP": "NRP",
29
+ "FAC": "FACILITY",
30
+ "LOC": "LOCATION",
31
+ "GPE": "LOCATION",
32
+ "LOCATION": "LOCATION",
33
+ "ORG": "ORGANIZATION",
34
+ "ORGANIZATION": "ORGANIZATION",
35
+ "DATE": "DATE_TIME",
36
+ "TIME": "DATE_TIME",
37
+ },
38
+ "low_confidence_score_multiplier": 0.4,
39
+ "low_score_entity_names": ["ORG", "ORGANIZATION"],
40
+ },
41
+ }
42
+
43
+ nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
44
+
45
  registry = RecognizerRegistry()
46
+ registry.load_predefined_recognizers(nlp_engine=nlp_engine)
47
 
48
+ return nlp_engine, registry
 
49
 
50
+
51
def create_nlp_engine_with_stanza(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build a stanza-based NlpEngine and a registry of predefined recognizers.

    :param model_path: path to model / model name, registered for English.
    :return: tuple of (NlpEngine, RecognizerRegistry); the registry's
        predefined recognizers are loaded with the created engine.
    """
    # Model NER labels -> Presidio entity names.
    label_mapping = {
        "PER": "PERSON",
        "PERSON": "PERSON",
        "NORP": "NRP",
        "FAC": "FACILITY",
        "LOC": "LOCATION",
        "GPE": "LOCATION",
        "LOCATION": "LOCATION",
        "ORG": "ORGANIZATION",
        "ORGANIZATION": "ORGANIZATION",
        "DATE": "DATE_TIME",
        "TIME": "DATE_TIME",
    }

    engine = NlpEngineProvider(
        nlp_configuration={
            "nlp_engine_name": "stanza",
            "models": [{"lang_code": "en", "model_name": model_path}],
            "ner_model_configuration": {
                "model_to_presidio_entity_mapping": label_mapping,
            },
        }
    ).create_engine()

    recognizers = RecognizerRegistry()
    recognizers.load_predefined_recognizers(nlp_engine=engine)

    return engine, recognizers
84
 
85
 
 
92
  would return NlpArtifacts such as POS and lemmas.
93
  :param model_path: HuggingFace model path.
94
  """
95
+ print(f"Loading Transformers model: {model_path} of type {type(model_path)}")
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  nlp_configuration = {
98
+ "nlp_engine_name": "transformers",
99
+ "models": [
100
+ {
101
+ "lang_code": "en",
102
+ "model_name": {"spacy": "en_core_web_sm", "transformers": model_path},
103
+ }
104
+ ],
105
+ "ner_model_configuration": {
106
+ "model_to_presidio_entity_mapping": {
107
+ "PER": "PERSON",
108
+ "PERSON": "PERSON",
109
+ "LOC": "LOCATION",
110
+ "LOCATION": "LOCATION",
111
+ "GPE": "LOCATION",
112
+ "ORG": "ORGANIZATION",
113
+ "ORGANIZATION": "ORGANIZATION",
114
+ "NORP": "NRP",
115
+ "AGE": "AGE",
116
+ "ID": "ID",
117
+ "EMAIL": "EMAIL",
118
+ "PATIENT": "PERSON",
119
+ "STAFF": "PERSON",
120
+ "HOSP": "ORGANIZATION",
121
+ "PATORG": "ORGANIZATION",
122
+ "DATE": "DATE_TIME",
123
+ "TIME": "DATE_TIME",
124
+ "PHONE": "PHONE_NUMBER",
125
+ "HCW": "PERSON",
126
+ "HOSPITAL": "ORGANIZATION",
127
+ "FACILITY": "LOCATION",
128
+ },
129
+ "low_confidence_score_multiplier": 0.4,
130
+ "low_score_entity_names": ["ID"],
131
+ "labels_to_ignore": [
132
+ "CARDINAL",
133
+ "EVENT",
134
+ "LANGUAGE",
135
+ "LAW",
136
+ "MONEY",
137
+ "ORDINAL",
138
+ "PERCENT",
139
+ "PRODUCT",
140
+ "QUANTITY",
141
+ "WORK_OF_ART",
142
+ ],
143
+ },
144
  }
145
 
 
 
 
146
  nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
147
 
148
+ registry = RecognizerRegistry()
149
+ registry.load_predefined_recognizers(nlp_engine=nlp_engine)
150
+
151
  return nlp_engine, registry
152
 
153
 
 
165
  registry = RecognizerRegistry()
166
  registry.load_predefined_recognizers()
167
 
168
+ # there is no official Flair NlpEngine, hence we load it as an additional recognizer
169
+
170
  if not spacy.util.is_package("en_core_web_sm"):
171
  spacy.cli.download("en_core_web_sm")
172
  # Using a small spaCy model + a Flair NER model
 
183
  return nlp_engine, registry
184
 
185
 
186
+ def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
187
  """
188
  Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
189
  The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
 
191
  :param ta_key: Azure Text Analytics key.
192
  :param ta_endpoint: Azure Text Analytics endpoint.
193
  """
194
+ from azure_ai_language_wrapper import AzureAIServiceWrapper
195
 
196
  if not ta_key or not ta_endpoint:
197
  raise RuntimeError("Please fill in the Text Analytics endpoint details")
 
199
  registry = RecognizerRegistry()
200
  registry.load_predefined_recognizers()
201
 
202
+ azure_ai_language_recognizer = AzureAIServiceWrapper(
203
+ ta_endpoint=ta_endpoint, ta_key=ta_key
204
+ )
205
  nlp_configuration = {
206
  "nlp_engine_name": "spacy",
207
  "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
 
209
 
210
  nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
211
 
212
+ registry.add_recognizer(azure_ai_language_recognizer)
213
  registry.remove_recognizer("SpacyRecognizer")
214
 
215
  return nlp_engine, registry
presidio_streamlit.py CHANGED
@@ -56,7 +56,8 @@ model_list = [
56
  "flair/ner-english-large",
57
  "HuggingFace/obi/deid_roberta_i2b2",
58
  "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
59
- "Azure Text Analytics PII",
 
60
  "Other",
61
  ]
62
  if not allow_other_models:
@@ -75,22 +76,22 @@ st_model_package = st_model.split("/")[0]
75
  # Remove package prefix (if needed)
76
  st_model = (
77
  st_model
78
- if st_model_package not in ("spaCy", "HuggingFace")
79
  else "/".join(st_model.split("/")[1:])
80
  )
81
 
82
  if st_model == "Other":
83
  st_model_package = st.sidebar.selectbox(
84
- "NER model OSS package", options=["spaCy", "Flair", "HuggingFace"]
85
  )
86
  st_model = st.sidebar.text_input(f"NER model name", value="")
87
 
88
- if st_model == "Azure Text Analytics PII":
89
  st_ta_key = st.sidebar.text_input(
90
- f"Text Analytics key", value=os.getenv("TA_KEY", ""), type="password"
91
  )
92
  st_ta_endpoint = st.sidebar.text_input(
93
- f"Text Analytics endpoint",
94
  value=os.getenv("TA_ENDPOINT", default=""),
95
  help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview", # noqa: E501
96
  )
@@ -124,16 +125,10 @@ open_ai_params = None
124
 
125
  logger.debug(f"st_operator: {st_operator}")
126
 
127
- if st_operator == "mask":
128
- st_number_of_chars = st.sidebar.number_input(
129
- "number of chars", value=st_number_of_chars, min_value=0, max_value=100
130
- )
131
- st_mask_char = st.sidebar.text_input(
132
- "Mask character", value=st_mask_char, max_chars=1
133
- )
134
- elif st_operator == "encrypt":
135
- st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
136
- elif st_operator == "synthesize":
137
  if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
138
  openai_api_type = "azure"
139
  st_openai_api_base = st.sidebar.text_input(
@@ -161,6 +156,34 @@ elif st_operator == "synthesize":
161
  value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
162
  help="See more here: https://platform.openai.com/docs/models/",
163
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
  open_ai_params = OpenAIParams(
166
  openai_key=st_openai_key,
@@ -214,7 +237,8 @@ with st.expander("About this demo", expanded=False):
214
  \n\n[Code](https://aka.ms/presidio) |
215
  [Tutorial](https://microsoft.github.io/presidio/tutorial/) |
216
  [Installation](https://microsoft.github.io/presidio/installation/) |
217
- [FAQ](https://microsoft.github.io/presidio/faq/) |"""
 
218
  )
219
 
220
  st.info(
 
56
  "flair/ner-english-large",
57
  "HuggingFace/obi/deid_roberta_i2b2",
58
  "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
59
+ "stanza/en",
60
+ "Azure AI Language",
61
  "Other",
62
  ]
63
  if not allow_other_models:
 
76
  # Remove package prefix (if needed)
77
  st_model = (
78
  st_model
79
+ if st_model_package.lower() not in ("spacy", "stanza", "huggingface")
80
  else "/".join(st_model.split("/")[1:])
81
  )
82
 
83
  if st_model == "Other":
84
  st_model_package = st.sidebar.selectbox(
85
+ "NER model OSS package", options=["spaCy", "stanza", "Flair", "HuggingFace"]
86
  )
87
  st_model = st.sidebar.text_input(f"NER model name", value="")
88
 
89
+ if st_model == "Azure AI Language":
90
  st_ta_key = st.sidebar.text_input(
91
+ f"Azure AI Language key", value=os.getenv("TA_KEY", ""), type="password"
92
  )
93
  st_ta_endpoint = st.sidebar.text_input(
94
+ f"Azure AI Language endpoint",
95
  value=os.getenv("TA_ENDPOINT", default=""),
96
  help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview", # noqa: E501
97
  )
 
125
 
126
  logger.debug(f"st_operator: {st_operator}")
127
 
128
+
129
+ def set_up_openai_synthesis():
130
+ """Set up the OpenAI API key and model for text synthesis."""
131
+
 
 
 
 
 
 
132
  if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
133
  openai_api_type = "azure"
134
  st_openai_api_base = st.sidebar.text_input(
 
156
  value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
157
  help="See more here: https://platform.openai.com/docs/models/",
158
  )
159
+ return (
160
+ openai_api_type,
161
+ st_openai_api_base,
162
+ st_deployment_name,
163
+ st_openai_version,
164
+ st_openai_key,
165
+ st_openai_model,
166
+ )
167
+
168
+
169
+ if st_operator == "mask":
170
+ st_number_of_chars = st.sidebar.number_input(
171
+ "number of chars", value=st_number_of_chars, min_value=0, max_value=100
172
+ )
173
+ st_mask_char = st.sidebar.text_input(
174
+ "Mask character", value=st_mask_char, max_chars=1
175
+ )
176
+ elif st_operator == "encrypt":
177
+ st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
178
+ elif st_operator == "synthesize":
179
+ (
180
+ openai_api_type,
181
+ st_openai_api_base,
182
+ st_deployment_name,
183
+ st_openai_version,
184
+ st_openai_key,
185
+ st_openai_model,
186
+ ) = set_up_openai_synthesis()
187
 
188
  open_ai_params = OpenAIParams(
189
  openai_key=st_openai_key,
 
237
  \n\n[Code](https://aka.ms/presidio) |
238
  [Tutorial](https://microsoft.github.io/presidio/tutorial/) |
239
  [Installation](https://microsoft.github.io/presidio/installation/) |
240
+ [FAQ](https://microsoft.github.io/presidio/faq/) |
241
+ [Feedback](https://forms.office.com/r/9ufyYjfDaY) |"""
242
  )
243
 
244
  st.info(
test_streamlit.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from presidio_helpers import analyzer_engine, analyze, anonymize
2
+
3
+
4
def test_streamlit_logic():
    """Run analyze + anonymize on the demo text using the stanza "en" model."""
    # Alternative configuration kept for reference:
    #   model_name = "StanfordAIMI/stanford-deidentifier-base"
    #   model_package = "HuggingFace"
    model_package = "stanza"
    model_name = "en"
    ta_key = None
    ta_endpoint = None
    engine_args = (model_package, model_name, ta_key, ta_endpoint)

    # Read default text
    with open("demo_text.txt") as demo_file:
        st_text = demo_file.read()

    # instantiate and cache AnalyzerEngine
    analyzer_engine(*engine_args)

    # Analyze
    findings = analyze(
        *engine_args,
        text=st_text,
        entities="All",
        language="en",
        score_threshold=0.35,
        return_decision_process=True,
        allow_list=[],
        deny_list=[],
    )

    # Anonymize
    anonymized = anonymize(
        text=st_text,
        operator="replace",
        mask_char=None,
        number_of_chars=None,
        encrypt_key=None,
        analyze_results=findings,
    )

    assert anonymized.text != ""