|
import os |
|
from typing import List, Optional |
|
|
|
import dotenv |
|
from azure.ai.textanalytics import TextAnalyticsClient |
|
from azure.core.credentials import AzureKeyCredential |
|
|
|
from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation |
|
from presidio_analyzer.nlp_engine import NlpArtifacts |
|
|
|
class TextAnalyticsWrapper(EntityRecognizer):
    """Presidio recognizer that delegates PII detection to Azure Text Analytics.

    The text is sent to the client's ``recognize_pii_entities`` call and each
    returned entity whose category is supported is converted into a
    ``RecognizerResult``.
    """

    from azure.ai.textanalytics._models import PiiEntityCategory

    # Default set of supported entities: every value of PiiEntityCategory
    # exposed by the installed Azure SDK.
    TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]

    def __init__(
        self,
        supported_entities: Optional[List[str]] = None,
        supported_language: str = "en",
        ta_client: Optional[TextAnalyticsClient] = None,
        ta_key: Optional[str] = None,
        ta_endpoint: Optional[str] = None,
    ):
        """
        Wrapper for the Azure Text Analytics client

        :param supported_entities: entity categories this recognizer reports;
            falls back to ``TA_SUPPORTED_ENTITIES`` when empty or None
        :param supported_language: language code passed to the service
        :param ta_client: object of type TextAnalyticsClient; when omitted, a
            client is created from ``ta_key`` and ``ta_endpoint``
        :param ta_key: Azure cognitive Services for Language key
        :param ta_endpoint: Azure cognitive Services for Language endpoint
        """
        if not supported_entities:
            supported_entities = self.TA_SUPPORTED_ENTITIES

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Azure Text Analytics PII",
        )

        self.ta_key = ta_key
        self.ta_endpoint = ta_endpoint

        # Only build a client when the caller did not inject one.
        if not ta_client:
            ta_client = self.__authenticate_client(ta_key, ta_endpoint)
        self.ta_client = ta_client

    @staticmethod
    def __authenticate_client(key: str, endpoint: str):
        """Create a TextAnalyticsClient for ``endpoint`` authenticated by ``key``.

        :param key: key wrapped in an ``AzureKeyCredential``
        :param endpoint: service endpoint passed to the client
        :return: the newly created ``TextAnalyticsClient``
        """
        ta_credential = AzureKeyCredential(key)
        return TextAnalyticsClient(endpoint=endpoint, credential=ta_credential)

    def analyze(
        self,
        text: str,
        entities: Optional[List[str]] = None,
        nlp_artifacts: NlpArtifacts = None,
    ) -> List[RecognizerResult]:
        """Detect PII entities in ``text`` using the Text Analytics client.

        :param text: text to analyze; sent to the service as a single document
        :param entities: entity categories requested by the caller; when empty
            or None, every supported category is returned
        :param nlp_artifacts: accepted for interface compatibility; not used
        :return: one ``RecognizerResult`` per matching entity
        """
        response = self.ta_client.recognize_pii_entities(
            [text], language=self.supported_language
        )

        recognizer_results = []
        for doc in response:
            # Documents the service flagged as errors carry no usable
            # entities and are skipped.
            if doc.is_error:
                continue
            for entity in doc.entities:
                category = entity.category
                if category not in self.supported_entities:
                    continue
                # Honor the caller's requested entities; an empty request
                # means "no restriction".
                if entities and category not in entities:
                    continue
                analysis_explanation = TextAnalyticsWrapper._build_explanation(
                    original_score=entity.confidence_score,
                    entity_type=category,
                )
                recognizer_results.append(
                    RecognizerResult(
                        entity_type=category,
                        start=entity.offset,
                        end=entity.offset + len(entity.text),
                        score=entity.confidence_score,
                        analysis_explanation=analysis_explanation,
                    )
                )

        return recognizer_results

    @staticmethod
    def _build_explanation(
        original_score: float, entity_type: str
    ) -> AnalysisExplanation:
        """Build the explanation attached to a result.

        :param original_score: confidence score reported for the entity
        :param entity_type: category of the detected entity
        :return: an ``AnalysisExplanation`` naming this recognizer
        """
        return AnalysisExplanation(
            # Use the class's own name; ``__class__.__name__`` on a class
            # object yields the metaclass name (e.g. "type") instead.
            recognizer=TextAnalyticsWrapper.__name__,
            original_score=original_score,
            textual_explanation=f"Identified as {entity_type} by Text Analytics",
        )

    def load(self) -> None:
        """No-op: this recognizer has nothing to load."""
        pass
|
|
|
|
|
if __name__ == "__main__":
    # Demo: run the sample text through an analyzer configured for
    # Azure Text Analytics and print what was detected.
    import presidio_helpers

    # Load environment variables (TA_KEY / TA_ENDPOINT are read below)
    # from a .env file when one is present.
    dotenv.load_dotenv()

    # NOTE(review): the email address below replaces a "[email protected]"
    # scraping artifact; confirm the intended sample address.
    text = """
    Here are a few example sentences we currently support:

    Hello, my name is David Johnson and I live in Maine.
    My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.

    On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.

    My passport: 191280342 and my phone number: (212) 555-1234.

    This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?

    Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
    """

    # Raises KeyError if TA_KEY or TA_ENDPOINT is not set.
    analyzer = presidio_helpers.analyzer_engine(
        model_path="Azure Text Analytics PII",
        ta_key=os.environ["TA_KEY"],
        ta_endpoint=os.environ["TA_ENDPOINT"],
    )

    # Print the findings; previously the result was computed and discarded.
    results = analyzer.analyze(text=text, language="en")
    print(results)
|
|