jfrery-zama
commited on
Commit
·
646bd9e
1
Parent(s):
b0042eb
initial commit
Browse files- .gitignore +1 -0
- README copy.md +55 -0
- app.py +95 -0
- cml_xgboost.model +3 -0
- demo_text.txt +10 -0
- embedded_model.model +3 -0
- embedded_model.model.wv.vectors_ngrams.npy +3 -0
- fhe_anonymizer.py +74 -0
- images/logos/community.png +0 -0
- images/logos/documentation.png +0 -0
- images/logos/github.png +0 -0
- images/logos/x.png +0 -0
- images/logos/zama.jpg +0 -0
- requirements.txt +3 -0
- utils_demo.py +274 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
__pycache__/
|
README copy.md
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Encrypted Anonymization Using Fully Homomorphic Encryption
|
3 |
+
emoji: 🕵️♂️ 🔒
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: purple
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.40.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: true
|
10 |
+
tags:
|
11 |
+
- FHE
|
12 |
+
- PPML
|
13 |
+
- privacy
|
14 |
+
- privacy preserving machine learning
|
15 |
+
- data anonymization
|
16 |
+
- homomorphic encryption
|
17 |
+
- security
|
18 |
+
python_version: 3.10.11
|
19 |
+
---
|
20 |
+
|
21 |
+
# Data Anonymization using FHE
|
22 |
+
|
23 |
+
## Run the application locally
|
24 |
+
|
25 |
+
### Install the dependencies
|
26 |
+
|
27 |
+
First, create a virtual env and activate it:
|
28 |
+
|
29 |
+
```bash
|
30 |
+
python3 -m venv .venv
|
31 |
+
source .venv/bin/activate
|
32 |
+
```
|
33 |
+
|
34 |
+
Then, install the required packages:
|
35 |
+
|
36 |
+
```bash
|
37 |
+
pip3 install pip --upgrade
|
38 |
+
pip3 install -U pip wheel setuptools --ignore-installed
|
39 |
+
pip3 install -r requirements.txt --ignore-installed
|
40 |
+
```
|
41 |
+
|
42 |
+
The above steps should only be done once.
|
43 |
+
|
44 |
+
## Run the app
|
45 |
+
|
46 |
+
In a terminal, run:
|
47 |
+
|
48 |
+
```bash
|
49 |
+
source .venv/bin/activate
|
50 |
+
python3 app.py
|
51 |
+
```
|
52 |
+
|
53 |
+
## Interact with the application
|
54 |
+
|
55 |
+
Open the URL printed in the terminal (look for a line like `Running on local URL: http://127.0.0.1:8888/`).
|
app.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""A Gradio app for anonymizing text data using FHE."""
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
import re
|
5 |
+
from fhe_anonymizer import FHEAnonymizer
|
6 |
+
import pandas as pd
|
7 |
+
|
8 |
+
|
9 |
+
# Module-level anonymizer, constructed once at import time and called by `deidentify_text`.
anonymizer = FHEAnonymizer()
|
10 |
+
|
11 |
+
|
12 |
+
def deidentify_text(input_text):
    """Anonymize ``input_text`` and tabulate the words that were flagged.

    Args:
        input_text: Raw text typed into the demo's input box.

    Returns:
        A ``(anonymized_text, identified_df)`` pair, where ``identified_df`` is a
        one-column DataFrame ("Identified Words") listing every flagged word.
        The frame is empty (but keeps its column) when nothing was flagged.
    """
    anonymized_text, identified_words = anonymizer(input_text)
    column_names = ["Identified Words"]

    # Guard clause: nothing was flagged, so return an empty table with the same header.
    if not identified_words:
        return anonymized_text, pd.DataFrame(columns=column_names)

    return anonymized_text, pd.DataFrame(identified_words, columns=column_names)
|
20 |
+
|
21 |
+
|
22 |
+
# Default demo text from the file
# NOTE: the path is relative, so it is resolved against the current working directory.
with open("demo_text.txt", "r") as file:
    default_demo_text = file.read()

# Top-level Gradio container; every component below is declared inside it.
demo = gr.Blocks()

with demo:
    # Page header: logo, title and external links.
    gr.Markdown(
        """
        <p align="center">
        <img width=200 src="file/images/logos/zama.jpg">
        </p>
        <h1 style="text-align: center;">Encrypted Anonymization Using Fully Homomorphic Encryption</h1>
        <p align="center">
        <a href="https://github.com/zama-ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/github.png">Concrete-ML</a>
        —
        <a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/documentation.png">Documentation</a>
        —
        <a href="https://zama.ai/community"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/community.png">Community</a>
        —
        <a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/x.png">@zama_fhe</a>
        </p>
        """
    )

    # Three collapsed FAQ-style sections with explanatory text.
    with gr.Accordion("What is Encrypted Anonymization?", open=False):
        gr.Markdown(
            """
            Encrypted Anonymization leverages Fully Homomorphic Encryption (FHE) to protect sensitive information during data processing. This approach allows for the anonymization of text data, such as personal identifiers, while ensuring that the data remains encrypted throughout the entire process. It enables organizations to utilize sensitive data for analytics and machine learning without compromising individual privacy or security.
            """
        )

    with gr.Accordion("Why is privacy important in data processing?", open=False):
        gr.Markdown(
            """
            Privacy in data processing is critical to protect individuals' personal information from unauthorized access and potential misuse. With the increasing amount of personal data being collected and analyzed, the risks associated with data breaches and identity theft have also risen. By implementing privacy-preserving techniques, such as encrypted anonymization, organizations can safeguard sensitive information, build trust with their customers, and comply with stringent data protection regulations.
            """
        )

    with gr.Accordion(
        "How does Fully Homomorphic Encryption enhance data privacy?", open=False
    ):
        gr.Markdown(
            """
            Fully Homomorphic Encryption (FHE) enhances data privacy by enabling computations on encrypted data without needing to decrypt it first. This revolutionary technology ensures that sensitive data can be processed and analyzed securely, without exposing it to potential threats. FHE is a game-changer for privacy-preserving computations, allowing for the secure analysis of encrypted data, which is particularly beneficial in sectors like finance, healthcare, and beyond.
            """
        )

    # NOTE(review): images/banner.png is not among the files added by this commit
    # (only images/logos/* are) — confirm the asset exists or this image will not load.
    gr.Markdown(
        """
        <p align="center">
        <img src="file/images/banner.png">
        </p>
        """
    )

    # Input box, pre-filled with the demo text read above.
    with gr.Row():
        input_text = gr.Textbox(value=default_demo_text, lines=13, placeholder="Input text here...", label="Input")

    # Outputs: the anonymized text and the table of words that were flagged.
    anonymized_text_output = gr.Textbox(label="Anonymized Text", lines=13)

    identified_words_output = gr.Dataframe(label="Identified Words")

    submit_button = gr.Button("Anonymize")

    # Clicking the button runs `deidentify_text` on the input and fills both outputs.
    submit_button.click(
        deidentify_text,
        inputs=[input_text],
        outputs=[anonymized_text_output, identified_words_output],
    )


# Launch the app
demo.launch(share=False)
|
cml_xgboost.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:431175c3c2bd7591ebfffa3ea45b1096dda5ba7588291252994f9be31db35534
|
3 |
+
size 6625266
|
demo_text.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Hello, my name is David Johnson and I live in Maine.
|
2 |
+
My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
|
3 |
+
|
4 |
+
On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.
|
5 |
+
|
6 |
+
My passport: 191280342 and my phone number: (212) 555-1234.
|
7 |
+
|
8 |
+
This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
|
9 |
+
|
10 |
+
Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
|
embedded_model.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:28fcf483356bf2bef29b8220b84803acf9518f19fbc9342e76cac06b30803f28
|
3 |
+
size 73056
|
embedded_model.model.wv.vectors_ngrams.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:faf08ed9c3bc29cf71c16f5d2b311f3bfb730a92f12c2e52d742bc6b59bf9e5f
|
3 |
+
size 800000128
|
fhe_anonymizer.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gensim
|
2 |
+
import re
|
3 |
+
from concrete.ml.deployment import FHEModelClient, FHEModelServer
|
4 |
+
from pathlib import Path
|
5 |
+
from concrete.ml.common.serialization.loaders import load
|
6 |
+
|
7 |
+
# Directory containing this file; model artifacts are resolved relative to it.
base_dir = Path(__file__).parent


class FHEAnonymizer:
    """Word-level text anonymizer.

    Each whitespace-separated word is embedded with a FastText model and
    classified by a model loaded from ``cml_xgboost.model``; words predicted
    as class ``1`` are replaced by ``<REMOVED>``.
    """

    def __init__(self, punctuation_list=".,!?:;"):
        """Load the models and set up the FHE client/server pair.

        Args:
            punctuation_list: Characters stripped by `preprocess_sentences`
                when they sit on the edge of a word.
        """
        self.embeddings_model = gensim.models.FastText.load(str(base_dir / "embedded_model.model"))
        self.punctuation_list = punctuation_list
        with open(base_dir / "cml_xgboost.model", "r") as model_file:
            self.fhe_ner_detection = load(file=model_file)

        # Client and server are both built from the "deployment" directory next to this file.
        path_to_model = (base_dir / "deployment").resolve()
        self.client = FHEModelClient(path_to_model)
        self.server = FHEModelServer(path_to_model)
        self.client.generate_private_and_evaluation_keys()
        self.evaluation_key = self.client.get_serialized_evaluation_keys()

    def fhe_inference(self, x):
        """Run one encrypted inference round-trip on ``x``.

        The input is quantized, encrypted and serialized by the client, run by
        the server with the evaluation key, then deserialized, decrypted and
        dequantized by the client.
        """
        enc_x = self.client.quantize_encrypt_serialize(x)
        enc_y = self.server.run(enc_x, self.evaluation_key)
        y = self.client.deserialize_decrypt_dequantize(enc_y)
        return y

    def __call__(self, text: str):
        """Anonymize ``text``.

        Args:
            text: Raw input text.

        Returns:
            ``(modified_text, identified_words)``: the preprocessed text with
            flagged words replaced by ``<REMOVED>``, and the list of flagged words.
        """
        text = self.preprocess_sentences(text)
        identified_words = []
        new_text = []

        for word in text.split():
            # Prediction for each word; `[None]` adds a leading axis to the embedding.
            x = self.embeddings_model.wv[word][None]
            # NOTE: prediction goes through the loaded model's `predict`;
            # the `fhe_inference` path is not used here.
            prediction = self.fhe_ner_detection.predict(x)

            if prediction == 1:
                identified_words.append(word)
                new_text.append("<REMOVED>")
            else:
                new_text.append(word)

        # Joining the modified text
        modified_text = " ".join(new_text)

        return modified_text, identified_words

    def preprocess_sentences(self, sentence, verbose=False):
        """Normalize whitespace and strip edge punctuation from ``sentence``.

        Steps, in order: newlines become spaces, runs of spaces collapse,
        possessive ``'s`` becomes `` s``, whitespace before ``,.!?;:`` is
        dropped, punctuation from ``self.punctuation_list`` that is not both
        preceded and followed by a word character is removed, and whitespace
        before punctuation is dropped once more.

        Args:
            sentence: Text to clean.
            verbose: When true, print the text after every step.

        Returns:
            The cleaned text.
        """

        def trace(text):
            # Print the intermediate result when verbose, then pass it through.
            if verbose:
                print(text)
            return text

        sentence = trace(re.sub(r'\n+', ' ', sentence))
        sentence = trace(re.sub(' +', ' ', sentence))
        sentence = trace(re.sub(r"'s\b", " s", sentence))
        sentence = trace(re.sub(r'\s([,.!?;:])', r'\1', sentence))

        # An empty punctuation list would produce the pattern `[]|[]`, which the
        # regex engine reads as ONE character class containing `]`, `|` and `[`
        # and would strip those characters; skip the step in that case.
        if self.punctuation_list:
            escaped = re.escape(self.punctuation_list)
            pattern = r'(?<!\w)[{}]|[{}](?!\w)'.format(escaped, escaped)
            sentence = re.sub(pattern, '', sentence)
        trace(sentence)

        sentence = trace(re.sub(r'\s([,.!?;:])', r'\1', sentence))

        return sentence
|
images/logos/community.png
ADDED
images/logos/documentation.png
ADDED
images/logos/github.png
ADDED
images/logos/x.png
ADDED
images/logos/zama.jpg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
concrete-ml==1.5.0rc0
|
2 |
+
gensim==4.3.2
|
3 |
+
gradio==3.40.1
|
utils_demo.py
ADDED
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import logging
|
3 |
+
import re
|
4 |
+
import string
|
5 |
+
from flair.data import Sentence
|
6 |
+
from flair.models import SequenceTagger
|
7 |
+
from presidio_analyzer import AnalyzerEngine
|
8 |
+
from presidio_anonymizer import AnonymizerEngine
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
# Maps a fused NER label string to its integer code.
# `fuse_ner_labels` reads this table and also appends labels it has not seen before.
# NOTE(review): codes 35 and 36 are each assigned to two different labels, and
# codes 31 and 32 are unused — confirm whether this is intentional.
entity_label_to_code_map = {'<PERSON>': 0,
                            '<O>': 1,
                            '<MISC>-<NRP>': 2,
                            '<NUMBER>': 3,
                            '<PER>-<LOCATION>': 4,
                            '<LOC>': 5,
                            '<MISC>': 6,  # Miscellaneous: doesn't fall into the more common categories of PERSON, LOCATION, ORGANIZATION,
                            '<DATE_TIME>': 7,
                            '<LOCATION>': 8,
                            '<PRONOUNS>': 9,
                            '<IN_PAN>': 10,
                            '<MISC>-<DATE_TIME>': 11,
                            '<ORG>': 12,
                            '<MISC>-<IN_PAN>': 13,
                            '<MISC>-<LOCATION>': 14,
                            '<PER>': 15,
                            '<MISC>-<PERSON>': 16,
                            '<LOC>-<PERSON>': 17,
                            '<PHONE_NUMBER>': 18,
                            '<LOC>-<DATE_TIME>': 19,
                            '<LOC>-<NRP>': 20,
                            '<NRP>': 21,
                            '<ORG>-<PERSON>': 22,
                            '<PER>-<NRP>': 23,
                            '<ORG>-<LOCATION>': 24,
                            '<PER>-<DATE_TIME>': 25,
                            '<PER>-<IN_PAN>': 26,
                            '<ORG>-<IN_PAN>': 27,
                            '<ORG>-<NRP>': 28,
                            '<US_DRIVER_LICENSE>': 29,
                            # NOTE(review): this key looks malformed (unbalanced '<') — verify against the training data.
                            '<KEY <EMAIL_ADDRESS>': 30,
                            '<US_BANK_NUMBER>': 33,
                            '<IN_AADHAAR>': 34,
                            '<CRYPTO>': 35,
                            '<IP_ADDRESS>': 36,
                            '<EMAIL_ADDRESS>': 35,
                            '<US_PASSPORT>': 36,
                            '<US_SSN>': 37,
                            '<MISC>-<URL>': 38}
|
51 |
+
|
52 |
+
|
53 |
+
# Words treated as pronouns (plus a few titles) when fusing NER labels.
# Every entry is followed by a comma: adjacent string literals are silently
# concatenated by Python, which would merge two entries into one.
pronoun_list = [
    'I', 'i', 'me', 'my', 'mine', 'myself', 'you', 'your', 'yours', "I'm", "I am",
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "i'm",
    'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them',
    'their', 'theirs', 'themselves', 'we', 'us', 'our', 'ours', 'ourselves',
    'Me', 'My', 'Mine', 'Myself', 'You', 'Your', 'Yours', 'Yourself', 'Yourselves',
    'He', 'Him', 'His', 'Himself', 'She', 'Her', 'Hers', 'Herself', 'It', 'Its', 'Itself',
    'They', 'Them', 'Their', 'Theirs', 'Themselves', 'We', 'Us', 'Our', 'Ours', 'Ourselves',
    "Lady", "Madam", "Mr.", "Mister", "Sir", "Miss", "Ms.", "Mrs.", "Mr",
]
|
63 |
+
|
64 |
+
|
65 |
+
# Integer code for each privacy category label.
privacy_category_codes = {
    '<PRIVATE>': 1,
    '<NON_PRIVATE>': 2,
    '<OTHER>': 3,
}

# All ASCII punctuation except '%' and '$', as a single string.
punctuation_list = ''.join(
    symbol for symbol in string.punctuation if symbol not in ('%', '$')
)
|
71 |
+
|
72 |
+
def get_word_boundaries(sentence):
    """Return the ``(start, end)`` offsets of every run of non-whitespace characters.

    ``end`` is exclusive, so ``sentence[start:end]`` yields the word itself.
    """
    boundaries = []
    for word_match in re.finditer(r'[^\s]+', sentence):
        boundaries.append(word_match.span())
    return boundaries
|
75 |
+
|
76 |
+
|
77 |
+
def fuse_ner_labels(flair_ner, presidio_ner, text_type="<PRIVATE>"):
    """Merge the NER labels from 'Flair' and 'Presidio' for a given text.

    Custom cases and predefined rules are taken into account for entity
    classification (pronouns, "I am"/"I 'm", numbers).

    Args:
        flair_ner: Sequence of ``(word, label)`` pairs.
        presidio_ner: Sequence of ``(word, label)`` pairs, same length and
            same words as ``flair_ner``.
        text_type: Privacy category assigned to every word whose fused label
            is not ``'<O>'``; must be a key of ``privacy_category_codes``.

    Returns:
        A list with one entry per word:
        ``[word, fused_label, privacy_category, privacy_code, binary_label, multi_label]``.

    Raises:
        AssertionError: If the two inputs differ in length or in any word.

    Side effects:
        Labels missing from ``entity_label_to_code_map`` are added to it (and
        reported on stdout) with the next free code.
    """
    merged_ner = []

    # Sanity check
    assert len(flair_ner) == len(presidio_ner)

    # NOTE(review): (w1, n1) is taken from `presidio_ner` and (w2, n2) from
    # `flair_ner`, yet the formatting below (`f"<{n1}>"`, `n1 in n2`) reads as
    # if n1 were an un-bracketed label and n2 a bracketed one — confirm the
    # argument order used by callers.
    for i, ((w1, n1), (w2, n2)) in enumerate(zip(presidio_ner, flair_ner)):

        assert w1 == w2

        # NOTE(review): the lookup lower-cases w1, so capitalised entries of
        # `pronoun_list` can never match here — verify this is intended.
        if w1.lower() in pronoun_list:
            common_ner = "<PRONOUNS>"
        elif n1 == "<O>" and n2 == "<O>":
            # "am" / "'m" directly after "I" is folded into the pronoun label.
            if w1.lower() in ["am", "'m"] and (i - 1) >= 0 and presidio_ner[i - 1][0].lower() == 'i':
                common_ner = "<PRONOUNS>"
            # Words starting with a number (optional currency sign, separators, %).
            elif bool(re.match(r'(?<!\S)[\$€]?(?:\d{1,3}(?:[ ,.]\d{3})*|\d+)(?:\.\d+)?%?', w1)):
                common_ner = "<NUMBER>"
            else:
                common_ner = '<O>'
        elif n1 in n2:
            common_ner = n2
        elif n1 == '<O>' and n2 != '<O>':
            common_ner = n2
        elif n2 == '<O>' and n1 != '<O>':
            common_ner = f"<{n1}>"
        else:
            common_ner = f"<{n1}>-{n2}"

        # Every branch above binds `common_ner`, so this expression cannot fail.
        common_binary_label = 0 if common_ner == "<O>" else 1

        if common_ner not in entity_label_to_code_map:
            # Unknown label: register it under the next free code.
            common_multi_label = len(entity_label_to_code_map)
            print("NOT in KEY", common_ner)
            entity_label_to_code_map[common_ner] = common_multi_label
        else:
            common_multi_label = entity_label_to_code_map[common_ner]

        is_private = text_type if common_ner != '<O>' else '<OTHER>'

        merged_ner.append([w1, common_ner, is_private, privacy_category_codes[is_private], common_binary_label, common_multi_label])

    return merged_ner
|
130 |
+
|
131 |
+
# Module-level Presidio engines, created once at import time and used by `apply_presidio_model`.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
|
133 |
+
|
134 |
+
|
135 |
+
def apply_presidio_model(sentence, verbose=True):
    """Get Presidio predictions.

    Runs the Presidio analyzer and anonymizer on ``sentence``, then walks the
    anonymized text (split on ``'<'``) alongside the original sentence to
    rebuild a per-word labelling.

    Args:
        sentence: Text to analyse.
        verbose: When true (the default), print intermediate values.

    Returns:
        A list of ``(word, label)`` tuples where ``label`` is ``'<O>'`` or
        ``'<ENTITY>'`` as produced by the anonymizer.

    Raises:
        AssertionError: If the anonymized text cannot be aligned with ``sentence``.
    """

    if verbose: print(f"{sentence=}")
    # anonymized_text looks like: ['<PERSON>', 'went', 'to', 'Pitier', 'Hospital', ...]

    anonymized_text = anonymizer.anonymize(text=sentence, analyzer_results=analyzer.analyze(text=sentence, language='en'))
    # Normalise whitespace: split on any whitespace, then re-join with single spaces.
    anonymized_text = anonymized_text.__dict__['text'].split()
    anonymized_text = ' '.join(anonymized_text)
    # Holds a trailing word fragment that must be glued to the next labelled word.
    next_word_to_concate = None

    if verbose: print(f"{anonymized_text=}")
    if verbose: print(f"{anonymized_text.split('<')=}")

    # start_index: position in `sentence` up to which words have been labelled.
    start_index, label = 0, []
    previous_label = None

    # Every chunk after the first starts with "ENTITY>" followed by plain text.
    for i, before_split in enumerate(anonymized_text.split('<')):

        if verbose:
            print(f"\nSubseq_{i}: {before_split=}")

        if i == 0:
            # Text before the first placeholder: all words are unlabelled.
            assert len(before_split) == len(sentence[start_index: len(before_split)])
            start_index = len(before_split)
            label.extend([(s, '<O>') for s in before_split.split()])
        else:
            # after_split[0] is the entity name, after_split[-1] the plain text that follows it.
            after_split = before_split.split(">")
            if verbose:
                print(f" -----> ", after_split)
                print(sentence[start_index:])
                print(sentence[start_index:].find(after_split[-1]))

            # Locate the following plain text in the original sentence.
            start2_index = start_index + sentence[start_index:].find(after_split[-1])
            end2_index = start2_index + len(after_split[-1])

            if verbose:
                print(f"Sanity check: '[{sentence[start2_index: end2_index]}]' VS '[{after_split[-1]}]'")
                print(f"Hidden part: sentence[{start2_index}: {end2_index}] = {sentence[start2_index: end2_index]}")

            assert sentence[start2_index: end2_index] == after_split[-1]

            # If the plain text was found right at start_index, the hidden part runs to the end of the sentence.
            start2_index = start2_index if start2_index != start_index else len(sentence)

            # Words of the original sentence hidden by the placeholder get the entity label.
            for j, anonimyzed_word in enumerate((sentence[start_index: start2_index]).split()):
                if next_word_to_concate != None and j == 0:
                    label.append((f"{next_word_to_concate}{anonimyzed_word}", f"<{after_split[0]}>"))
                    next_word_to_concate = None
                else:
                    label.append((anonimyzed_word, f"<{after_split[0]}>"))

            previous_label = f"<{after_split[0]}>"

            # Plain text does not end with a space and more chunks follow:
            # keep its last word aside so it is prefixed to the next labelled word.
            if len(sentence[start2_index: end2_index]) >= 1 and after_split[-1][-1] != ' ' and i != len(anonymized_text.split('<')) - 1:
                if verbose: print("Is there a space after?", after_split, after_split[-1][-1], i, len(anonymized_text.split('<')))

                for j, anonimyzed_word in enumerate((after_split[-1]).split()[:-1]):
                    label.append((anonimyzed_word, "<O>"))

                next_word_to_concate = (after_split[-1]).split()[-1]

            # Plain text does not start with a space: its first word is appended
            # to the last labelled word and inherits that word's label.
            elif len(sentence[start2_index: end2_index]) >= 1 and after_split[-1][0] != ' ' and i != len(anonymized_text.split('<')) - 1:
                if verbose: print("Is there a space before?", after_split, after_split[-1][0], i, len(anonymized_text.split('<')))

                label[-1] = (f"{label[-1][0]}{after_split[-1].split()[0]}", previous_label)

                for j, anonimyzed_word in enumerate((after_split[-1]).split()[1:]):
                    label.append((anonimyzed_word, "<O>"))

            else:
                for j, anonimyzed_word in enumerate((after_split[-1]).split()):
                    label.append((anonimyzed_word, "<O>"))

            start_index = end2_index

    return label
|
211 |
+
|
212 |
+
|
213 |
+
# Lazily-created tagger shared by every `apply_flair_model` call.
_flair_tagger = None


def _get_flair_tagger():
    """Load the Flair NER tagger on first use and reuse it afterwards."""
    global _flair_tagger
    if _flair_tagger is None:
        _flair_tagger = SequenceTagger.load("flair/ner-english-large")
    return _flair_tagger


def apply_flair_model(original_sentence):
    """Get Flair predictions.

    Args:
        original_sentence: Text to tag.

    Returns:
        A list of ``(word, label)`` tuples, one per whitespace-delimited word
        of ``original_sentence``. ``label`` is the Flair span label when a
        tagged token lines up exactly with the word, otherwise ``'<O>'``.

    Raises:
        AssertionError: If the number of labels differs from the number of words.
    """

    logging.getLogger('flair').setLevel(logging.WARNING)

    # The tagger is loaded once and cached instead of being reloaded per call.
    tagger = _get_flair_tagger()
    flair_sentence = Sentence(original_sentence)
    tagger.predict(flair_sentence)

    word_boundaries = get_word_boundaries(original_sentence)

    # One row per token of every NER span: text, span label, score, start, end.
    ner = [
        [
            i_token.form,
            b_token.get_label().value,
            i_token.get_label().score,
            i_token.start_position,
            i_token.end_position,
        ]
        for b_token in flair_sentence.get_spans("ner")
        for i_token in b_token
    ]

    ner_labels, ner_index = [], 0

    for start, end in word_boundaries:
        word_from_text = original_sentence[start:end]
        if ner_index < len(ner):
            form, label, _, s, e = ner[ner_index]

            # NOTE(review): a tagged token that never lines up with a word
            # boundary is never consumed, so later tagged tokens are not
            # matched either — confirm inputs are pre-tokenised accordingly.
            if (s, e) == (start, end) and word_from_text == form:
                ner_labels.append((word_from_text, label))
                ner_index += 1
            else:
                ner_labels.append((word_from_text, "<O>"))
        else:
            ner_labels.append((word_from_text, "<O>"))

    assert len(ner_labels) == len(word_boundaries)

    return ner_labels
|
248 |
+
|
249 |
+
|
250 |
+
def preprocess_sentences(sentence, verbose=False):
    """Clean ``sentence`` through a fixed sequence of regex substitutions.

    When ``verbose`` is true, the text is printed after each substitution.
    Returns the cleaned text.
    """
    escaped_punctuation = re.escape(punctuation_list)
    edge_punctuation = r'(?<!\w)[{}]|[{}](?!\w)'.format(escaped_punctuation, escaped_punctuation)

    substitutions = (
        (r'\n+', ' '),                # newlines become a single space
        (' +', ' '),                  # collapse runs of spaces
        (r"'s\b", " s"),              # possessive apostrophe becomes a space
        (r'\s([,.!?;:])', r'\1'),     # drop whitespace before punctuation
        (edge_punctuation, ''),       # strip punctuation not enclosed by word characters
    )

    for regex, replacement in substitutions:
        sentence = re.sub(regex, replacement, sentence)
        if verbose:
            print(sentence)

    return sentence
|