jfrery-zama commited on
Commit
646bd9e
·
1 Parent(s): b0042eb

initial commit

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__/
README copy.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Encrypted Anonymization Using Fully Homomorphic Encryption
3
+ emoji: 🕵️‍♂️ 🔒
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 3.40.0
8
+ app_file: app.py
9
+ pinned: true
10
+ tags:
11
+ - FHE
12
+ - PPML
13
+ - privacy
14
+ - privacy preserving machine learning
15
+ - data anonymization
16
+ - homomorphic encryption
17
+ - security
18
+ python_version: 3.10.11
19
+ ---
20
+
21
+ # Data Anonymization using FHE
22
+
23
+ ## Run the application locally
24
+
25
+ ### Install the dependencies
26
+
27
+ First, create a virtual env and activate it:
28
+
29
+ ```bash
30
+ python3 -m venv .venv
31
+ source .venv/bin/activate
32
+ ```
33
+
34
+ Then, install the required packages:
35
+
36
+ ```bash
37
+ pip3 install pip --upgrade
38
+ pip3 install -U pip wheel setuptools --ignore-installed
39
+ pip3 install -r requirements.txt --ignore-installed
40
+ ```
41
+
42
+ The above steps should only be done once.
43
+
44
+ ## Run the app
45
+
46
+ In a terminal, run:
47
+
48
+ ```bash
49
+ source .venv/bin/activate
50
+ python3 app.py
51
+ ```
52
+
53
+ ## Interact with the application
54
+
55
+ Open the given URL link (search for a line like `Running on local URL: http://127.0.0.1:8888/`).
app.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """A Gradio app for anonymizing text data using FHE."""
2
+
3
+ import gradio as gr
4
+ import re
5
+ from fhe_anonymizer import FHEAnonymizer
6
+ import pandas as pd
7
+
8
+
9
+ anonymizer = FHEAnonymizer()
10
+
11
+
12
def deidentify_text(input_text):
    """Anonymize ``input_text`` and tabulate the words that were flagged.

    Args:
        input_text: Raw text taken from the input textbox.

    Returns:
        A ``(anonymized_text, identified_df)`` pair, where ``identified_df``
        is a single-column ("Identified Words") DataFrame holding one row per
        flagged word, or no rows when nothing was flagged.
    """
    anonymized_text, identified_words = anonymizer(input_text)
    columns = ["Identified Words"]

    # Nothing flagged: hand back an empty table that still carries the header.
    if not identified_words:
        return anonymized_text, pd.DataFrame(columns=columns)

    return anonymized_text, pd.DataFrame(identified_words, columns=columns)
20
+
21
+
22
+ # Default demo text from the file
23
+ with open("demo_text.txt", "r") as file:
24
+ default_demo_text = file.read()
25
+
26
+ demo = gr.Blocks()
27
+
28
+ with demo:
29
+ gr.Markdown(
30
+ """
31
+ <p align="center">
32
+ <img width=200 src="file/images/logos/zama.jpg">
33
+ </p>
34
+ <h1 style="text-align: center;">Encrypted Anonymization Using Fully Homomorphic Encryption</h1>
35
+ <p align="center">
36
+ <a href="https://github.com/zama-ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/github.png">Concrete-ML</a>
37
+
38
+ <a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/documentation.png">Documentation</a>
39
+
40
+ <a href="https://zama.ai/community"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/community.png">Community</a>
41
+
42
+ <a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/x.png">@zama_fhe</a>
43
+ </p>
44
+ """
45
+ )
46
+
47
+ with gr.Accordion("What is Encrypted Anonymization?", open=False):
48
+ gr.Markdown(
49
+ """
50
+ Encrypted Anonymization leverages Fully Homomorphic Encryption (FHE) to protect sensitive information during data processing. This approach allows for the anonymization of text data, such as personal identifiers, while ensuring that the data remains encrypted throughout the entire process. It enables organizations to utilize sensitive data for analytics and machine learning without compromising individual privacy or security.
51
+ """
52
+ )
53
+
54
+ with gr.Accordion("Why is privacy important in data processing?", open=False):
55
+ gr.Markdown(
56
+ """
57
+ Privacy in data processing is critical to protect individuals' personal information from unauthorized access and potential misuse. With the increasing amount of personal data being collected and analyzed, the risks associated with data breaches and identity theft have also risen. By implementing privacy-preserving techniques, such as encrypted anonymization, organizations can safeguard sensitive information, build trust with their customers, and comply with stringent data protection regulations.
58
+ """
59
+ )
60
+
61
+ with gr.Accordion(
62
+ "How does Fully Homomorphic Encryption enhance data privacy?", open=False
63
+ ):
64
+ gr.Markdown(
65
+ """
66
+ Fully Homomorphic Encryption (FHE) enhances data privacy by enabling computations on encrypted data without needing to decrypt it first. This revolutionary technology ensures that sensitive data can be processed and analyzed securely, without exposing it to potential threats. FHE is a game-changer for privacy-preserving computations, allowing for the secure analysis of encrypted data, which is particularly beneficial in sectors like finance, healthcare, and beyond.
67
+ """
68
+ )
69
+
70
+ gr.Markdown(
71
+ """
72
+ <p align="center">
73
+ <img src="file/images/banner.png">
74
+ </p>
75
+ """
76
+ )
77
+
78
+ with gr.Row():
79
+ input_text = gr.Textbox(value=default_demo_text, lines=13, placeholder="Input text here...", label="Input")
80
+
81
+ anonymized_text_output = gr.Textbox(label="Anonymized Text", lines=13)
82
+
83
+ identified_words_output = gr.Dataframe(label="Identified Words")
84
+
85
+ submit_button = gr.Button("Anonymize")
86
+
87
+ submit_button.click(
88
+ deidentify_text,
89
+ inputs=[input_text],
90
+ outputs=[anonymized_text_output, identified_words_output],
91
+ )
92
+
93
+
94
+ # Launch the app
95
+ demo.launch(share=False)
cml_xgboost.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:431175c3c2bd7591ebfffa3ea45b1096dda5ba7588291252994f9be31db35534
3
+ size 6625266
demo_text.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Hello, my name is David Johnson and I live in Maine.
2
+ My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
3
+
4
+ On September 18 I visited microsoft.com and sent an email to [email protected], from the IP 192.168.0.1.
5
+
6
+ My passport: 191280342 and my phone number: (212) 555-1234.
7
+
8
+ This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
9
+
10
+ Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
embedded_model.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28fcf483356bf2bef29b8220b84803acf9518f19fbc9342e76cac06b30803f28
3
+ size 73056
embedded_model.model.wv.vectors_ngrams.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faf08ed9c3bc29cf71c16f5d2b311f3bfb730a92f12c2e52d742bc6b59bf9e5f
3
+ size 800000128
fhe_anonymizer.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gensim
2
+ import re
3
+ from concrete.ml.deployment import FHEModelClient, FHEModelServer
4
+ from pathlib import Path
5
+ from concrete.ml.common.serialization.loaders import load
6
+
7
+ base_dir = Path(__file__).parent
8
+
9
class FHEAnonymizer:
    """Word-level text anonymizer built on a FastText embedder and an NER classifier.

    Calling an instance on a string returns the text with every word the
    classifier flags replaced by ``<REMOVED>``, plus the list of flagged words.
    """

    def __init__(self, punctuation_list=".,!?:;"):
        """Load the models and set up the FHE client/server pair.

        Args:
            punctuation_list: Characters stripped from word edges during
                preprocessing.
        """
        self.punctuation_list = punctuation_list

        # Word embedding model stored next to this file.
        embeddings_path = base_dir / "embedded_model.model"
        self.embeddings_model = gensim.models.FastText.load(str(embeddings_path))

        # Serialized classifier used for the per-word prediction.
        with open(base_dir / "cml_xgboost.model", "r") as model_file:
            self.fhe_ner_detection = load(file=model_file)

        # Deployment artefacts shared by the FHE client and server.
        deployment_dir = (base_dir / "deployment").resolve()
        self.client = FHEModelClient(deployment_dir)
        self.server = FHEModelServer(deployment_dir)
        self.client.generate_private_and_evaluation_keys()
        self.evaluation_key = self.client.get_serialized_evaluation_keys()

    def fhe_inference(self, x):
        """Run one encrypted round trip: encrypt ``x``, evaluate it, decrypt the result."""
        encrypted_input = self.client.quantize_encrypt_serialize(x)
        encrypted_output = self.server.run(encrypted_input, self.evaluation_key)
        return self.client.deserialize_decrypt_dequantize(encrypted_output)

    def __call__(self, text: str):
        """Anonymize ``text`` word by word.

        Args:
            text: Raw input text.

        Returns:
            ``(modified_text, identified_words)``: the preprocessed text with
            flagged words replaced by ``<REMOVED>`` (words joined by single
            spaces), and the flagged words in order of appearance.
        """
        identified_words = []
        output_tokens = []

        for token in self.preprocess_sentences(text).split():
            # ``[None]`` adds a leading axis so a single sample is passed.
            features = self.embeddings_model.wv[token][None]
            # The classifier's own ``predict`` is used here; the encrypted
            # path would be ``self.fhe_inference(features).argmax(1)[0]``.
            prediction = self.fhe_ner_detection.predict(features)

            if prediction == 1:
                identified_words.append(token)
                output_tokens.append("<REMOVED>")
                continue
            output_tokens.append(token)

        return " ".join(output_tokens), identified_words

    def preprocess_sentences(self, sentence, verbose=False):
        """Normalize whitespace and strip edge punctuation from ``sentence``.

        Args:
            sentence: Text to clean.
            verbose: When true, print the intermediate text after every step.

        Returns:
            The cleaned text.
        """
        escaped = re.escape(self.punctuation_list)
        edge_punctuation = r'(?<!\w)[{}]|[{}](?!\w)'.format(escaped, escaped)

        # (pattern, replacement) pairs, applied strictly in this order.
        steps = (
            (r'\n+', ' '),                     # newline runs -> one space
            (' +', ' '),                       # collapse repeated spaces
            (r"'s\b", " s"),                   # split possessive 's
            (r'\s([,.!?;:])', r'\1'),          # drop space before punctuation
            (edge_punctuation, ''),            # strip punctuation at word edges
            (r'\s([,.!?;:])', r'\1'),          # drop space before punctuation again
        )

        for pattern, replacement in steps:
            sentence = re.sub(pattern, replacement, sentence)
            if verbose:
                print(sentence)

        return sentence
images/logos/community.png ADDED
images/logos/documentation.png ADDED
images/logos/github.png ADDED
images/logos/x.png ADDED
images/logos/zama.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ concrete-ml==1.5.0rc0
2
+ gensim==4.3.2
3
+ gradio==3.40.1
utils_demo.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+ import re
4
+ import string
5
+ from flair.data import Sentence
6
+ from flair.models import SequenceTagger
7
+ from presidio_analyzer import AnalyzerEngine
8
+ from presidio_anonymizer import AnonymizerEngine
9
+
10
+
11
+
12
+ entity_label_to_code_map = {'<PERSON>': 0,
13
+ '<O>': 1,
14
+ '<MISC>-<NRP>': 2,
15
+ '<NUMBER>': 3,
16
+ '<PER>-<LOCATION>': 4,
17
+ '<LOC>': 5,
18
+ '<MISC>': 6, # Miscellaneous: doesn't fall into the more common categories of PERSON, LOCATION, ORGANIZATION,
19
+ '<DATE_TIME>': 7,
20
+ '<LOCATION>': 8,
21
+ '<PRONOUNS>': 9,
22
+ '<IN_PAN>': 10,
23
+ '<MISC>-<DATE_TIME>': 11,
24
+ '<ORG>': 12,
25
+ '<MISC>-<IN_PAN>': 13,
26
+ '<MISC>-<LOCATION>': 14,
27
+ '<PER>': 15,
28
+ '<MISC>-<PERSON>': 16,
29
+ '<LOC>-<PERSON>': 17,
30
+ '<PHONE_NUMBER>': 18,
31
+ '<LOC>-<DATE_TIME>': 19,
32
+ '<LOC>-<NRP>': 20,
33
+ '<NRP>': 21,
34
+ '<ORG>-<PERSON>': 22,
35
+ '<PER>-<NRP>': 23,
36
+ '<ORG>-<LOCATION>': 24,
37
+ '<PER>-<DATE_TIME>': 25,
38
+ '<PER>-<IN_PAN>': 26,
39
+ '<ORG>-<IN_PAN>': 27,
40
+ '<ORG>-<NRP>': 28,
41
+ '<US_DRIVER_LICENSE>': 29,
42
+ '<KEY <EMAIL_ADDRESS>': 30,
43
+ '<US_BANK_NUMBER>': 33,
44
+ '<IN_AADHAAR>': 34,
45
+ '<CRYPTO>': 35,
46
+ '<IP_ADDRESS>': 36,
47
+ '<EMAIL_ADDRESS>': 35,
48
+ '<US_PASSPORT>': 36,
49
+ '<US_SSN>': 37,
50
+ '<MISC>-<URL>': 38}
51
+
52
+
53
# Words treated as pronouns / forms of address when fusing NER labels.
# Every entry is followed by an explicit comma: adjacent string literals are
# silently concatenated by Python, which previously merged 'ourselves' and
# 'Me' into the single bogus entry 'ourselvesMe'.
pronoun_list = [
    'I', 'i', 'me', 'my', 'mine', 'myself', 'you', 'your', 'yours', "I'm", "I am",
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "i'm",
    'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them',
    'their', 'theirs', 'themselves', 'we', 'us', 'our', 'ours', 'ourselves',
    'Me', 'My', 'Mine', 'Myself', 'You', 'Your', 'Yours', 'Yourself', 'Yourselves',
    'He', 'Him', 'His', 'Himself', 'She', 'Her', 'Hers', 'Herself', 'It', 'Its', 'Itself',
    'They', 'Them', 'Their', 'Theirs', 'Themselves', 'We', 'Us', 'Our', 'Ours', 'Ourselves',
    "Lady", "Madam", "Mr.", "Mister", "Sir", "Miss", "Ms.", "Mrs.", "Mr",
]
63
+
64
+
65
# Numeric code for each privacy category.
privacy_category_codes = {
    '<PRIVATE>': 1,
    '<NON_PRIVATE>': 2,
    '<OTHER>': 3,
}

# Every ASCII punctuation character except '%' and '$', as one string.
punctuation_list = ''.join(
    symbol for symbol in string.punctuation if symbol not in ('%', '$')
)
71
+
72
def get_word_boundaries(sentence):
    """Return the ``(start, end)`` offsets of each whitespace-delimited word in ``sentence``."""
    boundaries = []
    for word_match in re.finditer(r'[^\s]+', sentence):
        boundaries.append(word_match.span())
    return boundaries
75
+
76
+
77
def fuse_ner_labels(flair_ner, presidio_ner, text_type="<PRIVATE>"):
    """Merge the per-word NER labels from 'Flair' and 'Presidio' for a given text.

    Custom cases and predefined rules (pronouns, "I am" / "I'm", numbers) are
    taken into account for the entity classification.

    Args:
        flair_ner: Sequence of ``(word, label)`` pairs from Flair.
        presidio_ner: Sequence of ``(word, label)`` pairs from Presidio; must
            have the same length and the same words as ``flair_ner``.
        text_type: Privacy category given to every word whose fused label is
            not ``'<O>'``; must be a key of ``privacy_category_codes``.

    Returns:
        One ``[word, fused_label, privacy_category, privacy_code,
        binary_label, multi_label]`` list per word.

    Raises:
        AssertionError: If the two inputs differ in length or in any word.

    Note:
        A fused label that is not yet in ``entity_label_to_code_map`` is
        added to that module-level dict with the next free code.
    """
    merged_ner = []

    # Sanity check
    assert len(flair_ner) == len(presidio_ner)

    # (w1, n1) comes from Presidio, (w2, n2) from Flair.
    for i, ((w1, n1), (w2, n2)) in enumerate(zip(presidio_ner, flair_ner)):

        assert w1 == w2

        # NOTE(review): the word is lower-cased before the lookup, so the
        # capitalised entries of `pronoun_list` ("Mr.", "Sir", ...) can never
        # match here -- confirm whether that is intended.
        if w1.lower() in pronoun_list:
            common_ner = "<PRONOUNS>"
        elif n1 == "<O>" and n2 == "<O>":
            # Neither model tagged the word: apply the hand-written rules.
            if w1.lower() in ["am", "'m"] and (i - 1) >= 0 and presidio_ner[i - 1][0].lower() == 'i':
                common_ner = "<PRONOUNS>"
            elif bool(re.match(r'(?<!\S)[\$€]?(?:\d{1,3}(?:[ ,.]\d{3})*|\d+)(?:\.\d+)?%?', w1)):
                common_ner = "<NUMBER>"
            else:
                common_ner = '<O>'
        elif n1 in n2:
            common_ner = n2
        elif n1 == '<O>' and n2 != '<O>':
            common_ner = n2
        elif n2 == '<O>' and n1 != '<O>':
            common_ner = f"<{n1}>"
        else:
            common_ner = f"<{n1}>-{n2}"

        # A plain string comparison cannot raise, so it needs no try/except.
        common_binary_label = 0 if common_ner == "<O>" else 1

        if common_ner not in entity_label_to_code_map:
            # Unknown fused label: register it under the next free code.
            common_multi_label = len(entity_label_to_code_map)
            print("NOT in KEY", common_ner)
            entity_label_to_code_map[common_ner] = common_multi_label
        else:
            common_multi_label = entity_label_to_code_map[common_ner]

        is_private = text_type if common_ner != '<O>' else '<OTHER>'

        merged_ner.append([w1, common_ner, is_private, privacy_category_codes[is_private], common_binary_label, common_multi_label])

    return merged_ner
130
+
131
+ analyzer = AnalyzerEngine()
132
+ anonymizer = AnonymizerEngine()
133
+
134
+
135
+ def apply_presidio_model(sentence, verbose=True):
136
+ """Get Presidio predictions."""
137
+
138
+ if verbose: print(f"{sentence=}")
139
+ # anonymized_text looks like: ['<PERSON>', 'went', 'to', 'Pitier', 'Hospital', ...]
140
+
141
+ anonymized_text = anonymizer.anonymize(text=sentence, analyzer_results=analyzer.analyze(text=sentence, language='en'))
142
+ anonymized_text = anonymized_text.__dict__['text'].split()
143
+ anonymized_text = ' '.join(anonymized_text)
144
+ next_word_to_concate = None
145
+
146
+ if verbose: print(f"{anonymized_text=}")
147
+ if verbose: print(f"{anonymized_text.split('<')=}")
148
+
149
+ start_index, label = 0, []
150
+ previous_label = None
151
+
152
+ for i, before_split in enumerate(anonymized_text.split('<')):
153
+
154
+ if verbose:
155
+ print(f"\nSubseq_{i}: {before_split=}")
156
+
157
+ if i == 0:
158
+ assert len(before_split) == len(sentence[start_index: len(before_split)])
159
+ start_index = len(before_split)
160
+ label.extend([(s, '<O>') for s in before_split.split()])
161
+ else:
162
+ after_split = before_split.split(">")
163
+ if verbose:
164
+ print(f" -----> ", after_split)
165
+ print(sentence[start_index:])
166
+ print(sentence[start_index:].find(after_split[-1]))
167
+
168
+ start2_index = start_index + sentence[start_index:].find(after_split[-1])
169
+ end2_index = start2_index + len(after_split[-1])
170
+
171
+ if verbose:
172
+ print(f"Sanity check: '[{sentence[start2_index: end2_index]}]' VS '[{after_split[-1]}]'")
173
+ print(f"Hidden part: sentence[{start2_index}: {end2_index}] = {sentence[start2_index: end2_index]}")
174
+
175
+ assert sentence[start2_index: end2_index] == after_split[-1]
176
+
177
+ start2_index = start2_index if start2_index != start_index else len(sentence)
178
+
179
+ for j, anonimyzed_word in enumerate((sentence[start_index: start2_index]).split()):
180
+ if next_word_to_concate != None and j == 0:
181
+ label.append((f"{next_word_to_concate}{anonimyzed_word}", f"<{after_split[0]}>"))
182
+ next_word_to_concate = None
183
+ else:
184
+ label.append((anonimyzed_word, f"<{after_split[0]}>"))
185
+
186
+ previous_label = f"<{after_split[0]}>"
187
+
188
+ if len(sentence[start2_index: end2_index]) >= 1 and after_split[-1][-1] != ' ' and i != len(anonymized_text.split('<')) - 1:
189
+ if verbose: print("Is there a space after?", after_split, after_split[-1][-1], i, len(anonymized_text.split('<')))
190
+
191
+ for j, anonimyzed_word in enumerate((after_split[-1]).split()[:-1]):
192
+ label.append((anonimyzed_word, "<O>"))
193
+
194
+ next_word_to_concate = (after_split[-1]).split()[-1]
195
+
196
+ elif len(sentence[start2_index: end2_index]) >= 1 and after_split[-1][0] != ' ' and i != len(anonymized_text.split('<')) - 1:
197
+ if verbose: print("Is there a space before?", after_split, after_split[-1][0], i, len(anonymized_text.split('<')))
198
+
199
+ label[-1] = (f"{label[-1][0]}{after_split[-1].split()[0]}", previous_label)
200
+
201
+ for j, anonimyzed_word in enumerate((after_split[-1]).split()[1:]):
202
+ label.append((anonimyzed_word, "<O>"))
203
+
204
+ else:
205
+ for j, anonimyzed_word in enumerate((after_split[-1]).split()):
206
+ label.append((anonimyzed_word, "<O>"))
207
+
208
+ start_index = end2_index
209
+
210
+ return label
211
+
212
+
213
# Lazily-created Flair tagger, shared by every call to `apply_flair_model`.
_FLAIR_TAGGER = None


def _get_flair_tagger():
    """Load the Flair NER tagger on first use and reuse it afterwards."""
    global _FLAIR_TAGGER
    if _FLAIR_TAGGER is None:
        _FLAIR_TAGGER = SequenceTagger.load("flair/ner-english-large")
    return _FLAIR_TAGGER


def apply_flair_model(original_sentence):
    """Get Flair predictions.

    Args:
        original_sentence: Text to tag.

    Returns:
        One ``(word, label)`` pair per whitespace-delimited word of
        ``original_sentence``. The label is the Flair span label when a Flair
        token covers exactly that word, and ``"<O>"`` otherwise.

    Raises:
        AssertionError: If the number of labels differs from the number of words.
    """
    logging.getLogger('flair').setLevel(logging.WARNING)

    # Reuse one tagger instead of reloading the model for every sentence.
    tagger = _get_flair_tagger()
    flair_sentence = Sentence(original_sentence)
    tagger.predict(flair_sentence)

    word_boundaries = get_word_boundaries(original_sentence)

    # One (form, span label, start, end) entry per token of each tagged span.
    ner = [
        (i_token.form, b_token.get_label().value, i_token.start_position, i_token.end_position)
        for b_token in flair_sentence.get_spans("ner")
        for i_token in b_token
    ]

    ner_labels, ner_index = [], 0

    for start, end in word_boundaries:
        word_from_text = original_sentence[start:end]

        # Skip Flair tokens that end before this word starts. Without this, a
        # token that never lines up with a whitespace word would block every
        # following entity from being matched.
        while ner_index < len(ner) and ner[ner_index][3] <= start:
            ner_index += 1

        if ner_index < len(ner):
            form, label, s, e = ner[ner_index]

            if (s, e) == (start, end) and word_from_text == form:
                ner_labels.append((word_from_text, label))
                ner_index += 1
            else:
                ner_labels.append((word_from_text, "<O>"))
        else:
            ner_labels.append((word_from_text, "<O>"))

    assert len(ner_labels) == len(word_boundaries)

    return ner_labels
248
+
249
+
250
def preprocess_sentences(sentence, verbose=False):
    """Normalize whitespace and strip edge punctuation from ``sentence``.

    Args:
        sentence: Text to clean.
        verbose: When true, print the intermediate text after every step.

    Returns:
        The cleaned text.
    """
    escaped = re.escape(punctuation_list)

    # (pattern, replacement) pairs, applied strictly in this order.
    steps = (
        # Removing extra newlines
        (r'\n+', ' '),
        # Collapsing multiple spaces
        (' +', ' '),
        # Handling apostrophes in possessives
        (r"'s\b", " s"),
        # Removing spaces before punctuation
        (r'\s([,.!?;:])', r'\1'),
        # Removing leading or trailing punctuation around words
        (r'(?<!\w)[{}]|[{}](?!\w)'.format(escaped, escaped), ''),
    )

    for pattern, replacement in steps:
        sentence = re.sub(pattern, replacement, sentence)
        if verbose:
            print(sentence)

    return sentence