Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
cordwainersmith
committed on
Commit
•
1a74358
1
Parent(s):
676f4c4
v3
Browse files
app.py
CHANGED
@@ -1,15 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
import torch
|
3 |
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
4 |
-
import time
|
5 |
-
import json
|
6 |
-
import pandas as pd
|
7 |
-
from datetime import datetime
|
8 |
-
import os
|
9 |
-
from typing import List, Dict, Tuple
|
10 |
-
import re
|
11 |
|
12 |
|
|
|
13 |
MODEL_NAME = "CordwainerSmith/GolemPII-v1"
|
14 |
|
15 |
ENTITY_COLORS = {
|
@@ -27,13 +29,36 @@ ENTITY_COLORS = {
|
|
27 |
"CC_PROVIDER": "#B3FFB3",
|
28 |
}
|
29 |
|
|
|
30 |
EXAMPLE_SENTENCES = [
|
31 |
"ืฉื ืืื: ืชืืื ืืจืืืื ืืกืคืจ ืชืขืืืช ืืืืช: 61453324-8 ืชืืจืื ืืืื: 15/09/1983 ืืชืืืช: ืืจืืืืืจืื 22 ืคืชื ืชืงืืื ืืืงืื 2731711 ืืืืืื: [email protected] ืืืคืื: 054-8884771 ืืคืืืฉื ืื ื ืืื ื ืคืชืจืื ืืช ืืื ืืืืืืื ืืืฉื ืืื ืืฉืืคืืจ ืชืืืืื ืขืืืื. ืืืฉืชืชืฃ ืืชืืงืฉ ืืืฆืื ืืฆืืช ืื ืืฉื ืืคืืืฉื ืืืื ืืฉืจ ืฉืืื ื 5326-1003-5299-5478 ืืกืืจืงืืจื ืขื ืืืจืืช ืงืืข ื 11-77-352300",
|
32 |
]
|
33 |
|
|
|
34 |
MODEL_DETAILS = {
|
35 |
"name": "GolemPII-v1: Hebrew PII Detection Model",
|
36 |
-
"description":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
"base_model": "xlm-roberta-base",
|
38 |
"training_data": "Custom Hebrew PII dataset",
|
39 |
"detected_pii_entities": [
|
@@ -54,12 +79,20 @@ MODEL_DETAILS = {
|
|
54 |
|
55 |
|
56 |
class PIIMaskingModel:
|
|
|
|
|
|
|
|
|
57 |
def __init__(self, model_name: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
self.model_name = model_name
|
59 |
hf_token = st.secrets["hf_token"]
|
60 |
-
self.tokenizer = AutoTokenizer.from_pretrained(
|
61 |
-
model_name, token=hf_token
|
62 |
-
)
|
63 |
self.model = AutoModelForTokenClassification.from_pretrained(
|
64 |
model_name, token=hf_token
|
65 |
)
|
@@ -70,6 +103,23 @@ class PIIMaskingModel:
|
|
70 |
def process_text(
|
71 |
self, text: str
|
72 |
) -> Tuple[str, float, str, List[str], List[str], List[Dict]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
start_time = time.time()
|
74 |
|
75 |
tokenized_inputs = self.tokenizer(
|
@@ -119,6 +169,21 @@ class PIIMaskingModel:
|
|
119 |
tokens: List[str],
|
120 |
offset_mapping: List[Tuple[int, int]],
|
121 |
) -> Tuple[int, str, int]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
current_entity = labels[i][2:] if labels[i].startswith("B-") else labels[i][2:]
|
123 |
j = i + 1
|
124 |
last_valid_end = offset_mapping[i][1] if offset_mapping[i] else None
|
@@ -130,7 +195,7 @@ class PIIMaskingModel:
|
|
130 |
|
131 |
next_label = labels[j]
|
132 |
|
133 |
-
if next_label.startswith("B-") and tokens[j].startswith("
|
134 |
break
|
135 |
|
136 |
if next_label.startswith("I-") and next_label[2:] != current_entity:
|
@@ -139,7 +204,7 @@ class PIIMaskingModel:
|
|
139 |
if next_label.startswith("I-") and next_label[2:] == current_entity:
|
140 |
last_valid_end = offset_mapping[j][1]
|
141 |
j += 1
|
142 |
-
elif next_label.startswith("B-") and not tokens[j].startswith("
|
143 |
last_valid_end = offset_mapping[j][1]
|
144 |
j += 1
|
145 |
else:
|
@@ -154,6 +219,22 @@ class PIIMaskingModel:
|
|
154 |
original_text: str,
|
155 |
offset_mapping: List[Tuple[int, int]],
|
156 |
) -> Tuple[str, str, List[Dict]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
privacy_masks = []
|
158 |
current_pos = 0
|
159 |
masked_text_parts = []
|
@@ -219,6 +300,15 @@ class PIIMaskingModel:
|
|
219 |
return ("".join(masked_text_parts), "".join(colored_text_parts), privacy_masks)
|
220 |
|
221 |
def _get_mask_for_entity(self, entity_type: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
222 |
return {
|
223 |
"PHONE_NUM": "[ืืืคืื]",
|
224 |
"ID_NUM": "[ืช.ื]",
|
@@ -237,6 +327,9 @@ class PIIMaskingModel:
|
|
237 |
|
238 |
|
239 |
def main():
|
|
|
|
|
|
|
240 |
st.set_page_config(layout="wide")
|
241 |
st.title("๐ฟ GolemPII: Hebrew PII Masking Application ๐ฟ")
|
242 |
|
@@ -245,17 +338,17 @@ def main():
|
|
245 |
<style>
|
246 |
.rtl { direction: rtl; text-align: right; }
|
247 |
.entity-legend { padding: 5px; margin: 2px; border-radius: 3px; display: inline-block; }
|
248 |
-
.masked-text {
|
249 |
-
direction: rtl;
|
250 |
-
text-align: right;
|
251 |
-
line-height: 2;
|
252 |
-
padding: 10px;
|
253 |
-
background-color: #f6f8fa;
|
254 |
-
border-radius: 5px;
|
255 |
-
color: black;
|
256 |
white-space: pre-wrap;
|
257 |
}
|
258 |
-
.main h3 {
|
259 |
margin-bottom: 10px;
|
260 |
}
|
261 |
textarea {
|
|
|
1 |
+
"""
|
2 |
+
This module demonstrates a Streamlit application for masking Personally Identifiable
|
3 |
+
Information (PII) in Hebrew text using the GolemPII-v1 model.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import time
|
7 |
+
from typing import List, Dict, Tuple
|
8 |
+
|
9 |
import streamlit as st
|
10 |
import torch
|
11 |
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
|
14 |
+
# Constants for model name and entity colors
|
15 |
MODEL_NAME = "CordwainerSmith/GolemPII-v1"
|
16 |
|
17 |
ENTITY_COLORS = {
|
|
|
29 |
"CC_PROVIDER": "#B3FFB3",
|
30 |
}
|
31 |
|
32 |
+
# Example sentences for demonstration
|
33 |
EXAMPLE_SENTENCES = [
|
34 |
"ืฉื ืืื: ืชืืื ืืจืืืื ืืกืคืจ ืชืขืืืช ืืืืช: 61453324-8 ืชืืจืื ืืืื: 15/09/1983 ืืชืืืช: ืืจืืืืืจืื 22 ืคืชื ืชืงืืื ืืืงืื 2731711 ืืืืืื: [email protected] ืืืคืื: 054-8884771 ืืคืืืฉื ืื ื ืืื ื ืคืชืจืื ืืช ืืื ืืืืืืื ืืืฉื ืืื ืืฉืืคืืจ ืชืืืืื ืขืืืื. ืืืฉืชืชืฃ ืืชืืงืฉ ืืืฆืื ืืฆืืช ืื ืืฉื ืืคืืืฉื ืืืื ืืฉืจ ืฉืืื ื 5326-1003-5299-5478 ืืกืืจืงืืจื ืขื ืืืจืืช ืงืืข ื 11-77-352300",
|
35 |
]
|
36 |
|
37 |
+
# Model details for display in the sidebar
|
38 |
MODEL_DETAILS = {
|
39 |
"name": "GolemPII-v1: Hebrew PII Detection Model",
|
40 |
+
"description": """
|
41 |
+
The <a href="https://huggingface.co/CordwainerSmith/GolemPII-v1" target="_blank">GolemPII model</a>
|
42 |
+
was specifically designed to identify and categorize various types of personally
|
43 |
+
identifiable information (PII) present in Hebrew text. Its core intended usage
|
44 |
+
revolves around enhancing privacy protection and facilitating the process of data
|
45 |
+
anonymization. This makes it a good candidate for applications and systems that
|
46 |
+
handle sensitive data, such as legal documents, medical records, or any text data
|
47 |
+
containing PII, where the automatic redaction or removal of such information is
|
48 |
+
essential for ensuring compliance with data privacy regulations and safeguarding
|
49 |
+
individuals' personal information. The model can be deployed on-premise with a
|
50 |
+
relatively small hardware footprint, making it suitable for organizations with
|
51 |
+
limited computing resources or those prioritizing local data processing.
|
52 |
+
|
53 |
+
The model was trained on the <a href="https://huggingface.co/datasets/CordwainerSmith/GolemGuard"
|
54 |
+
target="_blank">GolemGuard</a> dataset, a Hebrew language dataset comprising over
|
55 |
+
115,000 examples of PII entities and containing both real and synthetically
|
56 |
+
generated text examples. This data represents various document types and
|
57 |
+
communication formats commonly found in Israeli professional and administrative
|
58 |
+
contexts. GolemGuard covers a wide range of document types and encompasses a
|
59 |
+
diverse array of PII entities, making it ideal for training and evaluating PII
|
60 |
+
detection models.
|
61 |
+
""",
|
62 |
"base_model": "xlm-roberta-base",
|
63 |
"training_data": "Custom Hebrew PII dataset",
|
64 |
"detected_pii_entities": [
|
|
|
79 |
|
80 |
|
81 |
class PIIMaskingModel:
|
82 |
+
"""
|
83 |
+
A class for masking PII in Hebrew text using the GolemPII-v1 model.
|
84 |
+
"""
|
85 |
+
|
86 |
def __init__(self, model_name: str):
|
87 |
+
"""
|
88 |
+
Initializes the PIIMaskingModel with the specified model name.
|
89 |
+
|
90 |
+
Args:
|
91 |
+
model_name: The name of the pre-trained model to use.
|
92 |
+
"""
|
93 |
self.model_name = model_name
|
94 |
hf_token = st.secrets["hf_token"]
|
95 |
+
self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
|
|
|
|
|
96 |
self.model = AutoModelForTokenClassification.from_pretrained(
|
97 |
model_name, token=hf_token
|
98 |
)
|
|
|
103 |
def process_text(
|
104 |
self, text: str
|
105 |
) -> Tuple[str, float, str, List[str], List[str], List[Dict]]:
|
106 |
+
"""
|
107 |
+
Processes the input text and returns the masked text, processing time,
|
108 |
+
colored text, tokens, predicted labels, and privacy masks.
|
109 |
+
|
110 |
+
Args:
|
111 |
+
text: The input text to process.
|
112 |
+
|
113 |
+
Returns:
|
114 |
+
A tuple containing:
|
115 |
+
- masked_text: The text with PII masked.
|
116 |
+
- processing_time: The time taken to process the text.
|
117 |
+
- colored_text: The text with PII highlighted with colors.
|
118 |
+
- tokens: The tokens of the input text.
|
119 |
+
- predicted_labels: The predicted labels for each token.
|
120 |
+
- privacy_masks: A list of dictionaries containing information about
|
121 |
+
the masked PII entities.
|
122 |
+
"""
|
123 |
start_time = time.time()
|
124 |
|
125 |
tokenized_inputs = self.tokenizer(
|
|
|
169 |
tokens: List[str],
|
170 |
offset_mapping: List[Tuple[int, int]],
|
171 |
) -> Tuple[int, str, int]:
|
172 |
+
"""
|
173 |
+
Finds the span of an entity starting at the given index.
|
174 |
+
|
175 |
+
Args:
|
176 |
+
i: The starting index of the entity.
|
177 |
+
labels: The list of labels for each token.
|
178 |
+
tokens: The list of tokens.
|
179 |
+
offset_mapping: The offset mapping for each token.
|
180 |
+
|
181 |
+
Returns:
|
182 |
+
A tuple containing:
|
183 |
+
- The index of the next token after the entity.
|
184 |
+
- The type of the entity.
|
185 |
+
- The end character offset of the entity.
|
186 |
+
"""
|
187 |
current_entity = labels[i][2:] if labels[i].startswith("B-") else labels[i][2:]
|
188 |
j = i + 1
|
189 |
last_valid_end = offset_mapping[i][1] if offset_mapping[i] else None
|
|
|
195 |
|
196 |
next_label = labels[j]
|
197 |
|
198 |
+
if next_label.startswith("B-") and tokens[j].startswith(" "):
|
199 |
break
|
200 |
|
201 |
if next_label.startswith("I-") and next_label[2:] != current_entity:
|
|
|
204 |
if next_label.startswith("I-") and next_label[2:] == current_entity:
|
205 |
last_valid_end = offset_mapping[j][1]
|
206 |
j += 1
|
207 |
+
elif next_label.startswith("B-") and not tokens[j].startswith(" "):
|
208 |
last_valid_end = offset_mapping[j][1]
|
209 |
j += 1
|
210 |
else:
|
|
|
219 |
original_text: str,
|
220 |
offset_mapping: List[Tuple[int, int]],
|
221 |
) -> Tuple[str, str, List[Dict]]:
|
222 |
+
"""
|
223 |
+
Masks the PII entities in a sentence.
|
224 |
+
|
225 |
+
Args:
|
226 |
+
tokens: The list of tokens in the sentence.
|
227 |
+
labels: The list of labels for each token.
|
228 |
+
original_text: The original text of the sentence.
|
229 |
+
offset_mapping: The offset mapping for each token.
|
230 |
+
|
231 |
+
Returns:
|
232 |
+
A tuple containing:
|
233 |
+
- The masked text.
|
234 |
+
- The colored text.
|
235 |
+
- A list of dictionaries containing information about the masked
|
236 |
+
PII entities.
|
237 |
+
"""
|
238 |
privacy_masks = []
|
239 |
current_pos = 0
|
240 |
masked_text_parts = []
|
|
|
300 |
return ("".join(masked_text_parts), "".join(colored_text_parts), privacy_masks)
|
301 |
|
302 |
def _get_mask_for_entity(self, entity_type: str) -> str:
|
303 |
+
"""
|
304 |
+
Returns the mask for a given entity type.
|
305 |
+
|
306 |
+
Args:
|
307 |
+
entity_type: The type of the entity.
|
308 |
+
|
309 |
+
Returns:
|
310 |
+
The mask for the entity type.
|
311 |
+
"""
|
312 |
return {
|
313 |
"PHONE_NUM": "[ืืืคืื]",
|
314 |
"ID_NUM": "[ืช.ื]",
|
|
|
327 |
|
328 |
|
329 |
def main():
|
330 |
+
"""
|
331 |
+
The main function for the Streamlit application.
|
332 |
+
"""
|
333 |
st.set_page_config(layout="wide")
|
334 |
st.title("๐ฟ GolemPII: Hebrew PII Masking Application ๐ฟ")
|
335 |
|
|
|
338 |
<style>
|
339 |
.rtl { direction: rtl; text-align: right; }
|
340 |
.entity-legend { padding: 5px; margin: 2px; border-radius: 3px; display: inline-block; }
|
341 |
+
.masked-text {
|
342 |
+
direction: rtl;
|
343 |
+
text-align: right;
|
344 |
+
line-height: 2;
|
345 |
+
padding: 10px;
|
346 |
+
background-color: #f6f8fa;
|
347 |
+
border-radius: 5px;
|
348 |
+
color: black;
|
349 |
white-space: pre-wrap;
|
350 |
}
|
351 |
+
.main h3 {
|
352 |
margin-bottom: 10px;
|
353 |
}
|
354 |
textarea {
|