cordwainersmith committed on
Commit
ecbe10b
·
1 Parent(s): 98a427a
Files changed (1) hide show
  1. app.py +54 -162
app.py CHANGED
@@ -9,11 +9,8 @@ import os
9
  from typing import List, Dict, Tuple
10
  import re
11
 
12
- # Constants
13
- MODELS = {
14
- "GolemPII XLM-RoBERTa v1": "CordwainerSmith/GolemPII-xlm-roberta-v1",
15
- }
16
 
 
17
 
18
  ENTITY_COLORS = {
19
  "PHONE_NUM": "#FF9999",
@@ -35,8 +32,8 @@ EXAMPLE_SENTENCES = [
35
  ]
36
 
37
  MODEL_DETAILS = {
38
- "name": "GolemPII-xlm-roberta-v1 - Hebrew PII Detection Model",
39
- "description": "This model is specifically designed to identify and categorize Personally Identifiable Information (PII) within Hebrew text. It leverages the powerful XLM-RoBERTa base, fine-tuned with a curated Hebrew PII dataset, making it adept at token classification tasks tailored for Hebrew.",
40
  "base_model": "xlm-roberta-base",
41
  "training_data": "Custom Hebrew PII dataset",
42
  "detected_pii_entities": [
@@ -53,17 +50,6 @@ MODEL_DETAILS = {
53
  "DATE",
54
  "POSTAL_CODE",
55
  ],
56
- "performance_metrics": {
57
- "Loss": 0.000729,
58
- "Precision": 0.9982,
59
- "Recall": 0.9982,
60
- "F1-Score": 0.9982,
61
- "Accuracy": 0.999795,
62
- },
63
- "training_details": {
64
- "Training language": "Hebrew",
65
- # Add other relevant training details if available
66
- },
67
  }
68
 
69
 
@@ -133,7 +119,6 @@ class PIIMaskingModel:
133
  tokens: List[str],
134
  offset_mapping: List[Tuple[int, int]],
135
  ) -> Tuple[int, str, int]:
136
- """Find the end index and entity type for a span starting at index i"""
137
  current_entity = labels[i][2:] if labels[i].startswith("B-") else labels[i][2:]
138
  j = i + 1
139
  last_valid_end = offset_mapping[i][1] if offset_mapping[i] else None
@@ -145,19 +130,15 @@ class PIIMaskingModel:
145
 
146
  next_label = labels[j]
147
 
148
- # Stop if we hit a new B- tag (except for non-spaced tokens)
149
  if next_label.startswith("B-") and tokens[j].startswith("▁"):
150
  break
151
 
152
- # Stop if we hit a different entity type in I- tags
153
  if next_label.startswith("I-") and next_label[2:] != current_entity:
154
  break
155
 
156
- # Continue if it's a continuation of the same entity
157
  if next_label.startswith("I-") and next_label[2:] == current_entity:
158
  last_valid_end = offset_mapping[j][1]
159
  j += 1
160
- # Continue if it's a non-spaced B- token
161
  elif next_label.startswith("B-") and not tokens[j].startswith("▁"):
162
  last_valid_end = offset_mapping[j][1]
163
  j += 1
@@ -180,7 +161,7 @@ class PIIMaskingModel:
180
 
181
  i = 0
182
  while i < len(tokens):
183
- if offset_mapping[i] is None: # Skip special tokens
184
  i += 1
185
  continue
186
 
@@ -188,23 +169,18 @@ class PIIMaskingModel:
188
 
189
  if current_label.startswith(("B-", "I-")):
190
  start_char = offset_mapping[i][0]
191
-
192
- # Find the complete entity span
193
  next_pos, entity_type, last_valid_end = self._find_entity_span(
194
  i, labels, tokens, offset_mapping
195
  )
196
 
197
- # Add any text before the entity
198
  if current_pos < start_char:
199
  text_before = original_text[current_pos:start_char]
200
  masked_text_parts.append(text_before)
201
  colored_text_parts.append(text_before)
202
 
203
- # Extract and mask the entity
204
  entity_value = original_text[start_char:last_valid_end]
205
  mask = self._get_mask_for_entity(entity_type)
206
 
207
- # Add to privacy masks
208
  privacy_masks.append(
209
  {
210
  "label": entity_type,
@@ -215,13 +191,10 @@ class PIIMaskingModel:
215
  }
216
  )
217
 
218
- # Add masked text
219
  masked_text_parts.append(mask)
220
-
221
- # Add colored text
222
  color = ENTITY_COLORS.get(entity_type, "#CCCCCC")
223
  colored_text_parts.append(
224
- f'<span style="background-color: {color}; padding: 2px; border-radius: 3px;">{mask}</span>'
225
  )
226
 
227
  current_pos = last_valid_end
@@ -231,7 +204,6 @@ class PIIMaskingModel:
231
  start_char = offset_mapping[i][0]
232
  end_char = offset_mapping[i][1]
233
 
234
- # Add any text for this token
235
  if current_pos < end_char:
236
  text_chunk = original_text[current_pos:end_char]
237
  masked_text_parts.append(text_chunk)
@@ -239,7 +211,6 @@ class PIIMaskingModel:
239
  current_pos = end_char
240
  i += 1
241
 
242
- # Add any remaining text
243
  if current_pos < len(original_text):
244
  remaining_text = original_text[current_pos:]
245
  masked_text_parts.append(remaining_text)
@@ -248,7 +219,6 @@ class PIIMaskingModel:
248
  return ("".join(masked_text_parts), "".join(colored_text_parts), privacy_masks)
249
 
250
  def _get_mask_for_entity(self, entity_type: str) -> str:
251
- """Get the mask text for a given entity type"""
252
  return {
253
  "PHONE_NUM": "[טלפון]",
254
  "ID_NUM": "[ת.ז]",
@@ -266,24 +236,10 @@ class PIIMaskingModel:
266
  }.get(entity_type, f"[{entity_type}]")
267
 
268
 
269
- def save_results_to_file(results: Dict):
270
- """
271
- Save processing results to a JSON file
272
- """
273
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
274
- filename = f"pii_masking_results_{timestamp}.json"
275
-
276
- with open(filename, "w", encoding="utf-8") as f:
277
- json.dump(results, f, ensure_ascii=False, indent=2)
278
-
279
- return filename
280
-
281
-
282
  def main():
283
  st.set_page_config(layout="wide")
284
  st.title("🗿 GolemPII: Hebrew PII Masking Application 🗿")
285
 
286
- # Add CSS styles
287
  st.markdown(
288
  """
289
  <style>
@@ -299,155 +255,91 @@ def main():
299
  color: black;
300
  white-space: pre-wrap;
301
  }
302
- /* Red headers for sections */
303
- .main h3 {
304
- color: #d73a49;
305
  margin-bottom: 10px;
306
  }
307
- /* Styles for the model details sidebar */
308
- .model-details-sidebar h2 {
309
- margin-top: 0;
310
- }
311
- .model-details-sidebar table {
312
- width: 100%;
313
- border-collapse: collapse;
314
  }
315
- .model-details-sidebar td, .model-details-sidebar th {
316
- padding: 8px;
317
- border: 1px solid #ddd;
318
- text-align: left;
319
  }
320
  </style>
321
  """,
322
  unsafe_allow_html=True,
323
  )
324
 
325
- # Sidebar configuration
326
- st.sidebar.header("Configuration")
327
- selected_model = st.sidebar.selectbox("Select Model", list(MODELS.keys()))
328
- show_json = st.sidebar.checkbox("Show JSON Output", value=True)
329
- run_all_models = st.sidebar.checkbox("Run All Models")
330
-
331
- # Display Model Details in Sidebar
332
  st.sidebar.markdown(
333
  f"""
334
- <div class="model-details-sidebar">
335
- <h2>Model Details: {MODEL_DETAILS['name']}</h2>
336
  <p>{MODEL_DETAILS['description']}</p>
337
- <table>
338
- <tr><td>Base Model:</td><td>{MODEL_DETAILS['base_model']}</td></tr>
339
- <tr><td>Training Data:</td><td>{MODEL_DETAILS['training_data']}</td></tr>
340
- </table>
341
- <h3>Detected PII Entities</h3>
342
  <ul>
343
- {" ".join([f'<li><span class="entity-badge" style="background-color: {ENTITY_COLORS.get(entity, "#CCCCCC")}; padding: 3px 5px; border-radius: 3px; margin-right: 5px;">{entity}</span></li>' for entity in MODEL_DETAILS['detected_pii_entities']])}
344
  </ul>
345
  </div>
346
- """,
347
  unsafe_allow_html=True,
348
  )
349
 
350
- # Text input
351
  text_input = st.text_area(
352
  "Enter text to mask (separate multiple texts with commas):",
353
  value="\n".join(EXAMPLE_SENTENCES),
354
  height=200,
355
  )
356
 
357
- # Process button
 
358
  if st.button("Process Text"):
359
  texts = [text.strip() for text in text_input.split(",") if text.strip()]
360
-
361
- if run_all_models:
362
- all_results = {}
363
- progress_bar = st.progress(0)
364
-
365
- for idx, (model_name, model_path) in enumerate(MODELS.items()):
366
- st.subheader(f"Results for {model_name}")
367
- model = PIIMaskingModel(model_path)
368
- model_results = {}
369
-
370
- for text_idx, text in enumerate(texts):
371
- (
372
- masked_text,
373
- processing_time,
374
- colored_text,
375
- tokens,
376
- predicted_labels,
377
- privacy_masks,
378
- ) = model.process_text(text)
379
- model_results[f"text_{text_idx+1}"] = {
 
 
 
 
 
 
 
 
 
 
 
380
  "original": text,
381
  "masked": masked_text,
382
  "processing_time": processing_time,
 
 
383
  "privacy_mask": privacy_masks,
384
  "span_labels": [
385
  [m["start"], m["end"], m["label"]] for m in privacy_masks
386
  ],
387
  }
388
-
389
- all_results[model_name] = model_results
390
- progress_bar.progress((idx + 1) / len(MODELS))
391
-
392
- # Save and display results
393
- filename = save_results_to_file(all_results)
394
- st.success(f"Results saved to {filename}")
395
-
396
- # Show comparison table
397
- comparison_data = []
398
- for model_name, results in all_results.items():
399
- avg_time = sum(
400
- text_data["processing_time"] for text_data in results.values()
401
- ) / len(results)
402
- comparison_data.append(
403
- {"Model": model_name, "Avg Processing Time": f"{avg_time:.3f}s"}
404
- )
405
-
406
- st.subheader("Model Comparison")
407
- st.table(pd.DataFrame(comparison_data))
408
-
409
- else:
410
- # Process with single selected model
411
- model = PIIMaskingModel(MODELS[selected_model])
412
-
413
- for text in texts:
414
- st.markdown("### Original Text", unsafe_allow_html=True)
415
- st.markdown(f'<div class="rtl">{text}</div>', unsafe_allow_html=True)
416
-
417
- (
418
- masked_text,
419
- processing_time,
420
- colored_text,
421
- tokens,
422
- predicted_labels,
423
- privacy_masks,
424
- ) = model.process_text(text)
425
-
426
- st.markdown("### Masked Text", unsafe_allow_html=True)
427
- st.markdown(
428
- f'<div class="masked-text">{colored_text}</div>',
429
- unsafe_allow_html=True,
430
  )
431
 
432
- st.markdown(f"Processing Time: {processing_time:.3f} seconds")
433
-
434
- if show_json:
435
- st.json(
436
- {
437
- "original": text,
438
- "masked": masked_text,
439
- "processing_time": processing_time,
440
- "tokens": tokens,
441
- "token_classes": predicted_labels,
442
- "privacy_mask": privacy_masks,
443
- "span_labels": [
444
- [m["start"], m["end"], m["label"]]
445
- for m in privacy_masks
446
- ],
447
- }
448
- )
449
-
450
- st.markdown("---")
451
 
452
 
453
  if __name__ == "__main__":
 
9
  from typing import List, Dict, Tuple
10
  import re
11
 
 
 
 
 
12
 
13
+ MODEL_PATH = "CordwainerSmith/GolemPII-v1"
14
 
15
  ENTITY_COLORS = {
16
  "PHONE_NUM": "#FF9999",
 
32
  ]
33
 
34
  MODEL_DETAILS = {
35
+ "name": "GolemPII-v1: Hebrew PII Detection Model",
36
+ "description": 'The <a href="https://huggingface.co/CordwainerSmith/GolemPII-v1" target="_blank">GolemPII model</a> was specifically designed to identify and categorize various types of personally identifiable information (PII) present in Hebrew text. Its core intended usage revolves around enhancing privacy protection and facilitating the process of data anonymization. This makes it a good candidate for applications and systems that handle sensitive data, such as legal documents, medical records, or any text data containing PII, where the automatic redaction or removal of such information is essential for ensuring compliance with data privacy regulations and safeguarding individuals\' personal information. The model can be deployed on-premise with a relatively small hardware footprint, making it suitable for organizations with limited computing resources or those prioritizing local data processing.\n\nThe model was trained on the <a href="https://huggingface.co/datasets/CordwainerSmith/GolemGuard" target="_blank">GolemGuard</a> dataset, a Hebrew language dataset comprising over 115,000 examples of PII entities and containing both real and synthetically generated text examples. This data represents various document types and communication formats commonly found in Israeli professional and administrative contexts. GolemGuard covers a wide range of document types and encompasses a diverse array of PII entities, making it ideal for training and evaluating PII detection models.',
37
  "base_model": "xlm-roberta-base",
38
  "training_data": "Custom Hebrew PII dataset",
39
  "detected_pii_entities": [
 
50
  "DATE",
51
  "POSTAL_CODE",
52
  ],
 
 
 
 
 
 
 
 
 
 
 
53
  }
54
 
55
 
 
119
  tokens: List[str],
120
  offset_mapping: List[Tuple[int, int]],
121
  ) -> Tuple[int, str, int]:
 
122
  current_entity = labels[i][2:] if labels[i].startswith("B-") else labels[i][2:]
123
  j = i + 1
124
  last_valid_end = offset_mapping[i][1] if offset_mapping[i] else None
 
130
 
131
  next_label = labels[j]
132
 
 
133
  if next_label.startswith("B-") and tokens[j].startswith("▁"):
134
  break
135
 
 
136
  if next_label.startswith("I-") and next_label[2:] != current_entity:
137
  break
138
 
 
139
  if next_label.startswith("I-") and next_label[2:] == current_entity:
140
  last_valid_end = offset_mapping[j][1]
141
  j += 1
 
142
  elif next_label.startswith("B-") and not tokens[j].startswith("▁"):
143
  last_valid_end = offset_mapping[j][1]
144
  j += 1
 
161
 
162
  i = 0
163
  while i < len(tokens):
164
+ if offset_mapping[i] is None:
165
  i += 1
166
  continue
167
 
 
169
 
170
  if current_label.startswith(("B-", "I-")):
171
  start_char = offset_mapping[i][0]
 
 
172
  next_pos, entity_type, last_valid_end = self._find_entity_span(
173
  i, labels, tokens, offset_mapping
174
  )
175
 
 
176
  if current_pos < start_char:
177
  text_before = original_text[current_pos:start_char]
178
  masked_text_parts.append(text_before)
179
  colored_text_parts.append(text_before)
180
 
 
181
  entity_value = original_text[start_char:last_valid_end]
182
  mask = self._get_mask_for_entity(entity_type)
183
 
 
184
  privacy_masks.append(
185
  {
186
  "label": entity_type,
 
191
  }
192
  )
193
 
 
194
  masked_text_parts.append(mask)
 
 
195
  color = ENTITY_COLORS.get(entity_type, "#CCCCCC")
196
  colored_text_parts.append(
197
+ f'<span style="background-color: {color}; color: black; padding: 2px; border-radius: 3px;">{mask}</span>'
198
  )
199
 
200
  current_pos = last_valid_end
 
204
  start_char = offset_mapping[i][0]
205
  end_char = offset_mapping[i][1]
206
 
 
207
  if current_pos < end_char:
208
  text_chunk = original_text[current_pos:end_char]
209
  masked_text_parts.append(text_chunk)
 
211
  current_pos = end_char
212
  i += 1
213
 
 
214
  if current_pos < len(original_text):
215
  remaining_text = original_text[current_pos:]
216
  masked_text_parts.append(remaining_text)
 
219
  return ("".join(masked_text_parts), "".join(colored_text_parts), privacy_masks)
220
 
221
  def _get_mask_for_entity(self, entity_type: str) -> str:
 
222
  return {
223
  "PHONE_NUM": "[טלפון]",
224
  "ID_NUM": "[ת.ז]",
 
236
  }.get(entity_type, f"[{entity_type}]")
237
 
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  def main():
240
  st.set_page_config(layout="wide")
241
  st.title("🗿 GolemPII: Hebrew PII Masking Application 🗿")
242
 
 
243
  st.markdown(
244
  """
245
  <style>
 
255
  color: black;
256
  white-space: pre-wrap;
257
  }
258
+ .main h3 {
 
 
259
  margin-bottom: 10px;
260
  }
261
+ textarea {
262
+ direction: rtl !important;
263
+ text-align: right !important;
 
 
 
 
264
  }
265
+ .stTextArea label {
266
+ direction: ltr !important;
267
+ text-align: left !important;
 
268
  }
269
  </style>
270
  """,
271
  unsafe_allow_html=True,
272
  )
273
 
274
+ # Sidebar with model details
 
 
 
 
 
 
275
  st.sidebar.markdown(
276
  f"""
277
+ <div>
278
+ <h2>{MODEL_DETAILS['name']}</h2>
279
  <p>{MODEL_DETAILS['description']}</p>
280
+ <h3>Supported PII Entities</h3>
 
 
 
 
281
  <ul>
282
+ {" ".join([f'<li><span style="background-color: {ENTITY_COLORS.get(entity, "#CCCCCC")}; color: black; padding: 3px 5px; border-radius: 3px; margin-right: 5px;">{entity}</span></li>' for entity in MODEL_DETAILS['detected_pii_entities']])}
283
  </ul>
284
  </div>
285
+ """,
286
  unsafe_allow_html=True,
287
  )
288
 
 
289
  text_input = st.text_area(
290
  "Enter text to mask (separate multiple texts with commas):",
291
  value="\n".join(EXAMPLE_SENTENCES),
292
  height=200,
293
  )
294
 
295
+ show_json = st.checkbox("Show JSON Output", value=True)
296
+
297
  if st.button("Process Text"):
298
  texts = [text.strip() for text in text_input.split(",") if text.strip()]
299
+ model = PIIMaskingModel()
300
+
301
+ for text in texts:
302
+ st.markdown(
303
+ '<h3 style="text-align: center;">Original Text</h3>',
304
+ unsafe_allow_html=True,
305
+ )
306
+ st.markdown(f'<div class="rtl">{text}</div>', unsafe_allow_html=True)
307
+
308
+ (
309
+ masked_text,
310
+ processing_time,
311
+ colored_text,
312
+ tokens,
313
+ predicted_labels,
314
+ privacy_masks,
315
+ ) = model.process_text(text)
316
+
317
+ st.markdown(
318
+ '<h3 style="text-align: center;">Masked Text</h3>',
319
+ unsafe_allow_html=True,
320
+ )
321
+ st.markdown(
322
+ f'<div class="masked-text">{colored_text}</div>', unsafe_allow_html=True
323
+ )
324
+
325
+ st.markdown(f"Processing Time: {processing_time:.3f} seconds")
326
+
327
+ if show_json:
328
+ st.json(
329
+ {
330
  "original": text,
331
  "masked": masked_text,
332
  "processing_time": processing_time,
333
+ "tokens": tokens,
334
+ "token_classes": predicted_labels,
335
  "privacy_mask": privacy_masks,
336
  "span_labels": [
337
  [m["start"], m["end"], m["label"]] for m in privacy_masks
338
  ],
339
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
  )
341
 
342
+ st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
 
344
 
345
  if __name__ == "__main__":