cordwainersmith committed on
Commit 277ab09 · 1 Parent(s): 01d7fe4

Add application file

Files changed (1)
  1. app.py +448 -0
app.py ADDED
@@ -0,0 +1,448 @@
+ import streamlit as st
+ import torch
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+ import time
+ import json
+ import pandas as pd
+ from datetime import datetime
+ import os
+ from typing import List, Dict, Tuple
+ import re
+
+ # Hugging Face access token. The original file references HF_TOKEN without
+ # defining it; it is assumed here to come from the environment (e.g. a Space
+ # secret). A public model also loads with token=None.
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+
+ # Constants
+ MODELS = {
+     "GolemPII XLM-RoBERTa v1": "CordwainerSmith/GolemPII-xlm-roberta-v1",
+ }
+
+
+ ENTITY_COLORS = {
+     "PHONE_NUM": "#FF9999",
+     "ID_NUM": "#99FF99",
+     "CC_NUM": "#9999FF",
+     "BANK_ACCOUNT_NUM": "#FFFF99",
+     "FIRST_NAME": "#FF99FF",
+     "LAST_NAME": "#99FFFF",
+     "CITY": "#FFB366",
+     "STREET": "#B366FF",
+     "POSTAL_CODE": "#66FFB3",
+     "EMAIL": "#66B3FF",
+     "DATE": "#FFB3B3",
+     "CC_PROVIDER": "#B3FFB3",
+ }
+
+ EXAMPLE_SENTENCES = [
+     "שם מלא: תלמה אריאלי מספר תעודת זהות: 61453324-8 תאריך לידה: 15/09/1983 כתובת: ארלוזורוב 22 פתח תקווה מיקוד 2731711 אימייל: [email protected] טלפון: 054-8884771 בפגישה זו נדונו פתרונות טכנולוגיים חדשניים לשיפור תהליכי עבודה. המשתתף יתבקש להציג מצגת בנושא בפגישה הבאה אשר שילם ב 5326-1003-5299-5478 מסטרקארד עם הוראת קבע ל 11-77-352300",
+ ]
+
+ MODEL_DETAILS = {
+     "name": "GolemPII - Hebrew PII Detection Model CordwainerSmith/GolemPII-v7-full",
+     "description": "This on-premise PII model is designed to automatically identify and mask sensitive information (PII) within Hebrew text data. It has been trained to recognize a wide range of PII entities, including names, addresses, phone numbers, financial information, and more.",
+     "base_model": "microsoft/mdeberta-v3-base",
+     "training_data": "Custom Hebrew PII dataset (size not specified)",
+     "detected_pii_entities": [
+         "FIRST_NAME",
+         "LAST_NAME",
+         "STREET",
+         "CITY",
+         "PHONE_NUM",
+         "EMAIL",
+         "ID_NUM",
+         "BANK_ACCOUNT_NUM",
+         "CC_NUM",
+         "CC_PROVIDER",
+         "DATE",
+         "POSTAL_CODE",
+     ],
+     "training_details": {
+         "Training epochs": "5",
+         "Batch size": "32",
+         "Learning rate": "5e-5",
+         "Weight decay": "0.01",
+         "Training speed": "~2.19 it/s",
+         "Total training time": "2:08:26",
+     },
+ }
+
+
+ class PIIMaskingModel:
+     """Token-classification wrapper that detects and masks PII spans in Hebrew text."""
+
+     def __init__(self, model_name: str):
+         self.model_name = model_name
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
+         self.model = AutoModelForTokenClassification.from_pretrained(
+             model_name, token=HF_TOKEN
+         )
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.model.to(self.device)
+         self.model.eval()
+
+     def process_text(
+         self, text: str
+     ) -> Tuple[str, float, str, List[str], List[str], List[Dict]]:
+         """Run the model on `text` and return the masked text, processing time,
+         HTML-highlighted text, tokens, predicted labels, and span records."""
+         start_time = time.time()
+
+         tokenized_inputs = self.tokenizer(
+             text,
+             truncation=True,
+             padding=False,
+             return_tensors="pt",
+             return_offsets_mapping=True,
+             add_special_tokens=True,
+         )
+
+         input_ids = tokenized_inputs.input_ids.to(self.device)
+         attention_mask = tokenized_inputs.attention_mask.to(self.device)
+         offset_mapping = tokenized_inputs["offset_mapping"][0].tolist()
+
+         # Mark special tokens so they are skipped during span reconstruction
+         offset_mapping[0] = None  # <s> token
+         offset_mapping[-1] = None  # </s> token
+
+         with torch.no_grad():
+             outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
+
+         predictions = outputs.logits.argmax(dim=-1).cpu().numpy()
+         predicted_labels = [
+             self.model.config.id2label[label_id] for label_id in predictions[0]
+         ]
+         tokens = self.tokenizer.convert_ids_to_tokens(input_ids[0])
+
+         masked_text, colored_text, privacy_masks = self.mask_pii_in_sentence(
+             tokens, predicted_labels, text, offset_mapping
+         )
+         processing_time = time.time() - start_time
+
+         return (
+             masked_text,
+             processing_time,
+             colored_text,
+             tokens,
+             predicted_labels,
+             privacy_masks,
+         )
+
+     def _find_entity_span(
+         self,
+         i: int,
+         labels: List[str],
+         tokens: List[str],
+         offset_mapping: List[Tuple[int, int]],
+     ) -> Tuple[int, str, int]:
+         """Find the end index and entity type for a span starting at index i"""
+         current_entity = labels[i][2:]  # strip the "B-"/"I-" prefix
+         j = i + 1
+         last_valid_end = offset_mapping[i][1] if offset_mapping[i] else None
+
+         while j < len(tokens):
+             if offset_mapping[j] is None:
+                 j += 1
+                 continue
+
+             next_label = labels[j]
+
+             # Stop if we hit a new B- tag (except for non-spaced tokens)
+             if next_label.startswith("B-") and tokens[j].startswith("▁"):
+                 break
+
+             # Stop if we hit a different entity type in I- tags
+             if next_label.startswith("I-") and next_label[2:] != current_entity:
+                 break
+
+             # Continue if it's a continuation of the same entity
+             if next_label.startswith("I-") and next_label[2:] == current_entity:
+                 last_valid_end = offset_mapping[j][1]
+                 j += 1
+             # Continue if it's a non-spaced B- token (subword without a leading "▁")
+             elif next_label.startswith("B-") and not tokens[j].startswith("▁"):
+                 last_valid_end = offset_mapping[j][1]
+                 j += 1
+             else:
+                 break
+
+         return j, current_entity, last_valid_end
+
+     def mask_pii_in_sentence(
+         self,
+         tokens: List[str],
+         labels: List[str],
+         original_text: str,
+         offset_mapping: List[Tuple[int, int]],
+     ) -> Tuple[str, str, List[Dict]]:
+         privacy_masks = []
+         current_pos = 0
+         masked_text_parts = []
+         colored_text_parts = []
+
+         i = 0
+         while i < len(tokens):
+             if offset_mapping[i] is None:  # Skip special tokens
+                 i += 1
+                 continue
+
+             current_label = labels[i]
+
+             if current_label.startswith(("B-", "I-")):
+                 start_char = offset_mapping[i][0]
+
+                 # Find the complete entity span
+                 next_pos, entity_type, last_valid_end = self._find_entity_span(
+                     i, labels, tokens, offset_mapping
+                 )
+
+                 # Add any text before the entity
+                 if current_pos < start_char:
+                     text_before = original_text[current_pos:start_char]
+                     masked_text_parts.append(text_before)
+                     colored_text_parts.append(text_before)
+
+                 # Extract and mask the entity
+                 entity_value = original_text[start_char:last_valid_end]
+                 mask = self._get_mask_for_entity(entity_type)
+
+                 # Add to privacy masks
+                 privacy_masks.append(
+                     {
+                         "label": entity_type,
+                         "start": start_char,
+                         "end": last_valid_end,
+                         "value": entity_value,
+                         "label_index": len(privacy_masks) + 1,
+                     }
+                 )
+
+                 # Add masked text
+                 masked_text_parts.append(mask)
+
+                 # Add colored text
+                 color = ENTITY_COLORS.get(entity_type, "#CCCCCC")
+                 colored_text_parts.append(
+                     f'<span style="background-color: {color}; padding: 2px; border-radius: 3px;">{mask}</span>'
+                 )
+
+                 current_pos = last_valid_end
+                 i = next_pos
+             else:
+                 if offset_mapping[i] is not None:
+                     start_char = offset_mapping[i][0]
+                     end_char = offset_mapping[i][1]
+
+                     # Add any text for this token
+                     if current_pos < end_char:
+                         text_chunk = original_text[current_pos:end_char]
+                         masked_text_parts.append(text_chunk)
+                         colored_text_parts.append(text_chunk)
+                     current_pos = end_char
+                 i += 1
+
+         # Add any remaining text
+         if current_pos < len(original_text):
+             remaining_text = original_text[current_pos:]
+             masked_text_parts.append(remaining_text)
+             colored_text_parts.append(remaining_text)
+
+         return ("".join(masked_text_parts), "".join(colored_text_parts), privacy_masks)
+
+     def _get_mask_for_entity(self, entity_type: str) -> str:
+         """Get the mask text for a given entity type"""
+         return {
+             "PHONE_NUM": "[טלפון]",
+             "ID_NUM": "[ת.ז]",
+             "CC_NUM": "[כרטיס אשראי]",
+             "BANK_ACCOUNT_NUM": "[חשבון בנק]",
+             "FIRST_NAME": "[שם פרטי]",
+             "LAST_NAME": "[שם משפחה]",
+             "CITY": "[עיר]",
+             "STREET": "[רחוב]",
+             "POSTAL_CODE": "[מיקוד]",
+             "EMAIL": "[אימייל]",
+             "DATE": "[תאריך]",
+             "CC_PROVIDER": "[ספק כרטיס אשראי]",
+             "BANK": "[בנק]",
+         }.get(entity_type, f"[{entity_type}]")
+
+
+ def save_results_to_file(results: Dict):
+     """
+     Save processing results to a JSON file
+     """
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     filename = f"pii_masking_results_{timestamp}.json"
+
+     with open(filename, "w", encoding="utf-8") as f:
+         json.dump(results, f, ensure_ascii=False, indent=2)
+
+     return filename
+
+
+ def main():
+     st.set_page_config(layout="wide")
+     st.title("🗿 GolemPII: Hebrew PII Masking Application 🗿")
+
+     # Add CSS styles
+     st.markdown(
+         """
+         <style>
+         .rtl { direction: rtl; text-align: right; }
+         .entity-legend { padding: 5px; margin: 2px; border-radius: 3px; display: inline-block; }
+         .masked-text {
+             direction: rtl;
+             text-align: right;
+             line-height: 2;
+             padding: 10px;
+             background-color: #f6f8fa;
+             border-radius: 5px;
+             color: black;
+             white-space: pre-wrap;
+         }
+         /* Red headers for sections */
+         .main h3 {
+             color: #d73a49;
+             margin-bottom: 10px;
+         }
+         /* Styles for the model details sidebar */
+         .model-details-sidebar h2 {
+             margin-top: 0;
+         }
+         .model-details-sidebar table {
+             width: 100%;
+             border-collapse: collapse;
+         }
+         .model-details-sidebar td, .model-details-sidebar th {
+             padding: 8px;
+             border: 1px solid #ddd;
+             text-align: left;
+         }
+         </style>
+         """,
+         unsafe_allow_html=True,
+     )
+
+     # Sidebar configuration
+     st.sidebar.header("Configuration")
+     selected_model = st.sidebar.selectbox("Select Model", list(MODELS.keys()))
+     show_json = st.sidebar.checkbox("Show JSON Output", value=True)
+     run_all_models = st.sidebar.checkbox("Run All Models")
+
+     # Display Model Details in Sidebar
+     st.sidebar.markdown(
+         f"""
+         <div class="model-details-sidebar">
+             <h2>Model Details: {MODEL_DETAILS['name']}</h2>
+             <p>{MODEL_DETAILS['description']}</p>
+             <table>
+                 <tr><td>Base Model:</td><td>{MODEL_DETAILS['base_model']}</td></tr>
+                 <tr><td>Training Data:</td><td>{MODEL_DETAILS['training_data']}</td></tr>
+             </table>
+             <h3>Detected PII Entities</h3>
+             <ul>
+                 {" ".join([f'<li><span class="entity-badge" style="background-color: {ENTITY_COLORS.get(entity, "#CCCCCC")}; padding: 3px 5px; border-radius: 3px; margin-right: 5px;">{entity}</span></li>' for entity in MODEL_DETAILS['detected_pii_entities']])}
+             </ul>
+         </div>
+         """,
+         unsafe_allow_html=True,
+     )
+
+     # Text input
+     text_input = st.text_area(
+         "Enter text to mask (separate multiple texts with commas):",
+         value="\n".join(EXAMPLE_SENTENCES),
+         height=200,
+     )
+
+     # Process button
+     if st.button("Process Text"):
+         texts = [text.strip() for text in text_input.split(",") if text.strip()]
+
+         if run_all_models:
+             all_results = {}
+             progress_bar = st.progress(0)
+
+             for idx, (model_name, model_path) in enumerate(MODELS.items()):
+                 st.subheader(f"Results for {model_name}")
+                 model = PIIMaskingModel(model_path)
+                 model_results = {}
+
+                 for text_idx, text in enumerate(texts):
+                     (
+                         masked_text,
+                         processing_time,
+                         colored_text,
+                         tokens,
+                         predicted_labels,
+                         privacy_masks,
+                     ) = model.process_text(text)
+                     model_results[f"text_{text_idx+1}"] = {
+                         "original": text,
+                         "masked": masked_text,
+                         "processing_time": processing_time,
+                         "privacy_mask": privacy_masks,
+                         "span_labels": [
+                             [m["start"], m["end"], m["label"]] for m in privacy_masks
+                         ],
+                     }
+
+                 all_results[model_name] = model_results
+                 progress_bar.progress((idx + 1) / len(MODELS))
+
+             # Save and display results
+             filename = save_results_to_file(all_results)
+             st.success(f"Results saved to {filename}")
+
+             # Show comparison table
+             comparison_data = []
+             for model_name, results in all_results.items():
+                 avg_time = sum(
+                     text_data["processing_time"] for text_data in results.values()
+                 ) / len(results)
+                 comparison_data.append(
+                     {"Model": model_name, "Avg Processing Time": f"{avg_time:.3f}s"}
+                 )
+
+             st.subheader("Model Comparison")
+             st.table(pd.DataFrame(comparison_data))
+
+         else:
+             # Process with single selected model
+             model = PIIMaskingModel(MODELS[selected_model])
+
+             for text in texts:
+                 st.markdown("### Original Text", unsafe_allow_html=True)
+                 st.markdown(f'<div class="rtl">{text}</div>', unsafe_allow_html=True)
+
+                 (
+                     masked_text,
+                     processing_time,
+                     colored_text,
+                     tokens,
+                     predicted_labels,
+                     privacy_masks,
+                 ) = model.process_text(text)
+
+                 st.markdown("### Masked Text", unsafe_allow_html=True)
+                 st.markdown(
+                     f'<div class="masked-text">{colored_text}</div>',
+                     unsafe_allow_html=True,
+                 )
+
+                 st.markdown(f"Processing Time: {processing_time:.3f} seconds")
+
+                 if show_json:
+                     st.json(
+                         {
+                             "original": text,
+                             "masked": masked_text,
+                             "processing_time": processing_time,
+                             "tokens": tokens,
+                             "token_classes": predicted_labels,
+                             "privacy_mask": privacy_masks,
+                             "span_labels": [
+                                 [m["start"], m["end"], m["label"]]
+                                 for m in privacy_masks
+                             ],
+                         }
+                     )
+
+                 st.markdown("---")
+
+
+ if __name__ == "__main__":
+     main()
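
Not part of the commit: a minimal sketch of driving the new PIIMaskingModel class directly from Python, for anyone who wants to try the masking logic without the Streamlit UI. It assumes the dependencies imported in app.py are installed, the model repository is reachable, and HF_TOKEN (if required) is exported in the environment; the sample string is shortened from EXAMPLE_SENTENCES. The app itself would be launched with `streamlit run app.py`.

```python
# Sketch only: exercise PIIMaskingModel outside Streamlit. Assumes app.py is on
# the import path and any required HF_TOKEN is set in the environment.
from app import MODELS, PIIMaskingModel

model = PIIMaskingModel(MODELS["GolemPII XLM-RoBERTa v1"])

# Shortened sample drawn from EXAMPLE_SENTENCES in app.py
sample = "שם מלא: תלמה אריאלי טלפון: 054-8884771 כתובת: ארלוזורוב 22 פתח תקווה"

masked_text, seconds, colored_html, tokens, labels, spans = model.process_text(sample)

print(masked_text)        # PII replaced with Hebrew placeholder tags such as [טלפון]
print(f"{seconds:.3f}s")  # wall-clock processing time
print(spans)              # list of {label, start, end, value, label_index} dicts
```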