Spaces:

CordwainerSmith
/

GolemPII

Sleeping

File size: 15,979 Bytes

import html
import json
import os
import re
import time
from datetime import datetime
from typing import List, Dict, Tuple

import pandas as pd
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Constants
# Maps the display name offered in the sidebar model selector to the model
# identifier that is handed to PIIMaskingModel (and from there to
# `from_pretrained`).
MODELS = {
    "GolemPII XLM-RoBERTa v1": "CordwainerSmith/GolemPII-xlm-roberta-v1",
}


# Background colour (CSS hex) per entity label. Used for the highlighted mask
# spans in the rendered output and for the entity badges in the sidebar;
# both call sites fall back to "#CCCCCC" for labels missing from this table.
ENTITY_COLORS = {
    "PHONE_NUM": "#FF9999",
    "ID_NUM": "#99FF99",
    "CC_NUM": "#9999FF",
    "BANK_ACCOUNT_NUM": "#FFFF99",
    "FIRST_NAME": "#FF99FF",
    "LAST_NAME": "#99FFFF",
    "CITY": "#FFB366",
    "STREET": "#B366FF",
    "POSTAL_CODE": "#66FFB3",
    "EMAIL": "#66B3FF",
    "DATE": "#FFB3B3",
    "CC_PROVIDER": "#B3FFB3",
}

# Sample input(s); joined with newlines to pre-fill the text area in main().
# NOTE(review): the "[email protected]" fragment looks like an e-mail
# obfuscation placeholder rather than a real sample address - confirm the
# example still contains an e-mail the model is expected to detect.
EXAMPLE_SENTENCES = [
    "שם מלא: תלמה אריאלי מספר תעודת זהות: 61453324-8 תאריך לידה: 15/09/1983 כתובת:  ארלוזורוב 22  פתח תקווה מיקוד 2731711 אימייל: [email protected] טלפון: 054-8884771  בפגישה זו נדונו פתרונות טכנולוגיים חדשניים לשיפור תהליכי עבודה. המשתתף יתבקש להציג מצגת בנושא בפגישה הבאה אשר שילם ב 5326-1003-5299-5478 מסטרקארד עם הוראת קבע ל 11-77-352300",
]

# Static, human-readable model metadata. main() renders "name", "description",
# "base_model", "training_data" and "detected_pii_entities" in the sidebar;
# "training_details" is not referenced by the code visible in this file.
# NOTE(review): "name" and "base_model" mention GolemPII-v7-full / mdeberta,
# while MODELS points at an XLM-RoBERTa checkpoint - confirm which is current.
MODEL_DETAILS = {
    "name": "GolemPII - Hebrew PII Detection Model CordwainerSmith/GolemPII-v7-full",
    "description": "This on-premise PII model is designed to automatically identify and mask sensitive information (PII) within Hebrew text data. It has been trained to recognize a wide range of PII entities, including names, addresses, phone numbers, financial information, and more.",
    "base_model": "microsoft/mdeberta-v3-base",
    "training_data": "Custom Hebrew PII dataset (size not specified)",
    # Entity labels listed as badges in the sidebar, in display order.
    "detected_pii_entities": [
        "FIRST_NAME",
        "LAST_NAME",
        "STREET",
        "CITY",
        "PHONE_NUM",
        "EMAIL",
        "ID_NUM",
        "BANK_ACCOUNT_NUM",
        "CC_NUM",
        "CC_PROVIDER",
        "DATE",
        "POSTAL_CODE",
    ],
    "training_details": {
        "Training epochs": "5",
        "Batch size": "32",
        "Learning rate": "5e-5",
        "Weight decay": "0.01",
        "Training speed": "~2.19 it/s",
        "Total training time": "2:08:26",
    },
}


class PIIMaskingModel:
    """Token-classification wrapper that detects PII spans and masks them.

    The tokenizer and model are loaded in ``__init__``. ``process_text`` runs
    the model on one text and returns the masked text, an HTML-highlighted
    variant, the raw tokens/labels and a list of detected spans.
    """

    # Prefixes that mark a token as the first piece of a new word: a plain
    # space, and U+2581 ("lower one eighth block"), the word-boundary marker
    # used by SentencePiece vocabularies such as XLM-RoBERTa's.
    _WORD_START_MARKERS = (" ", "\u2581")

    # Replacement text per entity type; unknown types fall back to "[TYPE]".
    _ENTITY_MASKS = {
        "PHONE_NUM": "[טלפון]",
        "ID_NUM": "[ת.ז]",
        "CC_NUM": "[כרטיס אשראי]",
        "BANK_ACCOUNT_NUM": "[חשבון בנק]",
        "FIRST_NAME": "[שם פרטי]",
        "LAST_NAME": "[שם משפחה]",
        "CITY": "[עיר]",
        "STREET": "[רחוב]",
        "POSTAL_CODE": "[מיקוד]",
        "EMAIL": "[אימייל]",
        "DATE": "[תאריך]",
        "CC_PROVIDER": "[ספק כרטיס אשראי]",
        "BANK": "[בנק]",
    }

    def __init__(self, model_name: str):
        """Load the tokenizer and token-classification model for *model_name*.

        The Hugging Face access token is read from ``st.secrets["HF_TOKEN"]``;
        a missing secret raises from Streamlit's secrets lookup.
        """
        self.model_name = model_name
        hf_token = st.secrets["HF_TOKEN"]  # Retrieve the token from secrets
        # NOTE(review): newer transformers releases prefer `token=` over
        # `use_auth_token=` - confirm against the pinned version before changing.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name, use_auth_token=hf_token
        )
        self.model = AutoModelForTokenClassification.from_pretrained(
            model_name, use_auth_token=hf_token
        )

    def process_text(
        self, text: str
    ) -> Tuple[str, float, str, List[str], List[str], List[Dict]]:
        """Detect and mask PII in *text*.

        Returns:
            A tuple ``(masked_text, processing_time, colored_text, tokens,
            predicted_labels, privacy_masks)`` where ``processing_time`` is the
            wall-clock duration in seconds and ``privacy_masks`` is the span
            list produced by :meth:`mask_pii_in_sentence`.

        NOTE: ``truncation=True`` means input beyond the tokenizer's length
        limit is never classified; :meth:`mask_pii_in_sentence` copies any
        text after the last token through unchanged, i.e. unmasked.
        """
        start_time = time.time()

        tokenized_inputs = self.tokenizer(
            text,
            truncation=True,
            padding=False,
            # The model is a PyTorch module, so it must receive torch tensors
            # (NumPy arrays are rejected by the forward pass).
            return_tensors="pt",
            return_offsets_mapping=True,
            add_special_tokens=True,
        )

        input_ids = tokenized_inputs.input_ids
        attention_mask = tokenized_inputs.attention_mask
        offset_mapping = tokenized_inputs["offset_mapping"][0].tolist()

        # Mark the leading/trailing special tokens so later stages skip them.
        offset_mapping[0] = None  # <s> token
        offset_mapping[-1] = None  # </s> token

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # Convert to plain ints: 0-d tensors hash by identity and would never
        # match the integer keys of id2label.
        label_ids = outputs.logits.argmax(dim=-1)[0].tolist()
        predicted_labels = [
            self.model.config.id2label[label_id] for label_id in label_ids
        ]
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

        masked_text, colored_text, privacy_masks = self.mask_pii_in_sentence(
            tokens, predicted_labels, text, offset_mapping
        )
        processing_time = time.time() - start_time

        return (
            masked_text,
            processing_time,
            colored_text,
            tokens,
            predicted_labels,
            privacy_masks,
        )

    def _find_entity_span(
        self,
        i: int,
        labels: List[str],
        tokens: List[str],
        offset_mapping: List[Tuple[int, int]],
    ) -> Tuple[int, str, int]:
        """Find the end index and entity type for a span starting at index i.

        Returns ``(next_index, entity_type, end_char)``: the index of the first
        token after the span, the label of token ``i`` without its two-character
        ``B-``/``I-`` prefix, and the end character offset of the last token
        that was accepted into the span.
        """
        current_entity = labels[i][2:]
        j = i + 1
        last_valid_end = offset_mapping[i][1] if offset_mapping[i] else None

        while j < len(tokens):
            if offset_mapping[j] is None:  # special token: step over it
                j += 1
                continue

            next_label = labels[j]

            if next_label.startswith("B-"):
                # A B- tag on a word-initial token starts a new entity; a B-
                # tag on a word-internal piece is treated as a continuation.
                if tokens[j].startswith(self._WORD_START_MARKERS):
                    break
            elif next_label.startswith("I-"):
                # An I- tag only continues the span for the same entity type.
                if next_label[2:] != current_entity:
                    break
            else:
                break

            last_valid_end = offset_mapping[j][1]
            j += 1

        return j, current_entity, last_valid_end

    def mask_pii_in_sentence(
        self,
        tokens: List[str],
        labels: List[str],
        original_text: str,
        offset_mapping: List[Tuple[int, int]],
    ) -> Tuple[str, str, List[Dict]]:
        """Rebuild *original_text* with every detected entity replaced by a mask.

        Args:
            tokens: Token strings, aligned with ``labels`` and ``offset_mapping``.
            labels: Per-token labels; ``B-``/``I-`` prefixed labels are entities.
            original_text: The text the offsets refer to.
            offset_mapping: Per-token ``(start, end)`` character offsets, with
                ``None`` for tokens that must be skipped (special tokens).

        Returns:
            ``(masked_text, colored_text, privacy_masks)``. ``colored_text`` is
            an HTML fragment: masks are wrapped in coloured ``<span>`` elements
            and all other text is HTML-escaped so it can be rendered safely.
            Each entry of ``privacy_masks`` holds ``label``, ``start``, ``end``,
            ``value`` and a 1-based ``label_index``.
        """
        privacy_masks = []
        current_pos = 0
        masked_text_parts = []
        colored_text_parts = []

        def emit_plain(chunk: str) -> None:
            # Plain text goes verbatim into the masked text but must be
            # escaped in the HTML variant.
            masked_text_parts.append(chunk)
            colored_text_parts.append(html.escape(chunk))

        i = 0
        while i < len(tokens):
            if offset_mapping[i] is None:  # Skip special tokens
                i += 1
                continue

            if labels[i].startswith(("B-", "I-")):
                start_char = offset_mapping[i][0]

                # Find the complete entity span
                next_pos, entity_type, last_valid_end = self._find_entity_span(
                    i, labels, tokens, offset_mapping
                )

                # Add any text before the entity
                if current_pos < start_char:
                    emit_plain(original_text[current_pos:start_char])

                entity_value = original_text[start_char:last_valid_end]
                mask = self._get_mask_for_entity(entity_type)

                privacy_masks.append(
                    {
                        "label": entity_type,
                        "start": start_char,
                        "end": last_valid_end,
                        "value": entity_value,
                        "label_index": len(privacy_masks) + 1,
                    }
                )

                masked_text_parts.append(mask)

                color = ENTITY_COLORS.get(entity_type, "#CCCCCC")
                colored_text_parts.append(
                    f'<span style="background-color: {color}; padding: 2px; border-radius: 3px;">{html.escape(mask)}</span>'
                )

                current_pos = last_valid_end
                i = next_pos
            else:
                end_char = offset_mapping[i][1]

                # Copy everything up to the end of this non-entity token.
                if current_pos < end_char:
                    emit_plain(original_text[current_pos:end_char])
                    current_pos = end_char
                i += 1

        # Add any remaining text (e.g. trailing whitespace or a truncated tail)
        if current_pos < len(original_text):
            emit_plain(original_text[current_pos:])

        return ("".join(masked_text_parts), "".join(colored_text_parts), privacy_masks)

    def _get_mask_for_entity(self, entity_type: str) -> str:
        """Get the mask text for a given entity type (``[TYPE]`` if unknown)."""
        return self._ENTITY_MASKS.get(entity_type, f"[{entity_type}]")


def save_results_to_file(results: Dict):
    """Write *results* as indented UTF-8 JSON to a timestamped file.

    The file is created relative to the current working directory and is named
    ``pii_masking_results_<YYYYmmdd_HHMMSS>.json`` using the local time.
    Non-ASCII characters are written as-is rather than escaped.

    Returns the name of the file that was written.
    """
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    out_name = "pii_masking_results_" + stamp + ".json"

    with open(out_name, mode="w", encoding="utf-8") as handle:
        json.dump(results, handle, indent=2, ensure_ascii=False)

    return out_name


def main():
    """Render the Streamlit UI and run PII masking on the entered text.

    The page consists of a sidebar (model selector, output options, model
    details) and a text area. Pressing "Process Text" splits the input on
    commas and either runs the selected model and shows each result, or - with
    "Run All Models" ticked - runs every entry of MODELS, saves the combined
    results to a JSON file and shows an average-time comparison table.
    """
    st.set_page_config(layout="wide")
    st.title("🗿 GolemPII: Hebrew PII Masking Application 🗿")

    # Add CSS styles
    st.markdown(
        """
    <style>
        .rtl { direction: rtl; text-align: right; }
        .entity-legend { padding: 5px; margin: 2px; border-radius: 3px; display: inline-block; }
        .masked-text { 
            direction: rtl; 
            text-align: right; 
            line-height: 2; 
            padding: 10px; 
            background-color: #f6f8fa; 
            border-radius: 5px; 
            color: black;             
            white-space: pre-wrap;
        }
        /* Red headers for sections */
        .main h3 {
            color: #d73a49;
            margin-bottom: 10px;
        }
        /* Styles for the model details sidebar */
        .model-details-sidebar h2 {
            margin-top: 0;
        }
        .model-details-sidebar table {
            width: 100%;
            border-collapse: collapse;
        }
        .model-details-sidebar td, .model-details-sidebar th {
            padding: 8px;
            border: 1px solid #ddd;
            text-align: left;
        }
    </style>
    """,
        unsafe_allow_html=True,
    )

    # Sidebar configuration
    st.sidebar.header("Configuration")
    selected_model = st.sidebar.selectbox("Select Model", list(MODELS.keys()))
    show_json = st.sidebar.checkbox("Show JSON Output", value=True)
    run_all_models = st.sidebar.checkbox("Run All Models")

    # One coloured badge per supported entity type, built up front so the
    # sidebar template below stays readable.
    entity_badges = " ".join(
        f'<li><span class="entity-badge" style="background-color: {ENTITY_COLORS.get(entity, "#CCCCCC")}; padding: 3px 5px; border-radius: 3px; margin-right: 5px;">{entity}</span></li>'
        for entity in MODEL_DETAILS["detected_pii_entities"]
    )

    # Display Model Details in Sidebar
    st.sidebar.markdown(
        f"""
        <div class="model-details-sidebar">
            <h2>Model Details: {MODEL_DETAILS['name']}</h2>
            <p>{MODEL_DETAILS['description']}</p>
            <table>
                <tr><td>Base Model:</td><td>{MODEL_DETAILS['base_model']}</td></tr>
                <tr><td>Training Data:</td><td>{MODEL_DETAILS['training_data']}</td></tr>            
            </table>
            <h3>Detected PII Entities</h3>
            <ul>
                {entity_badges}
            </ul>
        </div>
        """,
        unsafe_allow_html=True,
    )

    # Text input
    text_input = st.text_area(
        "Enter text to mask (separate multiple texts with commas):",
        value="\n".join(EXAMPLE_SENTENCES),
        height=200,
    )

    # Everything below only runs on the rerun triggered by the button.
    if not st.button("Process Text"):
        return

    texts = [text.strip() for text in text_input.split(",") if text.strip()]
    if not texts:
        # Nothing to process; also avoids dividing by zero when averaging
        # processing times in the comparison table.
        st.warning("Please enter some text to process.")
        return

    if run_all_models:
        all_results = {}
        progress_bar = st.progress(0)

        for idx, (model_name, model_path) in enumerate(MODELS.items()):
            st.subheader(f"Results for {model_name}")
            model = PIIMaskingModel(model_path)
            model_results = {}

            for text_idx, text in enumerate(texts):
                # Only the masked text, timing and spans are stored here.
                masked_text, processing_time, _, _, _, privacy_masks = (
                    model.process_text(text)
                )
                model_results[f"text_{text_idx+1}"] = {
                    "original": text,
                    "masked": masked_text,
                    "processing_time": processing_time,
                    "privacy_mask": privacy_masks,
                    "span_labels": [
                        [m["start"], m["end"], m["label"]] for m in privacy_masks
                    ],
                }

            all_results[model_name] = model_results
            progress_bar.progress((idx + 1) / len(MODELS))

        # Save and display results
        filename = save_results_to_file(all_results)
        st.success(f"Results saved to {filename}")

        # Show comparison table
        comparison_data = []
        for model_name, results in all_results.items():
            avg_time = sum(
                text_data["processing_time"] for text_data in results.values()
            ) / len(results)
            comparison_data.append(
                {"Model": model_name, "Avg Processing Time": f"{avg_time:.3f}s"}
            )

        st.subheader("Model Comparison")
        st.table(pd.DataFrame(comparison_data))
        return

    # Process with single selected model
    model = PIIMaskingModel(MODELS[selected_model])

    for text in texts:
        st.markdown("### Original Text", unsafe_allow_html=True)
        # The text is user input rendered as raw HTML, so escape it first.
        st.markdown(
            f'<div class="rtl">{html.escape(text)}</div>', unsafe_allow_html=True
        )

        (
            masked_text,
            processing_time,
            colored_text,
            tokens,
            predicted_labels,
            privacy_masks,
        ) = model.process_text(text)

        st.markdown("### Masked Text", unsafe_allow_html=True)
        st.markdown(
            f'<div class="masked-text">{colored_text}</div>',
            unsafe_allow_html=True,
        )

        st.markdown(f"Processing Time: {processing_time:.3f} seconds")

        if show_json:
            st.json(
                {
                    "original": text,
                    "masked": masked_text,
                    "processing_time": processing_time,
                    "tokens": tokens,
                    "token_classes": predicted_labels,
                    "privacy_mask": privacy_masks,
                    "span_labels": [
                        [m["start"], m["end"], m["label"]] for m in privacy_masks
                    ],
                }
            )

        st.markdown("---")


# Build the Streamlit page when this file is executed as a script.
if __name__ == "__main__":
    main()