structured-data-anonymizer

Runtime error

File size: 8,180 Bytes


"""Streamlit app for Presidio + Privy-trained PII models."""

import spacy
from spacy_recognizer import CustomSpacyRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
import pandas as pd
from annotated_text import annotated_text
from json import JSONEncoder
import json
import warnings
import streamlit as st
import os
import csv
# Process-wide setup, executed once at import time before any model is loaded.
# Sets TOKENIZERS_PARALLELISM to "false"; presumably this turns off the
# tokenizers library's internal parallelism (and its fork warning) — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Silence every warning category for the rest of the process.
warnings.filterwarnings('ignore')
# Disabled import for the optional Flair-based recognizer.
# from flair_recognizer import FlairRecognizer

def load_data(file_location):
    """Load the three-column dataset stored in a CSV file.

    Every non-empty row must provide at least three columns: a plain string,
    a second string (a URL, going by the variable names), and a JSON document
    serialized as text. Completely empty rows are skipped.

    Args:
        file_location: Path of the CSV file to read.

    Returns:
        A tuple ``(strings, urls, jsons)`` of three equally long lists. The
        third list contains the decoded JSON value of each row's third column.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-empty row has fewer than three columns.
        json.JSONDecodeError: If a third column does not hold valid JSON.
    """
    unpacked_string_data = []
    unpacked_url_data = []
    unpacked_json_data = []
    # newline="" is required by the csv module so that line breaks inside
    # quoted fields are parsed correctly; the explicit encoding keeps the
    # result independent of the platform's default locale.
    with open(file_location, mode="r", newline="", encoding="utf-8") as csv_file:
        for row in csv.reader(csv_file):
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            if not row:
                continue
            unpacked_string_data.append(row[0])
            unpacked_url_data.append(row[1])
            unpacked_json_data.append(json.loads(row[2]))
    return unpacked_string_data, unpacked_url_data, unpacked_json_data

# Helper methods
@st.cache(allow_output_mutation=True)
def analyzer_engine():
    """Build the Presidio AnalyzerEngine used throughout the app.

    The engine is backed by a spaCy NLP engine configured with the
    ``en_spacy_pii_distilbert`` model for English. Its registry contains the
    predefined recognizers plus a ``CustomSpacyRecognizer``; the recognizer
    registered under the name "SpacyRecognizer" is removed so that the custom
    label mappings are used instead.

    NOTE(review): ``st.cache`` is reported as deprecated in newer Streamlit
    releases in favour of ``st.cache_resource`` — confirm against the pinned
    Streamlit version before switching.
    """
    custom_recognizer = CustomSpacyRecognizer()

    # Create the NLP engine straight from its configuration.
    nlp_engine = NlpEngineProvider(
        nlp_configuration={
            "nlp_engine_name": "spacy",
            "models": [
                {"lang_code": "en", "model_name": "en_spacy_pii_distilbert"},
            ],
        }
    ).create_engine()

    recognizers = RecognizerRegistry()
    # Rule-based recognizers first, then the custom spaCy one.
    recognizers.load_predefined_recognizers(nlp_engine=nlp_engine)
    recognizers.add_recognizer(custom_recognizer)
    # Drop the stock spaCy recognizer so the custom label mappings apply.
    recognizers.remove_recognizer("SpacyRecognizer")

    # Flair-based alternative (disabled):
    # flair_recognizer = FlairRecognizer()
    # registry.load_predefined_recognizers()
    # registry.add_recognizer(flair_recognizer)
    # analyzer = AnalyzerEngine(registry=registry, supported_languages=["en"])
    return AnalyzerEngine(
        nlp_engine=nlp_engine,
        registry=recognizers,
        supported_languages=["en"],
    )


@st.cache(allow_output_mutation=True)
def anonymizer_engine():
    """Create the Presidio AnonymizerEngine (wrapped in Streamlit's cache)."""
    engine = AnonymizerEngine()
    return engine


def get_supported_entities():
    """List the entity types the analyzer engine reports as supported."""
    engine = analyzer_engine()
    return engine.get_supported_entities()


def analyze(**kwargs):
    """Run the analyzer engine with the given keyword arguments.

    The ``entities`` argument is normalized before the call: when it is
    missing, ``None``, or contains the pseudo-entity ``"All"``, it is passed
    on as ``None`` (no entity filter). All other keyword arguments are
    forwarded unchanged.

    Args:
        **kwargs: Keyword arguments for the engine's ``analyze`` method.

    Returns:
        Whatever the engine's ``analyze`` call returns.
    """
    requested = kwargs.get("entities")
    # Test for None first: `"All" in None` would raise TypeError, although
    # None is exactly the "no filter" value this function produces itself.
    if requested is None or "All" in requested:
        kwargs["entities"] = None
    return analyzer_engine().analyze(**kwargs)


def anonymize(text, analyze_results):
    """Anonymize the detected entities in *text* with the Presidio Anonymizer.

    Args:
        text: The text to anonymize.
        analyze_results: Analyzer findings describing what to replace.

    Returns:
        The anonymized text, or ``None`` when *text* is empty or ``None``.
    """
    if text:
        anonymized = anonymizer_engine().anonymize(text, analyze_results)
        return anonymized.text
    return None


def annotate(text, st_analyze_results, st_entities):
    """Split *text* into plain and annotated tokens for display.

    Args:
        text: The analyzed text.
        st_analyze_results: Findings exposing ``start``, ``end`` and
            ``entity_type`` attributes; they may be in any order.
        st_entities: Unused; kept so existing callers keep working.

    Returns:
        A list alternating between plain-text strings and
        ``(entity_text, entity_type)`` tuples, ordered by position. It always
        starts and ends with a plain-text string (possibly empty). When there
        are no findings the result is ``[text]``, so the un-annotated text is
        still shown instead of nothing.
    """
    tokens = []
    cursor = 0  # end offset of the previously emitted entity
    for res in sorted(st_analyze_results, key=lambda r: r.start):
        # Plain text between the previous entity (or the start) and this one.
        tokens.append(text[cursor:res.start])
        # The entity itself, labelled with its type.
        tokens.append((text[res.start:res.end], res.entity_type))
        cursor = res.end
    # Whatever follows the last entity; the whole text if there were none.
    tokens.append(text[cursor:])
    return tokens


# Configure the page title and use the wide layout.
st.set_page_config(page_title="Bitahoy demo", layout="wide")

# Side bar
# add picture with
st.sidebar.image("assets/bitahoy-logo.png", width=200)

# Entity filter: every entity type the analyzer supports is offered and
# pre-selected. The selection is later passed to analyze() as `entities`.
st_entities = st.sidebar.multiselect(
    label="Which entities to look for?",
    options=get_supported_entities(),
    default=list(get_supported_entities()),
)

# Score cut-off in [0, 1], later passed to analyze() as `score_threshold`.
st_threshold = st.sidebar.slider(
    label="Acceptance threshold", min_value=0.0, max_value=1.0, value=0.35
)

# When ticked, analyze() is called with return_decision_process=True and the
# raw results are dumped as JSON at the bottom of the page.
st_return_decision_process = st.sidebar.checkbox(
    "Add analysis explanations in json")

# Short description of the demo with links to the model, Privy and Presidio.
st.sidebar.markdown(
    """
Detect and anonymize PII in text using an [NLP model](https://huggingface.co/beki/en_spacy_pii_distilbert) trained on protocol traces (JSON, SQL, XML etc.) generated by 
[Privy](https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy) and rule-based classifiers from [Presidio](https://aka.ms/presidio).
"""
)

# Background information box about Privy and Presidio.
st.sidebar.info(
    "Privy is an open source framework for synthetic data generation in protocol trace formats (json, sql, html etc). Presidio is an open source framework for PII detection and anonymization. "
    "For more info visit [privy](https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy) and [aka.ms/presidio](https://aka.ms/presidio)"
)


# Main panel
# Show an info box while the analyzer is being built, then clear it once
# analyzer_engine() has returned.
# NOTE(review): `engine` is not referenced again in the visible code; the call
# appears to serve only to trigger (and cache) the engine construction.
analyzer_load_state = st.info(
    "Starting analyzer and loading model...")
engine = analyzer_engine()
analyzer_load_state.empty()

# col?
# Store the initial value of widgets in session state
# (done once per session; "disabled" backs the checkbox with key="disabled"
# below and is read back as the selectbox's `disabled` flag).
if "visibility" not in st.session_state:
    st.session_state.visibility = "visible"
    st.session_state.disabled = False

# Two side-by-side columns: free-text input on the left, dataset picker on
# the right.
col1, col2 = st.columns(2)

with col1:
    # st.radio(
    #     "Set selectbox label visibility 👉",
    #     key="visibility",
    #     options=["visible", "hidden", "collapsed"],
    # )
    # Text to analyze, pre-filled with a SQL statement and a JSON-like record.
    st_text = st.text_area(
        label="Type in some text",
        value="SELECT shipping FROM users WHERE shipping = '201 Thayer St Providence RI 02912'"
              "\n\n"
              "{user: Willie Porter, ip: 192.168.2.80, email: [email protected]}",
        height=200,
    )

with col2:
    # Checkbox bound to st.session_state.disabled via its key.
    st.checkbox("Enable/Disable selectbox widget", key="disabled")
    # The CSV is re-read on every script run; `urls` and `jsons` are not used
    # in the visible code.
    titles, urls, jsons = load_data("assets/data_sorted.csv")
    option_list = titles
    # NOTE(review): the label text does not match the options (CSV titles);
    # it looks like a leftover from a Streamlit example — confirm.
    option = st.selectbox(
        "How would you like to be contacted?",
        option_list,
        # label_visibility=st.session_state.visibility,
        disabled=st.session_state.disabled,
    )
    st.write('You selected:', option)

# end of col
button = st.button("Detect PII")

# Initialise the first-load flag once per session.
if 'first_load' not in st.session_state:
    st.session_state['first_load'] = True

# After
st.subheader("Analyzed")
with st.spinner("Analyzing..."):
    # Analyze when the button was pressed or while the first_load flag is set.
    # NOTE(review): st_analyze_results is bound only inside this branch but is
    # read unconditionally further down the script.
    if button or st.session_state.first_load:
        st_analyze_results = analyze(
            text=st_text,
            entities=st_entities,
            language="en",
            score_threshold=st_threshold,
            return_decision_process=st_return_decision_process,
        )
        # Interleave plain text and (entity text, entity type) tuples and
        # render them as highlighted text.
        annotated_tokens = annotate(st_text, st_analyze_results, st_entities)
        # annotated_tokens
        annotated_text(*annotated_tokens)
# vertical space
st.text("")

st.subheader("Anonymized")

with st.spinner("Anonymizing..."):
    # Same gate as the analysis step above.
    if button or st.session_state.first_load:
        st_anonymize_results = anonymize(st_text, st_analyze_results)
        # Bare expression: presumably rendered through Streamlit's "magic"
        # display of standalone expressions — TODO confirm magic is enabled.
        st_anonymize_results


# table result
st.subheader("Detailed Findings")
if st_analyze_results:
    res_dicts = [r.to_dict() for r in st_analyze_results]
    # Add the matched substring of the input text to every finding.
    for d in res_dicts:
        d['Value'] = st_text[d['start']:d['end']]
    df = pd.DataFrame.from_records(res_dicts)
    # Keep only the columns of interest, in display order, with readable
    # headers.
    df = df[["entity_type", "Value", "score", "start", "end"]].rename(
        {
            "entity_type": "Entity type",
            "start": "Start",
            "end": "End",
            "score": "Confidence",
        },
        axis=1,
    )

    st.dataframe(df, width=1000)
else:
    st.text("No findings")

# NOTE(review): the flag is set to True again, so the `first_load` gate on the
# analysis/anonymization steps never closes and they run on every script run.
# Presumably False was intended — but st_analyze_results would then be unbound
# on runs without a button press; confirm before changing.
st.session_state['first_load'] = True

# json result


class ToDictListEncoder(JSONEncoder):
    """JSON encoder for objects that expose a ``to_dict()`` method."""

    def default(self, o):
        """Return a JSON-serializable replacement for *o*.

        Called by the encoder only for objects it cannot serialize natively.

        Args:
            o: The object to convert.

        Returns:
            ``[]`` for falsy objects, otherwise the result of ``o.to_dict()``.

        Raises:
            TypeError: If *o* is truthy and has no callable ``to_dict``; this
                is the standard ``JSONEncoder`` error rather than an
                ``AttributeError`` from the missing method.
        """
        if not o:
            return []
        to_dict = getattr(o, "to_dict", None)
        if callable(to_dict):
            return to_dict()
        # Defer to the base class so unsupported types fail the usual way.
        return super().default(o)


# When the sidebar checkbox is ticked, show the raw analyzer results as JSON;
# ToDictListEncoder converts the result objects through their to_dict().
if st_return_decision_process:
    st.json(json.dumps(st_analyze_results, cls=ToDictListEncoder))