"""Streamlit app for Presidio + Privy-trained PII models."""
import csv
import json
import os
import warnings
from json import JSONEncoder

import pandas as pd
import spacy
import streamlit as st
from annotated_text import annotated_text
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine

from spacy_recognizer import CustomSpacyRecognizer

os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings('ignore')
# from flair_recognizer import FlairRecognizer
def load_data(file_location):
    """Read demo samples back from a CSV file and unpack them."""
    unpacked_string_data = []
    unpacked_url_data = []
    unpacked_json_data = []
    # Read the data back from the CSV file and unpack it
    with open(file_location, mode='r') as csv_file:
        reader = csv.reader(csv_file)
        for row in reader:
            unpacked_string_data.append(row[0])
            unpacked_url_data.append(row[1])
            unpacked_json_data.append(json.loads(row[2]))
    return unpacked_string_data, unpacked_url_data, unpacked_json_data
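# Illustrative sketch of the expected CSV layout (hypothetical values; the app
# actually reads assets/data_sorted.csv): three columns per row, holding a
# display string, a URL, and a JSON-encoded payload, e.g.
#   "login request","https://example.com/login","{""user"": ""alice""}"
# for which load_data returns
#   (["login request"], ["https://example.com/login"], [{"user": "alice"}])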
# Helper methods
@st.cache(allow_output_mutation=True)
def analyzer_engine():
    """Return AnalyzerEngine."""
    spacy_recognizer = CustomSpacyRecognizer()
    configuration = {
        "nlp_engine_name": "spacy",
        "models": [
            {"lang_code": "en", "model_name": "en_spacy_pii_distilbert"}],
    }
    # Create NLP engine based on configuration
    provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = provider.create_engine()
    registry = RecognizerRegistry()
    # add rule-based recognizers
    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
    registry.add_recognizer(spacy_recognizer)
    # remove the default spaCy recognizer so our custom label mappings are used
    registry.remove_recognizer("SpacyRecognizer")
    analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
                              registry=registry, supported_languages=["en"])
    # uncomment for a flair-based NLP recognizer
    # flair_recognizer = FlairRecognizer()
    # registry.load_predefined_recognizers()
    # registry.add_recognizer(flair_recognizer)
    # analyzer = AnalyzerEngine(registry=registry, supported_languages=["en"])
    return analyzer
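# Illustrative usage (hypothetical input, not executed here): the cached engine
# can be queried directly; each match is a RecognizerResult exposing
# entity_type, start, end, and score, e.g.
#   analyzer_engine().analyze(text="ip: 192.168.2.80", language="en")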
@st.cache(allow_output_mutation=True)
def anonymizer_engine():
    """Return AnonymizerEngine."""
    return AnonymizerEngine()
def get_supported_entities():
    """Return supported entities from the Analyzer Engine."""
    return analyzer_engine().get_supported_entities()
def analyze(**kwargs):
    """Analyze input using Analyzer engine and input arguments (kwargs)."""
    if "entities" not in kwargs or "All" in kwargs["entities"]:
        kwargs["entities"] = None
    return analyzer_engine().analyze(**kwargs)
def anonymize(text, analyze_results):
    """Anonymize identified input using Presidio Anonymizer."""
    if not text:
        return
    res = anonymizer_engine().anonymize(text, analyze_results)
    return res.text
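# Note: since no operator configuration is passed, Presidio falls back to its
# default "replace" operator, substituting each match with its entity type,
# e.g. "ip: 192.168.2.80" becomes "ip: <IP_ADDRESS>".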
def annotate(text, st_analyze_results, st_entities):
    """Convert analyzer results into the token list annotated_text expects."""
    tokens = []
    # sort by start index
    results = sorted(st_analyze_results, key=lambda x: x.start)
    if not results:
        # no entities found: show the raw text without annotations
        return [text]
    for i, res in enumerate(results):
        if i == 0:
            tokens.append(text[:res.start])
        # append entity text and entity type
        tokens.append((text[res.start: res.end], res.entity_type))
        # if another entity is coming, i.e. we're not at the last results
        # element, add the text up to the next entity
        if i != len(results) - 1:
            tokens.append(text[res.end:results[i+1].start])
        # if no more entities are coming, add all remaining text
        else:
            tokens.append(text[res.end:])
    return tokens
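# Illustrative output (assuming a single IP_ADDRESS match): for the text
# "ip: 192.168.2.80", annotate returns
#   ["ip: ", ("192.168.2.80", "IP_ADDRESS"), ""]
# i.e. plain strings for unlabeled spans and (text, label) tuples for entities,
# which is the input format annotated_text expects.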
st.set_page_config(page_title="Bitahoy demo", layout="wide")
# Side bar
# add logo image to the sidebar
st.sidebar.image("assets/bitahoy-logo.png", width=200)
st_entities = st.sidebar.multiselect(
    label="Which entities to look for?",
    options=get_supported_entities(),
    default=list(get_supported_entities()),
)
st_threshold = st.sidebar.slider(
    label="Acceptance threshold", min_value=0.0, max_value=1.0, value=0.35
)
st_return_decision_process = st.sidebar.checkbox(
    "Add analysis explanations in json")
st.sidebar.markdown(
    """
Detect and anonymize PII in text using an [NLP model](https://huggingface.co/beki/en_spacy_pii_distilbert) trained on protocol traces (JSON, SQL, XML etc.) generated by
[Privy](https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy) and rule-based classifiers from [Presidio](https://aka.ms/presidio).
"""
)
st.sidebar.info(
    "Privy is an open source framework for synthetic data generation in protocol trace formats (JSON, SQL, HTML etc.). "
    "Presidio is an open source framework for PII detection and anonymization. "
    "For more info, visit [Privy](https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy) and [aka.ms/presidio](https://aka.ms/presidio)."
)
# Main panel
analyzer_load_state = st.info(
    "Starting analyzer and loading model...")
engine = analyzer_engine()
analyzer_load_state.empty()
# two-column layout
# Store the initial value of widgets in session state
if "visibility" not in st.session_state:
st.session_state.visibility = "visible"
st.session_state.disabled = False
col1, col2 = st.columns(2)
with col1:
    # st.radio(
    #     "Set selectbox label visibility 👉",
    #     key="visibility",
    #     options=["visible", "hidden", "collapsed"],
    # )
    st_text = st.text_area(
        label="Type in some text",
        value="SELECT shipping FROM users WHERE shipping = '201 Thayer St Providence RI 02912'"
        "\n\n"
        "{user: Willie Porter, ip: 192.168.2.80, email: [email protected]}",
        height=200,
    )
with col2:
    st.checkbox("Enable/Disable selectbox widget", key="disabled")
    titles, urls, jsons = load_data("assets/data_sorted.csv")
    option_list = titles
    option = st.selectbox(
        "How would you like to be contacted?",
        option_list,
        # label_visibility=st.session_state.visibility,
        disabled=st.session_state.disabled,
    )
    st.write('You selected:', option)
# end of col
button = st.button("Detect PII")
if 'first_load' not in st.session_state:
    st.session_state['first_load'] = True
# Results
st.subheader("Analyzed")
with st.spinner("Analyzing..."):
if button or st.session_state.first_load:
st_analyze_results = analyze(
text=st_text,
entities=st_entities,
language="en",
score_threshold=st_threshold,
return_decision_process=st_return_decision_process,
)
annotated_tokens = annotate(st_text, st_analyze_results, st_entities)
# annotated_tokens
annotated_text(*annotated_tokens)
# vertical space
st.text("")

st.subheader("Anonymized")
with st.spinner("Anonymizing..."):
    if button or st.session_state.first_load:
        st_anonymize_results = anonymize(st_text, st_analyze_results)
        # bare expression: Streamlit "magic" renders the anonymized text
        st_anonymize_results
# table result
st.subheader("Detailed Findings")
if st_analyze_results:
    res_dicts = [r.to_dict() for r in st_analyze_results]
    for d in res_dicts:
        d['Value'] = st_text[d['start']:d['end']]
    df = pd.DataFrame.from_records(res_dicts)
    df = df[["entity_type", "Value", "score", "start", "end"]].rename(
        {
            "entity_type": "Entity type",
            "start": "Start",
            "end": "End",
            "score": "Confidence",
        },
        axis=1,
    )
    st.dataframe(df, width=1000)
else:
    st.text("No findings")
# keep re-running the analysis on subsequent reruns (e.g. on widget changes)
st.session_state['first_load'] = True
# json result
class ToDictListEncoder(JSONEncoder):
    """JSON encoder that serializes objects via their to_dict() method."""

    def default(self, o):
        """Encode to JSON using to_dict."""
        if o:
            return o.to_dict()
        return []

if st_return_decision_process:
    st.json(json.dumps(st_analyze_results, cls=ToDictListEncoder))
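# When the "Add analysis explanations in json" box is checked, the analyzer was
# called with return_decision_process=True, so each serialized result also
# carries its analysis explanation (e.g. which recognizer fired and the
# original vs. final score).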