omri374 committed on
Commit
90730f5
1 Parent(s): 7a60951

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +153 -0
app.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Streamlit app for Presidio."""
2
+
3
+ import json
4
+ from json import JSONEncoder
5
+
6
+ import pandas as pd
7
+ import streamlit as st
8
+ from presidio_analyzer import AnalyzerEngine
9
+ from presidio_anonymizer import AnonymizerEngine
10
+
11
+ import spacy
12
# This statement sits at module top level, so it runs every time the script
# is executed. Only fetch the spaCy model when it is not already installed,
# instead of invoking the downloader unconditionally on each run.
if not spacy.util.is_package("en_core_web_lg"):
    spacy.cli.download("en_core_web_lg")
13
+
14
+
15
+ # Helper methods
16
@st.cache(allow_output_mutation=True)
def analyzer_engine():
    """Create the Presidio ``AnalyzerEngine`` used by the app.

    The function is wrapped in ``st.cache`` so the engine object is reused
    rather than rebuilt on each call.

    Note: an earlier, disabled variant built the engine from a
    ``RecognizerRegistry`` holding a transformers-based recognizer
    (``AnalyzerEngine(registry=registry)``); the default engine is used here.
    """
    default_engine = AnalyzerEngine()
    return default_engine
30
+
31
+
32
@st.cache(allow_output_mutation=True)
def anonymizer_engine():
    """Create the Presidio ``AnonymizerEngine`` used by the app.

    Wrapped in ``st.cache`` so the same engine object is reused across calls.
    """
    default_engine = AnonymizerEngine()
    return default_engine
36
+
37
+
38
def get_supported_entities():
    """List the entity types the analyzer engine reports as supported."""
    engine = analyzer_engine()
    return engine.get_supported_entities()
41
+
42
+
43
def analyze(**kwargs):
    """Analyze input with the analyzer engine using the given keyword arguments.

    All keyword arguments are forwarded to ``AnalyzerEngine.analyze``. The
    ``entities`` argument is normalized first: when it is missing, ``None``,
    or contains the pseudo-entity ``"All"``, it is replaced with ``None``
    before the call.

    Returns:
        Whatever ``AnalyzerEngine.analyze`` returns for the given arguments.
    """
    requested_entities = kwargs.get("entities")
    # Check for None before the membership test: ``"All" in None`` would
    # raise TypeError when a caller passes ``entities=None`` explicitly.
    if requested_entities is None or "All" in requested_entities:
        kwargs["entities"] = None
    return analyzer_engine().analyze(**kwargs)
48
+
49
+
50
def anonymize(text, analyze_results):
    """Anonymize the identified spans in ``text`` with the Presidio Anonymizer.

    Args:
        text: The original input text.
        analyze_results: Analyzer findings for ``text``, passed straight to
            the anonymizer engine.

    Returns:
        The ``text`` attribute of the anonymizer engine's result.
    """
    return anonymizer_engine().anonymize(text, analyze_results).text
55
+
56
+
57
st.set_page_config(layout="wide", page_title="Presidio demo")

# Sidebar: short intro followed by the analysis controls.
sidebar = st.sidebar
sidebar.markdown(
    """
Anonymize PII entities with [presidio](https://aka.ms/presidio).
"""
)

# Entity picker; every supported entity is selected by default.
st_entities = sidebar.multiselect(
    label="Which entities to look for?",
    options=get_supported_entities(),
    default=list(get_supported_entities()),
)

# Score threshold forwarded to the analyzer further down the script.
st_threhsold = sidebar.slider(
    label="Acceptance threshold",
    min_value=0.0,
    max_value=1.0,
    value=0.35,
)

st_return_decision_process = sidebar.checkbox(
    "Add analysis explanations in json"
)

sidebar.info(
    "Presidio is an open source framework for PII detection and anonymization. "
    "For more info visit [aka.ms/presidio](https://aka.ms/presidio)"
)
82
+
83
+
84
+ # Main panel
85
# Show a notice while the analyzer engine is obtained, then clear it.
analyzer_load_state = st.info("Starting Presidio analyzer...")
engine = analyzer_engine()
analyzer_load_state.empty()


# Two side-by-side columns: input on the left, output on the right.
col1, col2 = st.columns(2)

# Left column: the text to analyze.
col1.subheader("Input string:")
st_text = col1.text_area(
    label="Enter text",
    value="Type in some text, like a phone number (212-141-4544) or a name (Lebron James).",
    height=400,
)

# Right column: the anonymized text.
col2.subheader("Output:")

# Collect the sidebar selections into the analyzer call arguments.
analyze_kwargs = {
    "text": st_text,
    "entities": st_entities,
    "language": "en",
    "score_threshold": st_threhsold,
    "return_decision_process": st_return_decision_process,
}
st_analyze_results = analyze(**analyze_kwargs)
st_anonymize_results = anonymize(st_text, st_analyze_results)
col2.text_area(label="", value=st_anonymize_results, height=400)
115
+
116
+
117
+ # table result
118
st.subheader("Findings")
if not st_analyze_results:
    st.text("No findings")
else:
    # Columns to show, mapped to their display titles (order is preserved).
    column_titles = {
        "entity_type": "Entity type",
        "start": "Start",
        "end": "End",
        "score": "Confidence",
    }
    records = [finding.to_dict() for finding in st_analyze_results]
    df = pd.DataFrame.from_records(records)
    df = df[list(column_titles)].rename(columns=column_titles)

    st.dataframe(df, width=1000)
134
+
135
+
136
+ # json result
137
class ToDictEncoder(JSONEncoder):
    """JSON encoder for objects that expose a ``to_dict()`` method."""

    def default(self, o):
        """Return ``o.to_dict()`` for objects the base encoder cannot handle.

        Objects without a callable ``to_dict`` are passed to the base class,
        which raises ``TypeError`` as the ``json`` module expects, instead of
        leaking an ``AttributeError`` from the attribute lookup.
        """
        to_dict = getattr(o, "to_dict", None)
        if callable(to_dict):
            return to_dict()
        return super().default(o)
143
+
144
+
145
# Serialize the analyzer findings (via their to_dict()) and render them as JSON.
findings_json = json.dumps(st_analyze_results, cls=ToDictEncoder)
st.json(findings_json)
146
+
147
+
148
+
149
+
150
+ import gradio as gr
151
+
152
+ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
153
+