# Residue of the web file viewer this script was extracted from:
# Space status "Sleeping"; file size 6,722 bytes; revision 50dc05d.
# (The viewer's line-number gutter, 1-214, is omitted.)
import html

import nltk
import streamlit as st
from nltk.tokenize import sent_tokenize
from transformers import pipeline
# Page configuration stays the first Streamlit call in the script.
st.set_page_config(page_title="Relation Extraction App", page_icon="🔍", layout="wide")

# Tokenizer data used by ``sent_tokenize``.
nltk.download("punkt")


@st.cache_resource
def _load_relation_pipeline():
    """Build the sentence-level relation classifier once and reuse it.

    Streamlit re-executes this script on every widget interaction; without
    the resource cache the model would be re-instantiated on each rerun.
    """
    # NOTE(review): recent transformers releases deprecate
    # ``return_all_scores`` in favour of ``top_k=None`` -- confirm against the
    # pinned version before switching.
    return pipeline(
        "text-classification",
        model="harshildarji/privacy-policy-relation-extraction",
        return_all_scores=True,
        framework="pt",
    )


@st.cache_resource
def _load_ner_pipeline():
    """Build the token-classification (NER) pipeline once and reuse it."""
    return pipeline(
        "token-classification",
        model="PaDaS-Lab/gdpr-privacy-policy-ner",
        aggregation_strategy="simple",
        framework="pt",
    )


# Module-level handles used by classify_sentences / get_ner_annotations.
relation_pipe = _load_relation_pipeline()
ner_pipe = _load_ner_pipeline()
# Short entity-group codes mapped to the human-readable labels that
# annotate_sentence shows in its tooltips (unknown codes fall back to the
# code itself at the lookup site).
classes_gdpr = {
    # Parties
    "DC": "Data Controller",
    "DP": "Data Processor",
    "DPO": "Data Protection Officer",
    "R": "Recipient",
    "TP": "Third Party",
    "A": "Authority",
    "DS": "Data Subject",
    "DSO": "Data Source",
    # Purposes, processing and data categories
    "RP": "Required Purpose",
    "NRP": "Not-Required Purpose",
    "P": "Processing",
    "NPD": "Non-Personal Data",
    "PD": "Personal Data",
    # Safeguards
    "OM": "Organisational Measure",
    "TM": "Technical Measure",
    # Legal grounds
    "LB": "Legal Basis",
    "CONS": "Consent",
    "CONT": "Contract",
    "LI": "Legitimate Interest",
    # Miscellaneous
    "ADM": "Automated Decision Making",
    "RET": "Retention",
    "SEU": "Scale EU",
    "SNEU": "Scale Non-EU",
    # Rights
    "RI": "Right",
    "DSR15": "Art. 15 Right of access by the data subject",
    "DSR16": "Art. 16 Right to rectification",
    "DSR17": "Art. 17 Right to erasure (‘right to be forgotten’)",
    "DSR18": "Art. 18 Right to restriction of processing",
    # NOTE(review): unlike its siblings this label has no "Art. 19" prefix --
    # confirm whether that is intentional.
    "DSR19": "Notification obligation regarding rectification or erasure of personal data or restriction of processing",
    "DSR20": "Art. 20 Right to data portability",
    "DSR21": "Art. 21 Right to object",
    "DSR22": "Art. 22 Automated individual decision-making, including profiling",
    "LC": "Lodge Complaint",
}
@st.cache_data
def classify_sentences(text):
    """Split *text* into sentences and classify each one.

    Sentences come from NLTK's ``sent_tokenize``; the whole list is passed to
    ``relation_pipe`` in a single call. The result is memoised through
    ``st.cache_data``.

    Returns:
        A ``(sentences, results)`` tuple, where ``results`` is whatever
        ``relation_pipe`` returns for the sentence list.
    """
    split_text = sent_tokenize(text)
    return split_text, relation_pipe(split_text)
@st.cache_data
def get_ner_annotations(sentence):
    """Return the output of ``ner_pipe`` for *sentence* (memoised via ``st.cache_data``)."""
    return ner_pipe(sentence)
def annotate_sentence(sentence, ner_results):
    """Render *sentence* as HTML with every recognised entity underlined.

    Each entity is wrapped in a ``tooltip`` span whose nested ``tooltiptext``
    span carries the readable label from ``classes_gdpr`` (falling back to the
    raw ``entity_group`` code when it is not in the mapping).

    Args:
        sentence: The text that the ``start``/``end`` offsets refer to.
        ner_results: Iterable of dicts with ``entity_group``, ``start`` and
            ``end`` keys, expected in left-to-right order.

    Returns:
        An HTML fragment. All text taken from *sentence* and the labels is
        HTML-escaped, because callers render the result with
        ``unsafe_allow_html=True`` and the sentence originates from user input.
    """
    # Collect [start, end, label] spans, fusing a span into its predecessor
    # only when both carry the same label AND nothing but whitespace lies
    # between them. Fusing on label alone would underline unrelated words
    # sitting between two separate mentions of the same entity type.
    spans = []
    for ner in ner_results:
        group = ner["entity_group"]
        label = classes_gdpr.get(group, group)
        start, end = ner["start"], ner["end"]

        previous = spans[-1] if spans else None
        if (
            previous is not None
            and previous[2] == label
            and not sentence[previous[1]:start].strip()
        ):
            previous[1] = max(previous[1], end)
        else:
            spans.append([start, end, label])

    # The escaped text lands only in element content (never inside an
    # attribute value), so quotes can stay literal.
    pieces = []
    cursor = 0
    for start, end, label in spans:
        pieces.append(html.escape(sentence[cursor:start], quote=False))
        pieces.append(
            "<span class='tooltip' style='text-decoration: underline;'>"
            f"{html.escape(sentence[start:end], quote=False)}"
            f"<span class='tooltiptext'>{html.escape(label, quote=False)}</span></span>"
        )
        cursor = end
    pieces.append(html.escape(sentence[cursor:], quote=False))
    return "".join(pieces)
# Styles for the ``tooltip`` / ``tooltiptext`` spans emitted by
# annotate_sentence: the inner text is hidden until its parent is hovered.
_TOOLTIP_CSS = """
<style>
.tooltip {
position: relative;
display: inline-block;
}
.tooltip .tooltiptext {
visibility: hidden;
width: auto;
background-color: black;
color: #fff;
text-align: center;
border-radius: 6px;
padding: 5px;
position: absolute;
z-index: 1;
bottom: 125%;
left: 50%;
transform: translateX(-50%);
font-size: 12px;
white-space: nowrap;
}
.tooltip:hover .tooltiptext {
visibility: visible;
transition: visibility 0s linear 0s;
}
</style>
"""

st.markdown(_TOOLTIP_CSS, unsafe_allow_html=True)
def get_top_labels(results, top_n=2):
    """Keep the *top_n* highest-scoring entries of every per-sentence result.

    Args:
        results: Iterable of lists of dicts, each dict carrying a ``score`` key.
        top_n: How many entries to keep per inner list.

    Returns:
        A list parallel to *results*; each element is the corresponding inner
        list sorted by descending ``score`` and sliced to ``[:top_n]``.
    """
    return [
        sorted(candidates, key=lambda item: item["score"], reverse=True)[:top_n]
        for candidates in results
    ]
# Page heading and the sidebar heading under which label buttons appear.
st.title("Relation Extraction App")
st.sidebar.title("Identified relation labels:")

# Sample text pre-filled into the input box.
_DEFAULT_TEXT = (
    "We may use these technologies to collect information when you interact with services we offer through one of our partners, such as advertising and commerce features. "
    "Most web browsers are set to accept cookies by default. "
    "It is up to you to move or reject browser cookies through the settings on your browser or device. "
    "Removing or rejecting cookies may affect our service function and availability."
)

text = st.text_area("Enter your text here:", value=_DEFAULT_TEXT)
# ``st.button`` is the left operand, so the button is created on every run;
# the analysis itself only happens on a click with non-empty text.
if st.button("Analyze") and text:
    sentences, results = classify_sentences(text)
    top_labels = get_top_labels(results, top_n=2)

    # Group (sentence, score) pairs under each of the sentence's top labels.
    labels_dict = {}
    for sentence, candidates in zip(sentences, top_labels):
        for candidate in candidates:
            labels_dict.setdefault(candidate["label"], []).append(
                (sentence, candidate["score"])
            )

    # Stored in session state so the grouping is still available on later reruns.
    st.session_state.labels_dict = labels_dict
# Usage notes shown only while no analysis result is stored in the session.
_USAGE_HINT_HTML = """
<style>
.hint {
color: rgba(41, 134, 204, 0.6);
font-size: 16px;
}
</style>
<h4 class="hint">Notes:</h4>
<ul class="hint">
<li>Enter text in the text area above,</li>
<li>The relation labels will be displayed in the sidebar,</li>
<li>Click on any label to see the corresponding sentences,</li>
<li>In displayed sentences, hover over underlined words to see their corresponding NER tag.</li>
</ul>
"""

if "labels_dict" not in st.session_state:
    st.markdown(_USAGE_HINT_HTML, unsafe_allow_html=True)
# With a stored analysis, list every relation label as a sidebar button and
# show the sentences of whichever label was clicked in this run.
if "labels_dict" in st.session_state:
    labels_dict = st.session_state.labels_dict
    for label, scored_sentences in labels_dict.items():
        # The button must be created for every label; only a clicked one
        # proceeds to render its sentences.
        if not st.sidebar.button(label):
            continue

        st.markdown(
            f"Sentences with relation label: <strong><span style='color: #FF4B4B; font-size: 1.2em;'>{label}</span></strong>",
            unsafe_allow_html=True,
        )
        for sentence, score in scored_sentences:
            entities = get_ner_annotations(sentence)
            annotated_sentence = annotate_sentence(sentence, entities)
            st.markdown(
                f"<div style='background-color: rgba(143, 203, 249, 0.1); padding: 10px; border-radius: 7px; margin: 5px 0;'>{annotated_sentence} <span style='color: #C71585; font-weight: bold;'>({score:.2f})</span></div>",
                unsafe_allow_html=True,
            )
|