# Streamlit app to highlight NER entities
import random
import streamlit as st
from datasets import load_dataset
from annotated_text import annotated_text
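
# Each example is assumed (inferred from the field accesses below) to carry
# an "ner_output" dict with parallel "tokens" and "labels" lists (BIO-style
# tags), plus an "entities" list of dicts with "text", "type", "start",
# "end", and a "kg_result" holding a Knowledge Graph description.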


# Show highlighted NER entities in a tweet
def display_ner(example):
ner_output = example["ner_output"]
chunks = []
current_chunk = ""
current_type = None
    for token, label in zip(ner_output["tokens"], ner_output["labels"]):
        if label is None:
            # Unlabeled token, possibly because the input was truncated
            continue
        if label == "O":
            if current_type is not None:
                # An entity just ended: flush it and start a plain-text chunk
                chunks.append((current_chunk.strip(), current_type))
                current_chunk = token + " "
            else:
                current_chunk += token + " "
            current_type = None
elif label.startswith("B-"):
if current_chunk:
chunks.append((current_chunk.strip(), current_type))
current_chunk = token + " "
current_type = label[2:]
elif label.startswith("I-"):
current_chunk += token + " "
current_type = label[2:]
        else:
            # Bare label without a B-/I- prefix: start a new chunk on a
            # label change, otherwise extend the current one
            if label != current_type:
                if current_chunk:
                    chunks.append((current_chunk.strip(), current_type))
                current_chunk = token + " "
            else:
                current_chunk += token + " "
            current_type = label
    if current_chunk:
        chunks.append((current_chunk.strip(), current_type))
    # annotated_text takes plain strings for unlabeled text and
    # (text, label) tuples for entities to highlight
    chunks = [(c, t) if t is not None else c for c, t in chunks]
    annotated_text(*chunks)
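
# Worked example (hypothetical input) of the BIO chunking above:
#   tokens = ["I", "live", "in", "New", "York"]
#   labels = ["O", "O", "O", "B-LOC", "I-LOC"]
# produces chunks ["I live in", ("New York", "LOC")]: plain text followed
# by a span that annotated_text highlights with its "LOC" label.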


def display_text(example, text_column):
    # Use annotated_text to show entities
    text = example[text_column]
    # Recompute offsets from the text itself; str.find returns the first
    # occurrence, so repeated mentions all resolve to the same span
    for entity in example["entities"]:
        start = text.find(entity["text"])
        entity["start"] = start
        entity["end"] = start + len(entity["text"])
    # Drop entities that were not found, then sort by start offset
    entities = sorted(
        (e for e in example["entities"] if e["start"] != -1),
        key=lambda x: x["start"],
    )
    # Chunk the text into plain spans and entity spans
    if len(entities) == 0:
        annotated_text(text)
        return
    chunks = []
    last_index = 0
    for entity in entities:
        start, end = entity["start"], entity["end"]
        if start < last_index:
            # Skip spans overlapping an entity that was already emitted
            continue
        if last_index < start:
            chunks.append((text[last_index:start], None))
        chunks.append((entity["text"], entity["type"]))
        last_index = end
    if last_index < len(text):
        chunks.append((text[last_index:], None))
chunks = [(c, t) if t is not None else c for c, t in chunks]
annotated_text(*chunks)
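
# Note: display_text is currently unused (its call in the main loop below is
# commented out); a hypothetical invocation would look like
#   display_text(example, text_column[selected_dataset])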

# Sidebar selectbox to choose the dataset
selected_dataset = st.sidebar.selectbox(
"Select dataset", ["hateval_enriched", "sbf-enriched", "hatecheck-enriched"]
)
# Load data
ds = load_dataset(f"hs-knowledge/{selected_dataset}")
text_column = {
"hateval_enriched": "text",
"sbf-enriched": "post",
"hatecheck-enriched": "test_case",
}
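# text_column is only needed by display_text: it maps each dataset to the
# field holding the raw post, while display_ner reads "ner_output" directly.
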
# Sample up to 50 distinct examples to display (random.choices would draw
# with replacement and could repeat rows)
elements = random.sample(range(len(ds["train"])), k=min(50, len(ds["train"])))
ds["train"] = ds["train"].select(elements)
for ex in ds["train"]:
    # display_text(ex, text_column[selected_dataset])
st.markdown("---")
display_ner(ex)
with st.expander("Show entities"):
for ent in ex["entities"]:
entity_name = ent["text"]
entity_type = ent["type"]
entity_description = ent["kg_result"]["detailedDescription"]["articleBody"]
st.write(f"{entity_name} ({entity_type}): {entity_description}")