Spaces:

hs-knowledge
/

ner_app

Sleeping

App Files Files Community

ner_app / app.py

finiteautomata

Reuse NER stuff

a188b38 about 2 years ago

raw

history blame

3.88 kB

	# Streamlit app to highlight NER entities
	import random
	import streamlit as st
	from datasets import load_dataset
	from annotated_text import annotated_text

	# Load the "hs-knowledge/hateval_enriched" dataset with datasets.load_dataset.
	# The rest of this script reads its "train" split; each example is expected to
	# provide the "ner_output", "entities" and "text" fields used below.
	ds = load_dataset("hs-knowledge/hateval_enriched")


	def display_ner(example):
	    """Render one example's token-level NER tags as highlighted text.

	    Args:
	        example: Mapping with a "ner_output" entry that holds two parallel
	            sequences, "tokens" and "labels". A label may be None, "O",
	            "B-<type>", "I-<type>" or a bare "<type>" string.

	    Returns:
	        None. The grouped chunks are passed to ``annotated_text``: plain
	        strings for untagged text, ``(text, type)`` tuples for entities.
	    """
	    ner_output = example["ner_output"]
	    annotated_text(*_ner_chunks(ner_output["tokens"], ner_output["labels"]))


	def _ner_chunks(tokens, labels):
	    """Group consecutive tokens sharing an entity type into display chunks."""
	    groups = []  # list of (entity_type_or_None, [tokens])
	    for token, label in zip(tokens, labels):
	        if label is None:
	            # Token without a prediction (presumably past the tagger's
	            # length limit -- TODO confirm); it is not displayed.
	            continue

	        if label == "O":
	            entity_type, forces_new = None, False
	        elif label.startswith("B-"):
	            # A B- tag always opens a new entity, even right after another
	            # entity of the same type.
	            entity_type, forces_new = label[2:], True
	        elif label.startswith("I-"):
	            # An I- tag continues the current entity only when the type
	            # matches; otherwise it opens a new span instead of swallowing
	            # the preceding text into the entity.
	            entity_type, forces_new = label[2:], False
	        else:
	            # Bare label without a B-/I- prefix: group by equal type.
	            entity_type, forces_new = label, False

	        if forces_new or not groups or groups[-1][0] != entity_type:
	            groups.append((entity_type, [token]))
	        else:
	            groups[-1][1].append(token)

	    chunks = []
	    for entity_type, words in groups:
	        chunk_text = " ".join(words).strip()
	        # annotated_text takes plain strings for untagged text and
	        # (text, label) tuples for highlighted spans.
	        chunks.append(chunk_text if entity_type is None else (chunk_text, entity_type))
	    return chunks


	def display_text(example):
	    """Render an example's raw text with its entity mentions highlighted.

	    Args:
	        example: Mapping with a "text" string and an "entities" list whose
	            items provide "text", "type" and "start".

	    Returns:
	        None. The chunks are passed to ``annotated_text``: plain strings
	        for unannotated text, ``(mention, type)`` tuples for entities.

	    The stored "start" values are used only to order the entities. Each
	    mention is then located by searching the text forward from the end of
	    the previous mention, so repeated mentions map to successive
	    occurrences. Mentions that cannot be found past that point are skipped
	    and their text stays unhighlighted. The input entities are not
	    modified.
	    """
	    text = example["text"]

	    # Order by the dataset-provided offsets.
	    entities = sorted(example["entities"], key=lambda ent: ent["start"])

	    chunks = []
	    cursor = 0  # end of the last emitted chunk within `text`
	    for entity in entities:
	        mention = entity["text"]
	        # An empty mention would "match" everywhere; treat it as not found.
	        start = text.find(mention, cursor) if mention else -1
	        if start == -1:
	            continue

	        if cursor < start:
	            chunks.append(text[cursor:start])

	        entity_type = entity["type"]
	        # Untyped entities are shown as plain text, not as a highlight.
	        chunks.append(mention if entity_type is None else (mention, entity_type))
	        cursor = start + len(mention)

	    if cursor < len(text):
	        chunks.append(text[cursor:])

	    # With nothing to show (no placeable entity and empty text) still pass
	    # the text itself, as is done for examples without entities.
	    annotated_text(*(chunks or [text]))


	# Show a random sample of up to 50 distinct training examples.
	# random.sample draws without replacement, so no tweet is shown twice, and
	# the sample size is capped at the dataset size.
	sample_size = min(50, len(ds["train"]))
	elements = random.sample(range(len(ds["train"])), k=sample_size)
	ds["train"] = ds["train"].select(elements)

	for ex in ds["train"]:
	    # display_text(ex) is an alternative renderer based on entity mentions.
	    st.markdown("---")
	    display_ner(ex)
	    with st.expander("Show entities"):
	        for ent in ex["entities"]:
	            entity_name = ent["text"]
	            entity_type = ent["type"]
	            # Assumes the knowledge-graph result (or its description) may
	            # be absent or None for some entities -- TODO confirm against
	            # the dataset schema. Fall back to name and type only instead
	            # of crashing the page.
	            kg_result = ent.get("kg_result") or {}
	            detailed_description = kg_result.get("detailedDescription") or {}
	            entity_description = detailed_description.get("articleBody")
	            if entity_description:
	                st.write(f"{entity_name} ({entity_type}): {entity_description}")
	            else:
	                st.write(f"{entity_name} ({entity_type})")