import streamlit as st
from annotated_text import annotated_text
from refined.inference.processor import Refined
import requests
import spacy

# Page config
st.set_page_config(
    page_title="Entity Linking by WordLift",
    page_icon="fav-ico.png",
    layout="wide",
    initial_sidebar_state="collapsed",
    menu_items={
        'Get Help': 'https://wordlift.io/book-a-demo/',
        'About': "# This is a demo app for NEL/NED/NER and SEO"
    }
)

# Sidebar
st.sidebar.image("logo-wordlift.png")
# Use a list (not a set) so the option order is stable and index=0 is always "English"
language_options = ["English", "English - spaCy", "German"]
selected_language = st.sidebar.selectbox("Select the Language", language_options, index=0)

# Initialize model and entity set variables
selected_model_name = None
selected_entity_set = None

# Based on the selected language, configure the model, entity set, and citation options
if selected_language in ("German", "English - spaCy"):
    entity_fishing_citation = """
    @misc{entity-fishing,
    title = {entity-fishing},
    publisher = {GitHub},
    year = {2016--2023},
    archivePrefix = {swh},
    eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
    }
    """
    with st.sidebar.expander('Citations'):
        st.markdown(entity_fishing_citation)
else:  # English (ReFinED)
    model_options = ["aida_model", "wikipedia_model_with_numbers"]
    entity_set_options = ["wikidata", "wikipedia"]
    selected_model_name = st.sidebar.selectbox("Select the Model", model_options)
    selected_entity_set = st.sidebar.selectbox("Select the Entity Set", entity_set_options)
    refined_citation = """
    @inproceedings{ayoola-etal-2022-refined,
    title = "{R}e{F}in{ED}: An Efficient Zero-shot-capable Approach to End-to-End Entity Linking",
    author = "Tom Ayoola, Shubhi Tyagi, Joseph Fisher, Christos Christodoulopoulos, Andrea Pierleoni",
    booktitle = "NAACL",
    year = "2022"
    }
    """
    with st.sidebar.expander('Citations'):
        st.markdown(refined_citation)

# Cache the loaded model so it is not re-initialized on every Streamlit rerun
@st.cache_resource
def load_model(selected_language, model_name=None, entity_set=None):
    # Define the public URL for the entity-fishing service
    entity_fishing_url = "https://cloud.science-miner.com/nerd/service"
    if selected_language == "German":
        # Load the German spaCy model
        nlp_model_de = spacy.load("de_core_news_lg")
        # Add the entity-fishing pipe with the server URL configured
        nlp_model_de.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
        return nlp_model_de
    elif selected_language == "English - spaCy":
        # Load the English spaCy model
        nlp_model_en = spacy.load("en_core_web_sm")
        # Add the entity-fishing pipe with the server URL configured
        nlp_model_en.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
        return nlp_model_en
    else:  # English (ReFinED)
        # Load the pretrained ReFinED model with the selected entity set
        refined_model = Refined.from_pretrained(model_name=model_name, entity_set=entity_set)
        return refined_model

# Use the cached model
# We pass the selected options directly to the cached function;
# Streamlit's caching re-runs it only when the inputs change
model = load_model(selected_language, selected_model_name, selected_entity_set)

# Helper functions
def get_wikidata_id(entity_id_string):
    # Handles IDs like "wikidata:Q123", "wikidata=Q123", or a bare "Q123"
    entity_id = entity_id_string.split(":")[-1].split("=")[-1]
    entity_link = "http://www.wikidata.org/entity/" + entity_id
    return {"id": entity_id, "link": entity_link}

def get_entity_data(entity_link):
    try:
        # The WordLift id endpoint expects the scheme collapsed into the path,
        # e.g. "http://www.wikidata.org/..." becomes "http/www.wikidata.org/..."
        formatted_link = entity_link.replace("http://", "http/")
        response = requests.get(f'https://api.wordlift.io/id/{formatted_link}')
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.json()
    except requests.exceptions.RequestException as e:
        st.warning(f"Could not fetch data for entity: {entity_link}. Error: {e}")
        return None
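
# Sketch of the response this app relies on: the payload is schema.org JSON-LD,
# and only "@type" is read below; the other fields shown here are assumptions:
#   {"@type": "Person", "@id": "http://www.wikidata.org/entity/Q567", ...}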

# Create the form
with st.form(key='my_form'):
    text_input = st.text_area(label='Enter a sentence', value="Angela Merkel was the first female chancellor of Germany.")
    submit_button = st.form_submit_button(label='Analyze')

# Initialization
entities_map = {}
entities_data = {}

if text_input:
    if selected_language in ["German", "English - spaCy"]:
        doc = model(text_input)
        spacy_entities = [(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata) for ent in doc.ents]
        for entity in spacy_entities:
            entity_string, entity_type, wikidata_id, wikidata_url = entity
            if wikidata_url:
                # Normalize to the canonical Wikidata entity URI
                formatted_wikidata_url = wikidata_url.replace("https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/")
                entities_map[entity_string] = {"id": wikidata_id, "link": formatted_wikidata_url}
                entity_data = get_entity_data(formatted_wikidata_url)
                if entity_data is not None:
                    entities_data[entity_string] = entity_data
    else:  # English (ReFinED)
        refined_entities = model.process_text(text_input)
        for entity in refined_entities:
            # Access the span's attributes directly instead of parsing its string
            # representation; predicted_entity is None for unlinked mentions
            if entity.predicted_entity and entity.predicted_entity.wikidata_entity_id:
                entity_text = entity.text
                wikidata_info = get_wikidata_id(entity.predicted_entity.wikidata_entity_id)
                entities_map[entity_text] = wikidata_info
                entity_data = get_entity_data(wikidata_info["link"])
                if entity_data is not None:
                    entities_data[entity_text] = entity_data

combined_entity_info_dictionary = {
    k: [entities_map[k], entities_data.get(k)] for k in entities_map
}
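
# Illustrative shape of the combined dictionary (values are made up):
#   {"Angela Merkel": [{"id": "Q567", "link": "http://www.wikidata.org/entity/Q567"},
#                      {"@type": "Person", ...}]}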

if submit_button:
    # Build the annotated_text list without using eval()
    final_text = []
    current_pos = 0
    # Collect (text, start_char, end_char) tuples for sorting
    entity_spans = []
    if selected_language in ["German", "English - spaCy"]:
        # 'doc' is available from the processing block above
        for ent in doc.ents:
            if ent.text in entities_map:  # only include linked entities
                entity_spans.append((ent.text, ent.start_char, ent.end_char))
    else:
        # 'refined_entities' is available; a ReFinED span exposes a character
        # offset (start) and a length (ln) rather than an explicit end offset
        for ent in refined_entities:
            if ent.text in entities_map:
                entity_spans.append((ent.text, ent.start, ent.start + ent.ln))

    # Sort entities by their starting position so segments are emitted in order
    sorted_entities = sorted(entity_spans, key=lambda x: x[1])

    for entity_string, start, end in sorted_entities:
        # Add the plain-text segment before the current entity
        final_text.append(text_input[current_pos:start])
        # Prepare the annotation for the entity
        entity_info = entities_map.get(entity_string, {})
        entity_id = entity_info.get("id", "N/A")
        entity_type_data = entities_data.get(entity_string)
        entity_type = entity_type_data.get("@type") if entity_type_data else None
        color = {"Place": "#8AC7DB", "Organization": "#ADD8E6", "Person": "#67B7D1",
                 "Product": "#2ea3f2", "CreativeWork": "#00BFFF", "Event": "#1E90FF"}.get(entity_type, "#8ef")
        final_text.append((entity_string, entity_id, color))
        current_pos = end

    # Add any remaining text after the last entity
    final_text.append(text_input[current_pos:])
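
    # For the default sentence, final_text should end up roughly like this
    # (IDs and colors are illustrative):
    #   ["", ("Angela Merkel", "Q567", "#67B7D1"),
    #    " was the first female chancellor of ", ("Germany", "Q183", "#8AC7DB"), "."]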

    st.header("Annotated Text")
    annotated_text(*[item for item in final_text if item])  # Filter out empty strings

    # --- JSON-LD Generation ---
    json_ld_data = {
        "@context": "https://schema.org",
        "@type": "WebPage",
        "mentions": []
    }
    for entity_string, info_list in combined_entity_info_dictionary.items():
        entity_json_ld = info_list[1]  # The data from the WordLift API
        if entity_json_ld:
            json_ld_data["mentions"].append(entity_json_ld)

    with st.expander("See annotations"):
        st.write(combined_entity_info_dictionary)

    with st.expander("Here is the final JSON-LD"):
        st.json(json_ld_data)
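
    # The resulting JSON-LD should look roughly like this (the mention payloads
    # depend on what the WordLift API returns; the fields shown are assumptions):
    #   {"@context": "https://schema.org", "@type": "WebPage",
    #    "mentions": [{"@type": "Person", "name": "Angela Merkel", ...}]}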