import streamlit as st
from annotated_text import annotated_text
from refined.inference.processor import Refined
import requests
import spacy

# Page config
st.set_page_config(
    page_title="Entity Linking by WordLift",
    page_icon="fav-ico.png",
    layout="wide",
    initial_sidebar_state="collapsed",
    menu_items={
        'Get Help': 'https://wordlift.io/book-a-demo/',
        'About': "# This is a demo app for NEL/NED/NER and SEO"
    }
)
# Sidebar
st.sidebar.image("logo-wordlift.png")
# Use a list (not a set) so the option order is deterministic and index=0
# always defaults to English on the first run
language_options = ["English", "English - spaCy", "German"]
selected_language = st.sidebar.selectbox("Select the Language", language_options, index=0)
# Initialize model and entity set variables
selected_model_name = None
selected_entity_set = None
# Based on the selected language, configure the model, entity set, and citation options
if selected_language in ("German", "English - spaCy"):
    entity_fishing_citation = """
    @misc{entity-fishing,
        title = {entity-fishing},
        publisher = {GitHub},
        year = {2016--2023},
        archivePrefix = {swh},
        eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
    }
    """
    with st.sidebar.expander('Citations'):
        st.markdown(entity_fishing_citation)
else:  # English (ReFinED)
    model_options = ["aida_model", "wikipedia_model_with_numbers"]
    entity_set_options = ["wikidata", "wikipedia"]
    selected_model_name = st.sidebar.selectbox("Select the Model", model_options)
    selected_entity_set = st.sidebar.selectbox("Select the Entity Set", entity_set_options)

    refined_citation = """
    @inproceedings{ayoola-etal-2022-refined,
        title = "{R}e{F}in{ED}: An Efficient Zero-shot-capable Approach to End-to-End Entity Linking",
        author = "Ayoola, Tom and Tyagi, Shubhi and Fisher, Joseph and Christodoulopoulos, Christos and Pierleoni, Andrea",
        booktitle = "NAACL",
        year = "2022"
    }
    """
    with st.sidebar.expander('Citations'):
        st.markdown(refined_citation)
@st.cache_resource
def load_model(selected_language, model_name=None, entity_set=None):
    # Public URL of the hosted entity-fishing service
    entity_fishing_url = "https://cloud.science-miner.com/nerd/service"

    if selected_language == "German":
        # Load the German spaCy model
        nlp_model_de = spacy.load("de_core_news_lg")
        # Add the entity-fishing pipe, pointed at the hosted service
        nlp_model_de.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
        return nlp_model_de
    elif selected_language == "English - spaCy":
        # Load the English spaCy model
        nlp_model_en = spacy.load("en_core_web_sm")
        # Add the entity-fishing pipe, pointed at the hosted service
        nlp_model_en.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
        return nlp_model_en
    else:  # English (ReFinED)
        # Load the pretrained ReFinED model with the chosen entity set
        refined_model = Refined.from_pretrained(model_name=model_name, entity_set=entity_set)
        return refined_model
# Use the cached model
# We pass the selected options directly to the cached function
# Streamlit's caching handles re-running this only when the inputs change
model = load_model(selected_language, selected_model_name, selected_entity_set)
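
# Note on caching: st.cache_resource keys the cached object on the function's
# arguments, so each language/model/entity-set combination is loaded once;
# re-selecting a previously used combination is a cache hit.
# Illustrative (not executed):
#   load_model("English", "aida_model", "wikidata")  # first call: loads ReFinED
#   load_model("English", "aida_model", "wikidata")  # same args: returns cached model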
# Helper functions
def get_wikidata_id(entity_id_string):
    # Handles IDs like "wikidata:Q123" or "wikidata=Q123"
    entity_id = entity_id_string.split(":")[-1].split("=")[-1]
    entity_link = "http://www.wikidata.org/entity/" + entity_id
    return {"id": entity_id, "link": entity_link}
def get_entity_data(entity_link):
    try:
        # Rewrite the scheme so the entity URI can be embedded in the WordLift API path
        formatted_link = entity_link.replace("http://", "http/")
        response = requests.get(f'https://api.wordlift.io/id/{formatted_link}')
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.json()
    except requests.exceptions.RequestException as e:
        st.warning(f"Could not fetch data for entity: {entity_link}. Error: {e}")
        return None
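
# Illustrative example (not executed): "http://www.wikidata.org/entity/Q567"
# is fetched from https://api.wordlift.io/id/http/www.wikidata.org/entity/Q567,
# and the response is expected to be schema.org JSON-LD for that entity.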
# Create the input form
with st.form(key='my_form'):
    text_input = st.text_area(label='Enter a sentence', value="Angela Merkel was the first female chancellor of Germany.")
    submit_button = st.form_submit_button(label='Analyze')
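
# Note: widgets inside st.form do not trigger a rerun on every keystroke; the
# script only re-executes with the new text once the submit button is pressed.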
# Initialization
entities_map = {}
entities_data = {}

if text_input:
    if selected_language in ["German", "English - spaCy"]:
        doc = model(text_input)
        spacy_entities = [(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata) for ent in doc.ents]
        for entity in spacy_entities:
            entity_string, entity_type, wikidata_id, wikidata_url = entity
            if wikidata_url:
                formatted_wikidata_url = wikidata_url.replace("https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/")
                entities_map[entity_string] = {"id": wikidata_id, "link": formatted_wikidata_url}
                entity_data = get_entity_data(formatted_wikidata_url)
                if entity_data is not None:
                    entities_data[entity_string] = entity_data
    else:  # ReFinED model
        refined_entities = model.process_text(text_input)
        for entity in refined_entities:
            # Access entity attributes directly instead of parsing a string representation
            if entity.entity_id and "wikidata" in entity.entity_id:
                entity_text = entity.text
                wikidata_info = get_wikidata_id(entity.entity_id)
                entities_map[entity_text] = wikidata_info
                entity_data = get_entity_data(wikidata_info["link"])
                if entity_data is not None:
                    entities_data[entity_text] = entity_data

combined_entity_info_dictionary = {
    k: [entities_map[k], entities_data.get(k)] for k in entities_map
}
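
# Shape sketch of combined_entity_info_dictionary (illustrative values): each
# key is the entity's surface form; the value pairs the Wikidata reference with
# the WordLift JSON-LD (None if the lookup failed), e.g.
#   {"Angela Merkel": [{"id": "Q567", "link": "http://www.wikidata.org/entity/Q567"},
#                      {"@type": "Person", ...}]}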
if submit_button and text_input:
    # Build the annotated_text list robustly, without using eval()
    final_text = []
    current_pos = 0

    # Collect (text, start, end) spans so they can be sorted by position
    entity_spans = []
    if selected_language in ["German", "English - spaCy"]:
        # 'doc' is available from the processing block above
        for ent in doc.ents:
            if ent.text in entities_map:  # only include linked entities
                entity_spans.append((ent.text, ent.start_char, ent.end_char))
    else:
        # 'refined_entities' is available
        for ent in refined_entities:
            if ent.text in entities_map:
                entity_spans.append((ent.text, ent.span[0], ent.span[1]))

    # Sort entities by starting position so the text is reassembled in order
    sorted_entities = sorted(entity_spans, key=lambda x: x[1])

    for entity_string, start, end in sorted_entities:
        # Add the plain-text segment before the current entity
        final_text.append(text_input[current_pos:start])
        # Prepare the annotation for the entity
        entity_info = entities_map.get(entity_string, {})
        entity_id = entity_info.get("id", "N/A")
        entity_type_data = entities_data.get(entity_string)
        entity_type = entity_type_data.get("@type") if entity_type_data else None
        color = {"Place": "#8AC7DB", "Organization": "#ADD8E6", "Person": "#67B7D1",
                 "Product": "#2ea3f2", "CreativeWork": "#00BFFF", "Event": "#1E90FF"}.get(entity_type, "#8ef")
        final_text.append((entity_string, entity_id, color))
        current_pos = end

    # Add any remaining text after the last entity
    final_text.append(text_input[current_pos:])

    st.header("Annotated Text")
    annotated_text(*[item for item in final_text if item])  # Filter out empty strings
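
    # Shape sketch of final_text (illustrative): annotated_text renders plain
    # strings as-is and (text, label, color) tuples as highlighted chips, e.g.
    #   ["", ("Angela Merkel", "Q567", "#67B7D1"),
    #    " was the first female chancellor of ",
    #    ("Germany", "Q183", "#8AC7DB"), "."]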

    # --- JSON-LD Generation ---
    json_ld_data = {
        "@context": "https://schema.org",
        "@type": "WebPage",
        "mentions": []
    }
    for entity_string, info_list in combined_entity_info_dictionary.items():
        entity_json_ld = info_list[1]  # The data from the WordLift API
        if entity_json_ld:
            json_ld_data["mentions"].append(entity_json_ld)

    with st.expander("See annotations"):
        st.write(combined_entity_info_dictionary)

    with st.expander("Here is the final JSON-LD"):
        st.json(json_ld_data)
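
    # Illustrative output sketch (assumed response shape): with the default
    # sentence, json_ld_data would resemble
    #   {"@context": "https://schema.org", "@type": "WebPage",
    #    "mentions": [{"@type": "Person", "name": "Angela Merkel", ...}, ...]}
    # where each mention is whatever JSON-LD the WordLift API returned.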