import streamlit as st
from annotated_text import annotated_text
from refined.inference.processor import Refined
import requests
import spacy

# Page config
st.set_page_config(
    page_title="Entity Linking by WordLift",
    page_icon="fav-ico.png",
    layout="wide",
    initial_sidebar_state="collapsed",
    menu_items={
        'Get Help': 'https://wordlift.io/book-a-demo/',
        'About': "# This is a demo app for NEL/NED/NER and SEO"
    }
)

# Sidebar
st.sidebar.image("logo-wordlift.png")
# Use a list (not a set) so the option order is stable and index=0 is always "English"
language_options = ["English", "English - spaCy", "German"]
selected_language = st.sidebar.selectbox("Select the Language", language_options, index=0)

# Initialize model and entity set variables
selected_model_name = None
selected_entity_set = None

# Based on the selected language, configure the model, entity set, and citation options
if selected_language in ("German", "English - spaCy"):
    entity_fishing_citation = """
    @misc{entity-fishing,
    title = {entity-fishing},
    publisher = {GitHub},
    year = {2016--2023},
    archivePrefix = {swh},
    eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
    }
    """
    with st.sidebar.expander('Citations'):
        st.markdown(entity_fishing_citation)
else:  # English (ReFinED)
    model_options = ["aida_model", "wikipedia_model_with_numbers"]
    entity_set_options = ["wikidata", "wikipedia"]
    selected_model_name = st.sidebar.selectbox("Select the Model", model_options)
    selected_entity_set = st.sidebar.selectbox("Select the Entity Set", entity_set_options)
    refined_citation = """
    @inproceedings{ayoola-etal-2022-refined,
    title = "{R}e{F}in{ED}: An Efficient Zero-shot-capable Approach to End-to-End Entity Linking",
    author = "Tom Ayoola, Shubhi Tyagi, Joseph Fisher, Christos Christodoulopoulos, Andrea Pierleoni",
    booktitle = "NAACL",
    year = "2022"
    }
    """
    with st.sidebar.expander('Citations'):
        st.markdown(refined_citation)

# Cache the loaded model so it is not re-initialized on every Streamlit rerun
@st.cache_resource
def load_model(selected_language, model_name=None, entity_set=None):
    # Define the public URL for the entity-fishing service
    entity_fishing_url = "https://cloud.science-miner.com/nerd/service"
    if selected_language == "German":
        # Load the German spaCy model
        nlp_model_de = spacy.load("de_core_news_lg")
        # Add the entity-fishing pipe with the server URL configured
        nlp_model_de.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
        return nlp_model_de
    elif selected_language == "English - spaCy":
        # Load the English spaCy model
        nlp_model_en = spacy.load("en_core_web_sm")
        # Add the entity-fishing pipe with the server URL configured
        nlp_model_en.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
        return nlp_model_en
    else:  # English (ReFinED)
        # Load the pretrained ReFinED model with the selected entity set
        refined_model = Refined.from_pretrained(model_name=model_name, entity_set=entity_set)
        return refined_model

# Use the cached model
# We pass the selected options directly to the cached function;
# Streamlit's caching re-runs it only when the inputs change
model = load_model(selected_language, selected_model_name, selected_entity_set)

# Helper functions
def get_wikidata_id(entity_id_string):
    # Handles IDs like "wikidata:Q123", "wikidata=Q123", or a bare "Q123"
    entity_id = entity_id_string.split(":")[-1].split("=")[-1]
    entity_link = "http://www.wikidata.org/entity/" + entity_id
    return {"id": entity_id, "link": entity_link}

def get_entity_data(entity_link):
    try:
        # The WordLift id endpoint expects the scheme collapsed into the path,
        # e.g. "http://www.wikidata.org/..." becomes "http/www.wikidata.org/..."
        formatted_link = entity_link.replace("http://", "http/")
        response = requests.get(f'https://api.wordlift.io/id/{formatted_link}')
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.json()
    except requests.exceptions.RequestException as e:
        st.warning(f"Could not fetch data for entity: {entity_link}. Error: {e}")
        return None
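
# Sketch of the response this app relies on: the payload is schema.org JSON-LD,
# and only "@type" is read below; the other fields shown here are assumptions:
#   {"@type": "Person", "@id": "http://www.wikidata.org/entity/Q567", ...}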

# Create the form
with st.form(key='my_form'):
    text_input = st.text_area(label='Enter a sentence', value="Angela Merkel was the first female chancellor of Germany.")
    submit_button = st.form_submit_button(label='Analyze')

# Initialization
entities_map = {}
entities_data = {}

if text_input:
    if selected_language in ["German", "English - spaCy"]:
        doc = model(text_input)
        spacy_entities = [(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata) for ent in doc.ents]
        for entity in spacy_entities:
            entity_string, entity_type, wikidata_id, wikidata_url = entity
            if wikidata_url:
                # Normalize to the canonical Wikidata entity URI
                formatted_wikidata_url = wikidata_url.replace("https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/")
                entities_map[entity_string] = {"id": wikidata_id, "link": formatted_wikidata_url}
                entity_data = get_entity_data(formatted_wikidata_url)
                if entity_data is not None:
                    entities_data[entity_string] = entity_data
    else:  # English (ReFinED)
        refined_entities = model.process_text(text_input)
        for entity in refined_entities:
            # Access the span's attributes directly instead of parsing its string
            # representation; predicted_entity is None for unlinked mentions
            if entity.predicted_entity and entity.predicted_entity.wikidata_entity_id:
                entity_text = entity.text
                wikidata_info = get_wikidata_id(entity.predicted_entity.wikidata_entity_id)
                entities_map[entity_text] = wikidata_info
                entity_data = get_entity_data(wikidata_info["link"])
                if entity_data is not None:
                    entities_data[entity_text] = entity_data

combined_entity_info_dictionary = {
    k: [entities_map[k], entities_data.get(k)] for k in entities_map
}
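
# Illustrative shape of the combined dictionary (values are made up):
#   {"Angela Merkel": [{"id": "Q567", "link": "http://www.wikidata.org/entity/Q567"},
#                      {"@type": "Person", ...}]}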

if submit_button:
    # Build the annotated_text list without using eval()
    final_text = []
    current_pos = 0
    # Collect (text, start_char, end_char) tuples for sorting
    entity_spans = []
    if selected_language in ["German", "English - spaCy"]:
        # 'doc' is available from the processing block above
        for ent in doc.ents:
            if ent.text in entities_map:  # only include linked entities
                entity_spans.append((ent.text, ent.start_char, ent.end_char))
    else:
        # 'refined_entities' is available; a ReFinED span exposes a character
        # offset (start) and a length (ln) rather than an explicit end offset
        for ent in refined_entities:
            if ent.text in entities_map:
                entity_spans.append((ent.text, ent.start, ent.start + ent.ln))

    # Sort entities by their starting position so segments are emitted in order
    sorted_entities = sorted(entity_spans, key=lambda x: x[1])

    for entity_string, start, end in sorted_entities:
        # Add the plain-text segment before the current entity
        final_text.append(text_input[current_pos:start])
        # Prepare the annotation for the entity
        entity_info = entities_map.get(entity_string, {})
        entity_id = entity_info.get("id", "N/A")
        entity_type_data = entities_data.get(entity_string)
        entity_type = entity_type_data.get("@type") if entity_type_data else None
        color = {"Place": "#8AC7DB", "Organization": "#ADD8E6", "Person": "#67B7D1",
                 "Product": "#2ea3f2", "CreativeWork": "#00BFFF", "Event": "#1E90FF"}.get(entity_type, "#8ef")
        final_text.append((entity_string, entity_id, color))
        current_pos = end

    # Add any remaining text after the last entity
    final_text.append(text_input[current_pos:])
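
    # For the default sentence, final_text should end up roughly like this
    # (IDs and colors are illustrative):
    #   ["", ("Angela Merkel", "Q567", "#67B7D1"),
    #    " was the first female chancellor of ", ("Germany", "Q183", "#8AC7DB"), "."]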

    st.header("Annotated Text")
    annotated_text(*[item for item in final_text if item])  # Filter out empty strings

    # --- JSON-LD Generation ---
    json_ld_data = {
        "@context": "https://schema.org",
        "@type": "WebPage",
        "mentions": []
    }
    for entity_string, info_list in combined_entity_info_dictionary.items():
        entity_json_ld = info_list[1]  # The data from the WordLift API
        if entity_json_ld:
            json_ld_data["mentions"].append(entity_json_ld)

    with st.expander("See annotations"):
        st.write(combined_entity_info_dictionary)

    with st.expander("Here is the final JSON-LD"):
        st.json(json_ld_data)
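
    # The resulting JSON-LD should look roughly like this (the mention payloads
    # depend on what the WordLift API returns; the fields shown are assumptions):
    #   {"@context": "https://schema.org", "@type": "WebPage",
    #    "mentions": [{"@type": "Person", "name": "Angela Merkel", ...}]}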