import streamlit as st
from annotated_text import annotated_text
from refined.inference.processor import Refined
import requests
import spacy

# Page config
st.set_page_config(
    page_title="Entity Linking by WordLift",
    page_icon="fav-ico.png",
    layout="wide",
    initial_sidebar_state="collapsed",
    menu_items={
        'Get Help': 'https://wordlift.io/book-a-demo/',
        'About': "# This is a demo app for NEL/NED/NER and SEO"
    }
)

# Sidebar
st.sidebar.image("logo-wordlift.png")
# Use a list (not a set) so the option order is stable and index=0
# reliably selects English on the first run
language_options = ["English", "English - spaCy", "German"]
selected_language = st.sidebar.selectbox("Select the Language", language_options, index=0)

# Initialize model and entity set variables
selected_model_name = None
selected_entity_set = None

# Based on the selected language, configure model, entity set, and citation options
if selected_language in ("German", "English - spaCy"):
    entity_fishing_citation = """
@misc{entity-fishing,
    title = {entity-fishing},
    publisher = {GitHub},
    year = {2016--2023},
    archivePrefix = {swh},
    eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
}
"""
    with st.sidebar.expander('Citations'):
        st.markdown(entity_fishing_citation)
else:  # English (ReFinED)
    model_options = ["aida_model", "wikipedia_model_with_numbers"]
    entity_set_options = ["wikidata", "wikipedia"]
    selected_model_name = st.sidebar.selectbox("Select the Model", model_options)
    selected_entity_set = st.sidebar.selectbox("Select the Entity Set", entity_set_options)
    refined_citation = """
@inproceedings{ayoola-etal-2022-refined,
    title = "{R}e{F}in{ED}: An Efficient Zero-shot-capable Approach to End-to-End Entity Linking",
    author = "Tom Ayoola, Shubhi Tyagi, Joseph Fisher, Christos Christodoulopoulos, Andrea Pierleoni",
    booktitle = "NAACL",
    year = "2022"
}
"""
    with st.sidebar.expander('Citations'):
        st.markdown(refined_citation)


@st.cache_resource
def load_model(selected_language, model_name=None, entity_set=None):
    # Public endpoint of the entity-fishing service
    entity_fishing_url = "https://cloud.science-miner.com/nerd/service"
    if selected_language == "German":
        # Load the German-specific model
        nlp_model_de = spacy.load("de_core_news_lg")
        # Add the entity-fishing pipe; "api_ef_base" is spacyfishing's
        # config key for the service endpoint
        nlp_model_de.add_pipe("entityfishing", config={"api_ef_base": entity_fishing_url})
        return nlp_model_de
    elif selected_language == "English - spaCy":
        # Load the English-specific model
        nlp_model_en = spacy.load("en_core_web_sm")
        nlp_model_en.add_pipe("entityfishing", config={"api_ef_base": entity_fishing_url})
        return nlp_model_en
    else:  # English (ReFinED)
        # Load the pretrained ReFinED model with the selected entity set
        refined_model = Refined.from_pretrained(model_name=model_name, entity_set=entity_set)
        return refined_model


# Use the cached model. The selected options are passed directly to the
# cached function; Streamlit re-runs it only when the inputs change.
model = load_model(selected_language, selected_model_name, selected_entity_set)


# Helper functions
def get_wikidata_id(entity_id_string):
    # Accepts "wikidata:Q123", "wikidata=Q123", or a bare "Q123"
    entity_id = entity_id_string.split(":")[-1].split("=")[-1]
    entity_link = "http://www.wikidata.org/entity/" + entity_id
    return {"id": entity_id, "link": entity_link}


def get_entity_data(entity_link):
    try:
        # The WordLift id endpoint expects the scheme folded into the path
        formatted_link = entity_link.replace("http://", "http/")
        response = requests.get(f'https://api.wordlift.io/id/{formatted_link}')
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.json()
    except requests.exceptions.RequestException as e:
        st.warning(f"Could not fetch data for entity: {entity_link}. Error: {e}")
        return None
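# A quick sanity check of the helpers above (illustrative only; Q567 is just
# an example QID, and the expected values follow directly from the code):
#
#   >>> get_wikidata_id("wikidata:Q567")
#   {'id': 'Q567', 'link': 'http://www.wikidata.org/entity/Q567'}
#
# get_entity_data("http://www.wikidata.org/entity/Q567") would then GET
# https://api.wordlift.io/id/http/www.wikidata.org/entity/Q567 and return the
# parsed JSON, or None (with a warning in the UI) if the request fails.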
# Create the form
with st.form(key='my_form'):
    text_input = st.text_area(label='Enter a sentence', value="Angela Merkel was the first female chancellor of Germany.")
    submit_button = st.form_submit_button(label='Analyze')

# Initialization
entities_map = {}
entities_data = {}

if text_input:
    if selected_language in ["German", "English - spaCy"]:
        doc = model(text_input)
        spacy_entities = [(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata) for ent in doc.ents]
        for entity in spacy_entities:
            entity_string, entity_type, wikidata_id, wikidata_url = entity
            if wikidata_url:
                # Normalise the wiki page URL to the canonical entity URI
                formatted_wikidata_url = wikidata_url.replace("https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/")
                entities_map[entity_string] = {"id": wikidata_id, "link": formatted_wikidata_url}
                entity_data = get_entity_data(formatted_wikidata_url)
                if entity_data is not None:
                    entities_data[entity_string] = entity_data
    else:  # ReFinED model
        refined_entities = model.process_text(text_input)
        for span in refined_entities:
            # Read the prediction from the span attributes instead of parsing
            # its string representation; keep only spans linked to Wikidata
            if span.predicted_entity and span.predicted_entity.wikidata_entity_id:
                entity_text = span.text
                wikidata_info = get_wikidata_id(span.predicted_entity.wikidata_entity_id)
                entities_map[entity_text] = wikidata_info
                entity_data = get_entity_data(wikidata_info["link"])
                if entity_data is not None:
                    entities_data[entity_text] = entity_data

combined_entity_info_dictionary = {
    k: [entities_map[k], entities_data.get(k)] for k in entities_map
}

if submit_button:
    # Build the annotated_text input without using eval()
    final_text = []
    current_pos = 0

    # Collect (text, start, end) spans for the linked entities
    entity_spans = []
    if selected_language in ["German", "English - spaCy"]:
        # 'doc' is available from the processing block above
        for ent in doc.ents:
            if ent.text in entities_map:  # only include linked entities
                entity_spans.append((ent.text, ent.start_char, ent.end_char))
    else:
        # 'refined_entities' is available from the processing block above
        for span in refined_entities:
            if span.text in entities_map:
                # 'ln' is the span length in characters
                entity_spans.append((span.text, span.start, span.start + span.ln))

    # Sort entities by start position so the text is rebuilt left to right
    sorted_entities = sorted(entity_spans, key=lambda x: x[1])

    for entity_string, start, end in sorted_entities:
        # Add the plain text segment before the current entity
        final_text.append(text_input[current_pos:start])

        # Prepare the annotation for the entity
        entity_info = entities_map.get(entity_string, {})
        entity_id = entity_info.get("id", "N/A")
        entity_type_data = entities_data.get(entity_string)
        entity_type = entity_type_data.get("@type") if entity_type_data else None
        # "@type" may come back as a list; use the first value for the lookup
        if isinstance(entity_type, list) and entity_type:
            entity_type = entity_type[0]
        color = {
            "Place": "#8AC7DB",
            "Organization": "#ADD8E6",
            "Person": "#67B7D1",
            "Product": "#2ea3f2",
            "CreativeWork": "#00BFFF",
            "Event": "#1E90FF",
        }.get(entity_type, "#8ef")
        final_text.append((entity_string, entity_id, color))
        current_pos = end

    # Add any remaining text after the last entity
    final_text.append(text_input[current_pos:])

    st.header("Annotated Text")
    annotated_text(*[item for item in final_text if item])  # Filter out empty strings

    # --- JSON-LD Generation ---
    json_ld_data = {
        "@context": "https://schema.org",
        "@type": "WebPage",
        "mentions": []
    }
    for entity_string, info_list in combined_entity_info_dictionary.items():
        entity_json_ld = info_list[1]  # The data from the WordLift API
        if entity_json_ld:
            json_ld_data["mentions"].append(entity_json_ld)
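    # For the default sentence, the JSON-LD built above has this rough shape.
    # The objects inside "mentions" are whatever the WordLift API returned
    # for each entity, so the inner fields shown here are hypothetical:
    #
    # {
    #   "@context": "https://schema.org",
    #   "@type": "WebPage",
    #   "mentions": [
    #     {"@type": "Person", "name": "Angela Merkel", ...},
    #     {"@type": "Place", "name": "Germany", ...}
    #   ]
    # }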
    with st.expander("See annotations"):
        st.write(combined_entity_info_dictionary)

    with st.expander("Here is the final JSON-LD"):
        st.json(json_ld_data)
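# ---------------------------------------------------------------------------
# Running the app: a sketch, assuming this file is saved as app.py and that
# the usual distribution names for the imports above apply (st-annotated-text
# provides annotated_text, spacyfishing provides the "entityfishing" pipe,
# and ReFinED is installed from the amazon-science/ReFinED repository):
#
#   pip install streamlit st-annotated-text requests spacy spacyfishing
#   pip install git+https://github.com/amazon-science/ReFinED.git
#   python -m spacy download de_core_news_lg
#   python -m spacy download en_core_web_sm
#   streamlit run app.py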