import streamlit as st
from annotated_text import annotated_text
from refined.inference.processor import Refined
import requests
import spacy
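# Note: the "entityfishing" pipeline component added in load_model() below is
# assumed to be provided by the spacyfishing package, which registers the
# factory with spaCy on install; it must be available in the environment.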
# Page config
st.set_page_config(
    page_title="Entity Linking by WordLift",
    page_icon="fav-ico.png",
    layout="wide",
    initial_sidebar_state="collapsed",
    menu_items={
        'Get Help': 'https://wordlift.io/book-a-demo/',
        'About': "# This is a demo app for NEL/NED/NER and SEO"
    }
)
# Sidebar
st.sidebar.image("logo-wordlift.png")
# Use a list (not a set) so the option order is deterministic and index=0
# reliably selects "English" on the first run
language_options = ["English", "English - spaCy", "German"]
selected_language = st.sidebar.selectbox("Select the Language", language_options, index=0)
# Initialize model and entity set variables
selected_model_name = None
selected_entity_set = None
# Based on selected language, configure model, entity set, and citation options
if selected_language in ["German", "English - spaCy"]:
    entity_fishing_citation = """
    @misc{entity-fishing,
        title = {entity-fishing},
        publisher = {GitHub},
        year = {2016--2023},
        archivePrefix = {swh},
        eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
    }
    """
    with st.sidebar.expander('Citations'):
        st.markdown(entity_fishing_citation)
else:  # English (Refined)
    model_options = ["aida_model", "wikipedia_model_with_numbers"]
    entity_set_options = ["wikidata", "wikipedia"]
    selected_model_name = st.sidebar.selectbox("Select the Model", model_options)
    selected_entity_set = st.sidebar.selectbox("Select the Entity Set", entity_set_options)
    refined_citation = """
    @inproceedings{ayoola-etal-2022-refined,
        title = "{R}e{F}in{ED}: An Efficient Zero-shot-capable Approach to End-to-End Entity Linking",
        author = "Tom Ayoola, Shubhi Tyagi, Joseph Fisher, Christos Christodoulopoulos, Andrea Pierleoni",
        booktitle = "NAACL",
        year = "2022"
    }
    """
    with st.sidebar.expander('Citations'):
        st.markdown(refined_citation)
@st.cache_resource
def load_model(selected_language, model_name=None, entity_set=None):
    # Public URL of the entity-fishing service used by the spaCy pipelines
    entity_fishing_url = "https://cloud.science-miner.com/nerd/service"
    if selected_language == "German":
        # Load the German-specific model
        nlp_model_de = spacy.load("de_core_news_lg")
        # Add the entity-fishing pipe with the server URL configured
        nlp_model_de.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
        return nlp_model_de
    elif selected_language == "English - spaCy":
        # Load the English-specific model
        nlp_model_en = spacy.load("en_core_web_sm")
        # Add the entity-fishing pipe with the server URL configured
        nlp_model_en.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
        return nlp_model_en
    else:  # English (Refined)
        # Load the pretrained ReFinED model for English
        refined_model = Refined.from_pretrained(model_name=model_name, entity_set=entity_set)
        return refined_model
# Use the cached model
# We pass the selected options directly to the cached function
# Streamlit's caching handles re-running this only when the inputs change
model = load_model(selected_language, selected_model_name, selected_entity_set)
# Helper functions
def get_wikidata_id(entity_id_string):
    # Handles IDs like "wikidata:Q123" or "wikidata=Q123"
    entity_id = entity_id_string.split(":")[-1].split("=")[-1]
    entity_link = "http://www.wikidata.org/entity/" + entity_id
    return {"id": entity_id, "link": entity_link}
def get_entity_data(entity_link):
    try:
        # The WordLift API expects the URI with the scheme separator collapsed,
        # i.e. "http://..." becomes "http/..."
        formatted_link = entity_link.replace("http://", "http/")
        response = requests.get(f'https://api.wordlift.io/id/{formatted_link}')
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.json()
    except requests.exceptions.RequestException as e:
        st.warning(f"Could not fetch data for entity: {entity_link}. Error: {e}")
        return None
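# Example: "http://www.wikidata.org/entity/Q567" is fetched as
#   https://api.wordlift.io/id/http/www.wikidata.org/entity/Q567
# The response is expected to be schema.org JSON-LD describing the entity.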
# Create the form
with st.form(key='my_form'):
    text_input = st.text_area(label='Enter a sentence', value="Angela Merkel was the first female chancellor of Germany.")
    submit_button = st.form_submit_button(label='Analyze')
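# Streamlit reruns the whole script on every interaction; the form batches the
# widget values so a rerun happens only when "Analyze" is clicked, at which
# point submit_button is True for that run.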
# Initialization
entities_map = {}
entities_data = {}
if text_input:
    if selected_language in ["German", "English - spaCy"]:
        doc = model(text_input)
        spacy_entities = [(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata) for ent in doc.ents]
        for entity in spacy_entities:
            entity_string, entity_type, wikidata_id, wikidata_url = entity
            if wikidata_url:
                formatted_wikidata_url = wikidata_url.replace("https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/")
                entities_map[entity_string] = {"id": wikidata_id, "link": formatted_wikidata_url}
                entity_data = get_entity_data(formatted_wikidata_url)
                if entity_data is not None:
                    entities_data[entity_string] = entity_data
    else:  # Refined model
        refined_entities = model.process_text(text_input)
        for entity in refined_entities:
            # Access entity attributes directly instead of parsing a string
            if entity.entity_id and "wikidata" in entity.entity_id:
                entity_text = entity.text
                wikidata_info = get_wikidata_id(entity.entity_id)
                entities_map[entity_text] = wikidata_info
                entity_data = get_entity_data(wikidata_info["link"])
                if entity_data is not None:
                    entities_data[entity_text] = entity_data
combined_entity_info_dictionary = {
    k: [entities_map[k], entities_data.get(k)] for k in entities_map
}
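# Illustrative shape (Q567 is Angela Merkel's Wikidata ID):
#   {"Angela Merkel": [{"id": "Q567", "link": "http://www.wikidata.org/entity/Q567"},
#                      <JSON-LD dict from the WordLift API, or None>]}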
if submit_button:
    # Build the annotated_text input without using eval()
    final_text = []
    current_pos = 0
    # Collect (text, start, end) spans so they can be sorted by position
    entity_spans = []
    if selected_language in ["German", "English - spaCy"]:
        # 'doc' is available from the processing block above
        for ent in doc.ents:
            if ent.text in entities_map:  # only include linked entities
                entity_spans.append((ent.text, ent.start_char, ent.end_char))
    else:
        # 'refined_entities' is available from the processing block above
        for ent in refined_entities:
            if ent.text in entities_map:
                entity_spans.append((ent.text, ent.span[0], ent.span[1]))
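    # Assumption: ReFinED spans expose character offsets as ent.span; if the
    # installed ReFinED version names this field differently, adapt the tuple
    # construction above accordingly.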
    # Sort entities by starting position so the text is rebuilt left to right
    sorted_entities = sorted(entity_spans, key=lambda x: x[1])
    for entity_string, start, end in sorted_entities:
        # Add the plain text segment before the current entity
        final_text.append(text_input[current_pos:start])
        # Prepare the annotation for the entity
        entity_info = entities_map.get(entity_string, {})
        entity_id = entity_info.get("id", "N/A")
        entity_type_data = entities_data.get(entity_string)
        entity_type = entity_type_data.get("@type") if entity_type_data else None
        color = {"Place": "#8AC7DB", "Organization": "#ADD8E6", "Person": "#67B7D1",
                 "Product": "#2ea3f2", "CreativeWork": "#00BFFF", "Event": "#1E90FF"}.get(entity_type, "#8ef")
        final_text.append((entity_string, entity_id, color))
        current_pos = end
    # Add any remaining text after the last entity
    final_text.append(text_input[current_pos:])
    st.header("Annotated Text")
    annotated_text(*[item for item in final_text if item])  # Filter out empty strings
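    # annotated_text renders plain strings as running text and
    # (text, label, color) tuples as highlighted chips, so final_text
    # interleaves the two.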
    # --- JSON-LD Generation ---
    json_ld_data = {
        "@context": "https://schema.org",
        "@type": "WebPage",
        "mentions": []
    }
    for entity_string, info_list in combined_entity_info_dictionary.items():
        entity_json_ld = info_list[1]  # The data from the WordLift API
        if entity_json_ld:
            json_ld_data["mentions"].append(entity_json_ld)
    with st.expander("See annotations"):
        st.write(combined_entity_info_dictionary)
    with st.expander("Here is the final JSON-LD"):
        st.json(json_ld_data)
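# To run this app locally (assuming the file is saved as app.py):
#   streamlit run app.py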