import streamlit as st
from annotated_text import annotated_text
from refined.inference.processor import Refined
import requests
import spacy
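
# Entity Linking demo: English text is processed with ReFinED, while German
# (or the alternative English pipeline) goes through spaCy + entity-fishing.
# Linked entities are enriched via the WordLift API and rendered as
# annotated text plus schema.org JSON-LD.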

# Page config
st.set_page_config(
    page_title="Entity Linking by WordLift",
    page_icon="fav-ico.png",
    layout="wide",
    initial_sidebar_state="collapsed",
    menu_items={
        'Get Help': 'https://wordlift.io/book-a-demo/',
        'About': "# This is a demo app for NEL/NED/NER and SEO"
    }
)

# Sidebar
st.sidebar.image("logo-wordlift.png")
language_options = ["English", "English - spaCy", "German"]
# Use a list (not a set) so the option order is stable; index=0 makes
# "English" the default on the first run
selected_language = st.sidebar.selectbox("Select the Language", language_options, index=0)

# Initialize model and entity set variables
selected_model_name = None
selected_entity_set = None

# Based on selected language, configure model, entity set, and citation options
if selected_language in ("German", "English - spaCy"):
    entity_fishing_citation = """
    @misc{entity-fishing,
    title = {entity-fishing},
    publisher = {GitHub},
    year = {2016--2023},
    archivePrefix = {swh},
    eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
    }
    """
    with st.sidebar.expander('Citations'):
        st.markdown(entity_fishing_citation)
else: # English (Refined)
    model_options = ["aida_model", "wikipedia_model_with_numbers"]
    entity_set_options = ["wikidata", "wikipedia"]
    
    selected_model_name = st.sidebar.selectbox("Select the Model", model_options)
    selected_entity_set = st.sidebar.selectbox("Select the Entity Set", entity_set_options)

    refined_citation = """
    @inproceedings{ayoola-etal-2022-refined,
    title = "{R}e{F}in{ED}: An Efficient Zero-shot-capable Approach to End-to-End Entity Linking",
    author = "Tom Ayoola, Shubhi Tyagi, Joseph Fisher, Christos Christodoulopoulos, Andrea Pierleoni",
    booktitle = "NAACL",
    year = "2022"
    }
    """
    with st.sidebar.expander('Citations'):
        st.markdown(refined_citation)

@st.cache_resource
def load_model(selected_language, model_name=None, entity_set=None):
    # Define the public URL for the entity-fishing service
    entity_fishing_url = "https://cloud.science-miner.com/nerd/service"
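    # (a self-hosted entity-fishing instance can be substituted here; the
    # public endpoint is shared and may be slow or rate-limited)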
    
    if selected_language == "German":
        # Load the German-specific model
        nlp_model_de = spacy.load("de_core_news_lg")
        # Add the entity-fishing pipe with the server URL configured
        nlp_model_de.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
        return nlp_model_de
        
    elif selected_language == "English - spaCy":
        # Load English-specific model
        nlp_model_en = spacy.load("en_core_web_sm")
        # Add the entity-fishing pipe with the server URL configured
        nlp_model_en.add_pipe("entityfishing", config={"api_url": entity_fishing_url})
        return nlp_model_en
        
    else: # English (Refined)
        # Load the pretrained model for other languages
        refined_model = Refined.from_pretrained(model_name=model_name, entity_set=entity_set)
        return refined_model

# Use the cached model
# We pass the selected options directly to the cached function
# Streamlit's caching handles re-running this only when the inputs change
model = load_model(selected_language, selected_model_name, selected_entity_set)

# Helper functions
def get_wikidata_id(entity_id_string):
    # Handles IDs like "wikidata:Q123" or "wikidata=Q123"
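    # e.g. "wikidata:Q123" -> {"id": "Q123", "link": "http://www.wikidata.org/entity/Q123"}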
    entity_id = entity_id_string.split(":")[-1].split("=")[-1]
    entity_link = "http://www.wikidata.org/entity/" + entity_id
    return {"id": entity_id, "link": entity_link}
    
def get_entity_data(entity_link):
    try:
        # The WordLift /id/ endpoint takes the entity URI in the path with the
        # scheme collapsed ("http://" -> "http/")
        formatted_link = entity_link.replace("http://", "http/")
        response = requests.get(f'https://api.wordlift.io/id/{formatted_link}')
        response.raise_for_status() # Raise an exception for bad status codes
        return response.json()
    except requests.exceptions.RequestException as e:
        st.warning(f"Could not fetch data for entity: {entity_link}. Error: {e}")
        return None
            
# Create the form
with st.form(key='my_form'):
    text_input = st.text_area(label='Enter a sentence', value="Angela Merkel was the first female chancellor of Germany.")
    submit_button = st.form_submit_button(label='Analyze')

# Initialization
entities_map = {}
entities_data = {}

if text_input:
    if selected_language in ["German", "English - spaCy"]:
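        # The entity-fishing pipe exposes its results as spaCy span extensions:
        # ent._.kb_qid holds the Wikidata QID and ent._.url_wikidata its URL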
        doc = model(text_input)
        spacy_entities = [(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata) for ent in doc.ents]
        for entity in spacy_entities:
            entity_string, entity_type, wikidata_id, wikidata_url = entity
            if wikidata_url:
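                # Normalize the page URL (https://www.wikidata.org/wiki/Qxx)
                # to the canonical entity URI (http://www.wikidata.org/entity/Qxx)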
                formatted_wikidata_url = wikidata_url.replace("https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/")
                entities_map[entity_string] = {"id": wikidata_id, "link": formatted_wikidata_url}
                entity_data = get_entity_data(formatted_wikidata_url)
                
                if entity_data is not None:
                    entities_data[entity_string] = entity_data
    else: # Refined model
        refined_entities = model.process_text(text_input)

        for entity in refined_entities:
            # Access the span's attributes instead of parsing its string form:
            # a ReFinED span carries a `predicted_entity` whose
            # `wikidata_entity_id` is None for unlinked mentions
            predicted = entity.predicted_entity
            if predicted is not None and predicted.wikidata_entity_id:
                entity_text = entity.text
                wikidata_info = get_wikidata_id(predicted.wikidata_entity_id)
                entities_map[entity_text] = wikidata_info
                entity_data = get_entity_data(wikidata_info["link"])
                if entity_data is not None:
                    entities_data[entity_text] = entity_data

    combined_entity_info_dictionary = {
        k: [entities_map[k], entities_data.get(k)] for k in entities_map
    }
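    # combined_entity_info_dictionary maps each entity's surface form to
    # [{"id": ..., "link": ...}, <WordLift JSON-LD payload or None>]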
    
    if submit_button:
        # A more robust way to build the annotated_text list without using eval()
        final_text = []
        current_pos = 0
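        # annotated_text() accepts a mix of plain strings and
        # (text, label, background_color) tuples; that list is built up below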
        
        # Create a simple list of (text, start, end) for sorting
        entity_spans = []
        if selected_language in ["German", "English - spaCy"]:
            # 'doc' is available from the processing block above
            for ent in doc.ents:
                if ent.text in entities_map: # only include linked entities
                    entity_spans.append((ent.text, ent.start_char, ent.end_char))
        else:
            # 'refined_entities' is available from the block above; a ReFinED
            # span stores its character offset as `start` and its length as `ln`
            for ent in refined_entities:
                if ent.text in entities_map:
                    entity_spans.append((ent.text, ent.start, ent.start + ent.ln))

        # Sort entities by their starting position to handle the text correctly
        sorted_entities = sorted(entity_spans, key=lambda x: x[1])

        for entity_string, start, end in sorted_entities:
            # Skip spans that overlap a region that was already consumed
            if start < current_pos:
                continue
            # Add the plain-text segment before the current entity
            final_text.append(text_input[current_pos:start])
            
            # Prepare the annotation for the entity
            entity_info = entities_map.get(entity_string, {})
            entity_id = entity_info.get("id", "N/A")
            
            entity_type_data = entities_data.get(entity_string)
            entity_type = entity_type_data.get("@type") if entity_type_data else None

            color = {"Place": "#8AC7DB", "Organization": "#ADD8E6", "Person": "#67B7D1", 
                     "Product": "#2ea3f2", "CreativeWork": "#00BFFF", "Event": "#1E90FF"}.get(entity_type, "#8ef")

            final_text.append((entity_string, entity_id, color))
            current_pos = end
        
        # Add any remaining text after the last entity
        final_text.append(text_input[current_pos:])

        st.header("Annotated Text")
        annotated_text(*[item for item in final_text if item]) # Filter out empty strings
        
        # --- JSON-LD Generation ---
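        # schema.org's "mentions" property lists the entities the page refers
        # to; each WordLift payload is already JSON-LD and is attached as-is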
        json_ld_data = {
            "@context": "https://schema.org",
            "@type": "WebPage",
            "mentions": []
        }
        for entity_string, info_list in combined_entity_info_dictionary.items():
            entity_json_ld = info_list[1] # The data from WordLift API
            if entity_json_ld:
                json_ld_data["mentions"].append(entity_json_ld)
        
        with st.expander("See annotations"):
            st.write(combined_entity_info_dictionary)

        with st.expander("Here is the final JSON-LD"):
            st.json(json_ld_data)
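
# To run this demo locally (assuming the file is saved as app.py):
#   streamlit run app.py
# Assumed dependencies: streamlit, st-annotated-text, spacy, spacyfishing,
# requests, and ReFinED, plus the de_core_news_lg and en_core_web_sm models.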