import streamlit as st
from annotated_text import annotated_text
from refined.inference.processor import Refined
import requests
import spacy

# Page config
st.set_page_config(
    page_title="Entity Linking by WordLift",
    page_icon="fav-ico.png",
    layout="wide",
    initial_sidebar_state="collapsed",
    menu_items={
        'Get Help': 'https://wordlift.io/book-a-demo/',
        'About': "# This is a demo app for NEL/NED/NER and SEO"
    }
)

# Sidebar
st.sidebar.image("logo-wordlift.png")
# Use a list (not a set) so the option order is stable and index=0
# reliably selects English on the first run
language_options = ["English", "English - spaCy", "German"]
selected_language = st.sidebar.selectbox("Select the Language", language_options, index=0)

# Initialize model and entity set variables
selected_model_name = None
selected_entity_set = None

# Based on the selected language, configure model, entity set, and citation options
if selected_language in ("German", "English - spaCy"):
    entity_fishing_citation = """
@misc{entity-fishing,
    title = {entity-fishing},
    publisher = {GitHub},
    year = {2016--2023},
    archivePrefix = {swh},
    eprint = {1:dir:cb0ba3379413db12b0018b7c3af8d0d2d864139c}
}
"""
    with st.sidebar.expander('Citations'):
        st.markdown(entity_fishing_citation)
else:  # English (ReFinED)
    model_options = ["aida_model", "wikipedia_model_with_numbers"]
    entity_set_options = ["wikidata", "wikipedia"]
    selected_model_name = st.sidebar.selectbox("Select the Model", model_options)
    selected_entity_set = st.sidebar.selectbox("Select the Entity Set", entity_set_options)
    refined_citation = """
@inproceedings{ayoola-etal-2022-refined,
    title = "{R}e{F}in{ED}: An Efficient Zero-shot-capable Approach to End-to-End Entity Linking",
    author = "Tom Ayoola, Shubhi Tyagi, Joseph Fisher, Christos Christodoulopoulos, Andrea Pierleoni",
    booktitle = "NAACL",
    year = "2022"
}
"""
    with st.sidebar.expander('Citations'):
        st.markdown(refined_citation)


@st.cache_resource
def load_model(selected_language, model_name=None, entity_set=None):
    # Public endpoint of the entity-fishing service
    entity_fishing_url = "https://cloud.science-miner.com/nerd/service"
    if selected_language == "German":
        # Load the German-specific model
        nlp_model_de = spacy.load("de_core_news_lg")
        # Add the entity-fishing pipe; "api_ef_base" is spacyfishing's
        # config key for the service endpoint
        nlp_model_de.add_pipe("entityfishing", config={"api_ef_base": entity_fishing_url})
        return nlp_model_de
    elif selected_language == "English - spaCy":
        # Load the English-specific model
        nlp_model_en = spacy.load("en_core_web_sm")
        nlp_model_en.add_pipe("entityfishing", config={"api_ef_base": entity_fishing_url})
        return nlp_model_en
    else:  # English (ReFinED)
        # Load the pretrained ReFinED model with the selected entity set
        refined_model = Refined.from_pretrained(model_name=model_name, entity_set=entity_set)
        return refined_model


# Use the cached model. The selected options are passed directly to the
# cached function; Streamlit re-runs it only when the inputs change.
model = load_model(selected_language, selected_model_name, selected_entity_set)


# Helper functions
def get_wikidata_id(entity_id_string):
    # Accepts "wikidata:Q123", "wikidata=Q123", or a bare "Q123"
    entity_id = entity_id_string.split(":")[-1].split("=")[-1]
    entity_link = "http://www.wikidata.org/entity/" + entity_id
    return {"id": entity_id, "link": entity_link}


def get_entity_data(entity_link):
    try:
        # The WordLift id endpoint expects the scheme folded into the path
        formatted_link = entity_link.replace("http://", "http/")
        response = requests.get(f'https://api.wordlift.io/id/{formatted_link}')
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.json()
    except requests.exceptions.RequestException as e:
        st.warning(f"Could not fetch data for entity: {entity_link}. Error: {e}")
        return None
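# A quick sanity check of the helpers above (illustrative only; Q567 is just
# an example QID, and the expected values follow directly from the code):
#
#   >>> get_wikidata_id("wikidata:Q567")
#   {'id': 'Q567', 'link': 'http://www.wikidata.org/entity/Q567'}
#
# get_entity_data("http://www.wikidata.org/entity/Q567") would then GET
# https://api.wordlift.io/id/http/www.wikidata.org/entity/Q567 and return the
# parsed JSON, or None (with a warning in the UI) if the request fails.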
# Create the form
with st.form(key='my_form'):
    text_input = st.text_area(label='Enter a sentence', value="Angela Merkel was the first female chancellor of Germany.")
    submit_button = st.form_submit_button(label='Analyze')

# Initialization
entities_map = {}
entities_data = {}

if text_input:
    if selected_language in ["German", "English - spaCy"]:
        doc = model(text_input)
        spacy_entities = [(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata) for ent in doc.ents]
        for entity in spacy_entities:
            entity_string, entity_type, wikidata_id, wikidata_url = entity
            if wikidata_url:
                # Normalise the wiki page URL to the canonical entity URI
                formatted_wikidata_url = wikidata_url.replace("https://www.wikidata.org/wiki/", "http://www.wikidata.org/entity/")
                entities_map[entity_string] = {"id": wikidata_id, "link": formatted_wikidata_url}
                entity_data = get_entity_data(formatted_wikidata_url)
                if entity_data is not None:
                    entities_data[entity_string] = entity_data
    else:  # ReFinED model
        refined_entities = model.process_text(text_input)
        for span in refined_entities:
            # Read the prediction from the span attributes instead of parsing
            # its string representation; keep only spans linked to Wikidata
            if span.predicted_entity and span.predicted_entity.wikidata_entity_id:
                entity_text = span.text
                wikidata_info = get_wikidata_id(span.predicted_entity.wikidata_entity_id)
                entities_map[entity_text] = wikidata_info
                entity_data = get_entity_data(wikidata_info["link"])
                if entity_data is not None:
                    entities_data[entity_text] = entity_data

combined_entity_info_dictionary = {
    k: [entities_map[k], entities_data.get(k)] for k in entities_map
}

if submit_button:
    # Build the annotated_text input without using eval()
    final_text = []
    current_pos = 0

    # Collect (text, start, end) spans for the linked entities
    entity_spans = []
    if selected_language in ["German", "English - spaCy"]:
        # 'doc' is available from the processing block above
        for ent in doc.ents:
            if ent.text in entities_map:  # only include linked entities
                entity_spans.append((ent.text, ent.start_char, ent.end_char))
    else:
        # 'refined_entities' is available from the processing block above
        for span in refined_entities:
            if span.text in entities_map:
                # 'ln' is the span length in characters
                entity_spans.append((span.text, span.start, span.start + span.ln))

    # Sort entities by start position so the text is rebuilt left to right
    sorted_entities = sorted(entity_spans, key=lambda x: x[1])

    for entity_string, start, end in sorted_entities:
        # Add the plain text segment before the current entity
        final_text.append(text_input[current_pos:start])

        # Prepare the annotation for the entity
        entity_info = entities_map.get(entity_string, {})
        entity_id = entity_info.get("id", "N/A")
        entity_type_data = entities_data.get(entity_string)
        entity_type = entity_type_data.get("@type") if entity_type_data else None
        # "@type" may come back as a list; use the first value for the lookup
        if isinstance(entity_type, list) and entity_type:
            entity_type = entity_type[0]
        color = {
            "Place": "#8AC7DB",
            "Organization": "#ADD8E6",
            "Person": "#67B7D1",
            "Product": "#2ea3f2",
            "CreativeWork": "#00BFFF",
            "Event": "#1E90FF",
        }.get(entity_type, "#8ef")
        final_text.append((entity_string, entity_id, color))
        current_pos = end

    # Add any remaining text after the last entity
    final_text.append(text_input[current_pos:])

    st.header("Annotated Text")
    annotated_text(*[item for item in final_text if item])  # Filter out empty strings

    # --- JSON-LD Generation ---
    json_ld_data = {
        "@context": "https://schema.org",
        "@type": "WebPage",
        "mentions": []
    }
    for entity_string, info_list in combined_entity_info_dictionary.items():
        entity_json_ld = info_list[1]  # The data from the WordLift API
        if entity_json_ld:
            json_ld_data["mentions"].append(entity_json_ld)
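    # For the default sentence, the JSON-LD built above has this rough shape.
    # The objects inside "mentions" are whatever the WordLift API returned
    # for each entity, so the inner fields shown here are hypothetical:
    #
    # {
    #   "@context": "https://schema.org",
    #   "@type": "WebPage",
    #   "mentions": [
    #     {"@type": "Person", "name": "Angela Merkel", ...},
    #     {"@type": "Place", "name": "Germany", ...}
    #   ]
    # }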
    with st.expander("See annotations"):
        st.write(combined_entity_info_dictionary)

    with st.expander("Here is the final JSON-LD"):
        st.json(json_ld_data)
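# ---------------------------------------------------------------------------
# Running the app: a sketch, assuming this file is saved as app.py and that
# the usual distribution names for the imports above apply (st-annotated-text
# provides annotated_text, spacyfishing provides the "entityfishing" pipe,
# and ReFinED is installed from the amazon-science/ReFinED repository):
#
#   pip install streamlit st-annotated-text requests spacy spacyfishing
#   pip install git+https://github.com/amazon-science/ReFinED.git
#   python -m spacy download de_core_news_lg
#   python -m spacy download en_core_web_sm
#   streamlit run app.py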