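# Entity Linking Demo: a small Streamlit app that runs spaCy NER plus the
# entity-fishing linker over a pasted article, looks up each linked entity's
# image on Wikidata (property P18) and renders the results as circular avatars.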
import hashlib
from urllib.parse import quote

import requests
import spacy
import streamlit as st
import streamlit.components.v1 as components

# Load the medium English pipeline; its NER provides the entity spans
nlp = spacy.load("en_core_web_md")

# Add the entity-fishing linker on top of NER
# (the component is declared through entry_points in setup.py)
nlp.add_pipe("entityfishing")
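# entityfishing queries a remote entity-fishing service (network access is
# required) and annotates each entity span with ._.kb_qid, ._.url_wikidata
# and ._.nerd_score, which the code below relies on.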


st.title('Entity Linking Demo')


# Pre-fill the text area from the bundled example article
with open("example.txt") as f:
    article = st.text_area('Article to analyze:', value=f.read())

# Dedup bookkeeping: show each entity once, and treat a bare surname as a
# repeat of an already-seen multi-part PERSON name
seen_entities = []
seen_surnames = []
if st.button('Submit'):
    # (text, label, QID, Wikidata URL, score, image URL) tuples to render
    good_ents = []

    with st.spinner(text="Analysing..."):
        doc = nlp(article)
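        # doc.ents now carries both NER labels and entity-fishing's Wikidata links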
        for ent in doc.ents:
            # Keep only linked ORG/PERSON/GPE entities we haven't shown yet
            if ent._.kb_qid is None or ent.label_ not in ["ORG", "PERSON", "GPE"] or ent.text in seen_entities:
                continue

            if ent.label_ == "PERSON":
                if len(ent.text.split()) == 1:
                    # Single token: likely a bare surname; skip it if the
                    # full name has already been shown
                    if ent.text in seen_surnames:
                        continue
                else:
                    # Multi-part name: remember the surname so a later bare
                    # mention is not rendered as a second person
                    seen_surnames.append(ent.text.split()[-1])

            seen_entities.append(ent.text)
            # Console-only debug output
            print((ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score))
            # Fetch the entity's P18 ("image") claim from Wikidata; the claim's
            # value is a Wikimedia Commons file name
            params = {"action": "wbgetclaims", "format": "json",
                      "property": "P18", "entity": ent._.kb_qid}
            r = requests.get("https://www.wikidata.org/w/api.php", params=params, timeout=10)
            data = r.json()["claims"]
            if "P18" in data:
                data = data["P18"][0]["mainsnak"]
                img_name = data["datavalue"]["value"].replace(" ", "_")
                # Commons shards files into directories named after the first
                # one and two hex digits of the MD5 of the underscored file name
                img_name_hash = hashlib.md5(img_name.encode("utf-8")).hexdigest()
                a, b = img_name_hash[0], img_name_hash[1]
                # Percent-encode the file name so non-ASCII titles form a valid URL
                url = f"https://upload.wikimedia.org/wikipedia/commons/{a}/{a}{b}/{quote(img_name)}"
                good_ents.append((ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score, url))
        # st.columns(0) raises, so only build the layout if we found images
        if good_ents:
            cols = st.columns(len(good_ents))
            for i, ent in enumerate(good_ents):
                with cols[i]:
                    # Circular 100x100 avatar with the entity name underneath
                    components.html(f"<img style='border-radius: 50%;object-fit:cover;width:100px;height:100px' src='{ent[-1]}'/>", height=110, width=110)
                    st.caption(ent[0])
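
# A minimal way to run the demo, assuming the "entityfishing" component comes
# from the spacyfishing package and this file is saved as app.py (both are
# assumptions, not stated in the source):
#
#   pip install streamlit spacyfishing requests
#   python -m spacy download en_core_web_md
#   streamlit run app.py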