File size: 1,169 Bytes
7f7285f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# -*- coding: utf-8 -*-

'''
@Author     : Jiangjie Chen
@Time       : 2020/11/12 21:19
@Contact    : [email protected]
@Description: Retrieve Wikipedia summary sentences for the entities linked in a claim.
'''

import wikipediaapi
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
# Import ELClient whether this file is run directly as a script (absolute
# import) or loaded as part of a package (relative import). Only ImportError
# triggers the fallback, so unrelated failures and KeyboardInterrupt are not
# silently swallowed.
try:
    from entitylinker import ELClient
except ImportError:
    from .entitylinker import ELClient


class DocRetrieval:
    """Retrieve Wikipedia summary sentences for entities linked in a claim.

    A claim is passed to an ``ELClient`` entity linker; for every linked
    Wikipedia title the page summary is fetched through ``wikipediaapi`` and
    split into sentences with NLTK's ``sent_tokenize``.
    """

    def __init__(self, link_type):
        """Create the Wikipedia client and the entity-linking client.

        Args:
            link_type: Forwarded unchanged to ``ELClient`` (the script entry
                point of this file passes ``'tagme'``).
        """
        self.wiki = wikipediaapi.Wikipedia('en')
        self.er_client = ELClient(link_type, verbose=True)

    def _get_page(self, title):
        """Return the summary of the page ``title`` as sentence tuples.

        Args:
            title: Wikipedia page title to look up.

        Returns:
            A list of ``(title, sentence_index, sentence, 0)`` tuples, one per
            sentence of the page summary, in order. The trailing ``0`` is a
            constant; its meaning is not evident from this file.
        """
        summary = self.wiki.page(title).summary
        return [
            (title, idx, sent, 0)
            for idx, sent in enumerate(sent_tokenize(summary))
        ]

    def retrieve_docs(self, claim):
        """Collect summary sentences for every page linked from ``claim``.

        Args:
            claim: Text handed to the entity linker.

        Returns:
            A flat list of the tuples produced by ``_get_page`` for each
            distinct linked title, in the order the titles are first seen.
            Link results without a title (empty or ``None``) are skipped, and
            a title linked several times is fetched only once so the same
            sentences are not returned repeatedly.
        """
        seen_titles = set()
        sents = []
        # Each link result is (text, label, kb_id, title); only the title is
        # needed to fetch the page.
        for _text, _label, _kb_id, title in self.er_client.link(claim):
            if not title or title in seen_titles:
                continue
            seen_titles.add(title)
            sents.extend(self._get_page(title))
        return sents


if __name__ == '__main__':
    # Manual smoke run: retrieve and print sentences for a lower-cased and a
    # properly-cased variant of the same claim using the 'tagme' link type.
    retriever = DocRetrieval('tagme')
    demo_claims = (
        'joe biden won the U.S. president.',
        'Joe Biden won the U.S. president.',
    )
    for demo_claim in demo_claims:
        print(retriever.retrieve_docs(demo_claim))