File size: 2,659 Bytes
7f7285f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# -*- coding: utf-8 -*-

'''
@Author     : Jiangjie Chen
@Time       : 2020/5/11 19:08
@Contact    : [email protected]
@Description: Entity linking client backed by either the TagMe service or a spaCy pipeline loaded from disk.
'''

import os
import tagme


def read_title_id(entity_def_path):
    """Load an ID -> title mapping from a pipe-separated entity definition file.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is expected to have the form ``<title>|<id>``.

    Args:
        entity_def_path: Path to the definition file, read as UTF-8.

    Returns:
        dict mapping each ID string to its title string. If an ID occurs
        more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank data line contains no ``|`` separator.
    """
    id_to_title = {}
    with open(entity_def_path, 'r', encoding='UTF-8') as f:
        # Skip the header row; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # Split on the LAST pipe only: the ID is the final field, while
            # the title itself may legitimately contain '|' characters.
            title, entity_id = line.rsplit('|', 1)
            id_to_title[entity_id] = title

    return id_to_title


class ELClient:
    """Entity-linking client with two interchangeable backends.

    ``link_type='tagme'`` annotates text through the ``tagme`` package;
    ``link_type='spacy'`` runs a spaCy pipeline loaded from ``prefix``.
    Use :meth:`link` as the single entry point for both backends.
    """

    def __init__(self, link_type, min_rho=0.1, prefix=None, verbose=False):
        """Configure the selected backend.

        Args:
            link_type: ``'tagme'`` or ``'spacy'``.
            min_rho: Threshold passed to ``get_annotations(min_rho=...)``;
                only used by the tagme backend.
            prefix: Directory of the spaCy pipeline, which must also contain
                ``entity_defs.csv``; required by the spacy backend.
            verbose: When true, progress messages are printed.

        Raises:
            KeyError: tagme backend and ``TAGME_APIKEY`` is not set in the
                environment.
            ValueError: spacy backend and ``prefix`` is ``None``.
            NotImplementedError: ``link_type`` is not a supported backend.
        """
        self.verbose = verbose
        self.link_type = link_type
        if link_type == 'tagme':
            self.min_rho = min_rho
            # The tagme package reads its API token from this module global.
            tagme.GCUBE_TOKEN = os.environ['TAGME_APIKEY']
        elif link_type == 'spacy':
            # Explicit check instead of ``assert`` so validation still runs
            # when Python is started with -O.
            if prefix is None:
                raise ValueError("prefix is required when link_type is 'spacy'")
            self.init_spacy_linker(prefix)
        else:
            raise NotImplementedError(link_type)

    def init_spacy_linker(self, prefix):
        """Load the spaCy pipeline and the ID -> title table from ``prefix``.

        Sets ``self.nlp`` (the loaded pipeline) and ``self.id2title`` (the
        mapping read from ``<prefix>/entity_defs.csv``).
        """
        # Imported here so that ``spacy`` is actually in scope for the call
        # below, and so the tagme backend works without spaCy installed.
        import spacy

        entity_def_path = f"{prefix}/entity_defs.csv"
        self._print('* Loading entity linker...')
        self.nlp = spacy.load(prefix)
        self.id2title = read_title_id(entity_def_path)
        self._print('* Entity linker loaded.')

    def _tagme_link(self, text):
        """Annotate ``text`` with TagMe.

        Returns:
            list of ``(mention, score, entity_id, entity_title)`` tuples,
            sorted by score in descending order. ``mention`` is the slice of
            ``text`` between the annotation's ``begin`` and ``end`` offsets.
            An empty list is returned when the service yields no response.
        """
        response = tagme.annotate(text, long_text=1)
        if response is None:
            # NOTE(review): tagme.annotate appears to return None when the
            # request does not produce a usable response; treat that as
            # "no entities" instead of failing on None.get_annotations.
            self._print('* TagMe returned no response.')
            return []

        result = [
            (text[ann.begin:ann.end], ann.score, ann.entity_id, ann.entity_title)
            for ann in response.get_annotations(min_rho=self.min_rho)
        ]
        result.sort(key=lambda x: x[1], reverse=True)
        return result

    def link(self, text):
        """Link entities in ``text`` using the backend chosen at construction.

        Returns:
            The tagme backend's result when ``link_type`` is ``'tagme'``,
            otherwise the spacy backend's result (see the private helpers
            for the tuple layouts, which differ in their second field).
        """
        if self.link_type == 'tagme':
            return self._tagme_link(text)
        else:
            return self._spacy_link(text)

    def _spacy_link(self, text):
        """Run the spaCy pipeline over the normalized ``text``.

        Returns:
            list of ``(mention, label, kb_id, title)`` tuples, one for each
            entity in ``doc.ents`` whose ``kb_id_`` is not ``'NIL'``.
            ``title`` is the empty string when the ID is not in
            ``self.id2title``.
        """
        text = self._preprocess_text(text)
        doc = self.nlp(text)
        ents = [(e.text, e.label_, e.kb_id_, self.id2title.get(e.kb_id_, ''))
                for e in doc.ents if e.kb_id_ != 'NIL']
        return ents

    def _preprocess_text(self, text):
        """Normalize input into a single string.

        A list is joined with single spaces; the result is stripped and the
        bracket tokens ``-lrb-`` / ``-rrb-`` are replaced by ``(`` / ``)``.
        """
        if isinstance(text, list):
            text = ' '.join(text)
        text = text.strip().replace('-lrb-', '(').replace('-rrb-', ')')
        return text

    def _print(self, x):
        """Print ``x`` only when the client was created with ``verbose``."""
        if self.verbose:
            print(x)


if __name__ == '__main__':
    # Manual smoke test: link one sample sentence through the tagme backend
    # (needs TAGME_APIKEY in the environment) and print the result.
    client = ELClient(link_type='tagme', verbose=True)
    print(client.link('Jeff Dean wants to meet Yoshua Bengio.'))