File size: 1,334 Bytes
6d139f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from underthesea import sent_tokenize


def substring(w, ls):
    """Return True if *w* is strictly contained in some other element of *ls*.

    An element equal to *w* never counts as a match; only elements that
    differ from *w* and contain it (via the ``in`` operator) do.
    """
    return any(w != candidate and w in candidate for candidate in ls)


def get_ner_phrases(sent_ner_result):
    """Merge tokens with consecutive indices into entity phrases.

    Args:
        sent_ner_result: Sequence of mappings, each providing the keys
            ``"word"``, ``"index"`` and ``"entity"``.
            NOTE(review): presumably the per-sentence output of a
            token-classification pipeline -- confirm against the caller.

    Returns:
        A list of ``(phrase, entity)`` tuples in input order. ``phrase`` is
        the space-joined ``"word"`` values of a run of tokens whose
        ``"index"`` values each increase by exactly one; ``entity`` is the
        ``"entity"`` value of the last token in that run. An empty input
        yields an empty list.
    """
    # Guard against empty input instead of failing on the [0] access below.
    if len(sent_ner_result) == 0:
        return []

    ner_list = []
    previous = sent_ner_result[0]
    current_words = [previous["word"]]
    for token in sent_ner_result[1:]:
        if token["index"] == previous["index"] + 1:
            # Token directly follows the previous one: same phrase.
            current_words.append(token["word"])
        else:
            # Gap in the indices: close the running phrase, labelled with
            # the tag of its final token, and start a new one.
            ner_list.append((' '.join(current_words), previous['entity']))
            current_words = [token["word"]]
        previous = token

    # Flush the phrase that was still open when the input ended.
    ner_list.append((' '.join(current_words), previous['entity']))
    return ner_list


def get_named_entities(nlp, doc):
    """Extract the distinct entity phrases found in *doc*.

    The document is split with ``sent_tokenize``; each sentence is passed to
    the callable *nlp*, and every non-empty result is grouped into phrases by
    ``get_ner_phrases``. Only phrases whose entity tag starts with ``'I'``
    are kept, duplicates are dropped (first occurrence wins), and phrases
    strictly contained in another kept phrase are discarded. Finally every
    ``" ##"`` sequence is stripped from the surviving phrases.
    NOTE(review): the ``" ##"`` removal looks like word-piece re-joining --
    confirm against the tokenizer used by *nlp*.

    Returns:
        A list of phrase strings in order of first appearance.
    """
    phrases = []
    for sentence in sent_tokenize(doc):
        tagged_tokens = nlp(sentence)
        if len(tagged_tokens) == 0:
            continue
        phrases.extend(get_ner_phrases(tagged_tokens))

    unique_entities = []
    for entity, ner_type in phrases:
        if entity in unique_entities:
            continue
        if not ner_type.startswith('I'):
            continue
        unique_entities.append(entity)

    result = []
    for entity in unique_entities:
        # Skip phrases that are part of a longer phrase.
        if substring(entity, unique_entities):
            continue
        result.append(entity.replace(" ##", ""))
    return result