Spaces:
Runtime error
Runtime error
File size: 1,334 Bytes
6d139f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
from underthesea import sent_tokenize
def substring(w, ls):
    """Tell whether ``w`` occurs inside a *different* string of ``ls``.

    Args:
        w: The string to look for.
        ls: Iterable of candidate strings.

    Returns:
        True if at least one element of ``ls`` is not equal to ``w`` and
        contains ``w``; False otherwise (including when ``ls`` is empty).
    """
    return any(w != candidate and w in candidate for candidate in ls)
def get_ner_phrases(sent_ner_result):
    """Merge tokens with consecutive indices into multi-word phrases.

    Args:
        sent_ner_result: Sequence of dicts, each providing the keys
            ``"word"``, ``"index"`` and ``"entity"``.
            NOTE(review): presumably the output of a token-classification
            pipeline for one sentence -- confirm against the caller.

    Returns:
        A list of ``(phrase, entity)`` tuples in input order. ``phrase`` is
        the space-joined words of a run of tokens whose ``"index"`` values
        increase by exactly one; ``entity`` is the ``"entity"`` value of the
        LAST token in that run. An empty input yields an empty list.
    """
    # Guard against empty input instead of failing on the first element.
    if not sent_ner_result:
        return []

    ner_list = []
    current_ner = []
    previous = None
    for token in sent_ner_result:
        # A gap in the index sequence closes the phrase collected so far.
        if previous is not None and token["index"] != previous["index"] + 1:
            ner_list.append((' '.join(current_ner), previous['entity']))
            current_ner = []
        current_ner.append(token["word"])
        # Track the latest token on every iteration so that runs longer
        # than two tokens keep being recognised as consecutive.
        previous = token

    # Flush the final phrase, labelled with its last token's entity.
    ner_list.append((' '.join(current_ner), previous['entity']))
    return ner_list
def get_named_entities(nlp, doc):
    """Collect the distinct named-entity phrases found in ``doc``.

    The document is split into sentences with ``sent_tokenize``; ``nlp`` is
    called on each sentence and every non-empty result is grouped into
    phrases by ``get_ner_phrases``.

    Args:
        nlp: Callable applied to one sentence string; its result is passed
            to ``get_ner_phrases`` when it is non-empty.
        doc: The text to analyse.

    Returns:
        A list of entity strings, in order of first appearance, restricted
        to phrases whose entity label starts with ``'I'``, with duplicates
        removed, with phrases that are contained in another kept phrase
        dropped, and with every ``" ##"`` sequence removed from the text.
    """
    phrases = []
    for sentence in sent_tokenize(doc):
        tagged = nlp(sentence)
        if len(tagged) == 0:
            continue
        phrases.extend(get_ner_phrases(tagged))

    # Keep the first occurrence of each phrase whose label starts with 'I'.
    unique_entities = []
    for phrase, label in phrases:
        if phrase in unique_entities:
            continue
        if label.startswith('I'):
            unique_entities.append(phrase)

    # Drop phrases nested inside a longer kept phrase; the " ##" markers are
    # removed only after that containment check, on the surviving phrases.
    result = []
    for phrase in unique_entities:
        if substring(phrase, unique_entities):
            continue
        result.append(phrase.replace(" ##", ""))
    return result
|