File size: 1,877 Bytes
d6504ae 8b513d0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
from .TermCandidate import TermCandidate
class TermCandidateExtractor:
    """Iterate over the term candidates found in a dependency-parsed document.

    The document is expected to expose ``sents`` (an iterable of sentences,
    each an iterable of tokens) and to support slicing by token index.
    Tokens must provide ``i``, ``dep_``, ``pos_``, ``text``, ``children`` and
    ``subtree``. These attribute names match spaCy's ``Doc``/``Token`` API --
    presumably that is what callers pass in; confirm against the call sites.

    Iterating the extractor yields one ``TermCandidate`` per noun or proper
    noun reachable from each sentence root, in pre-order.
    """

    # Part-of-speech tags of tokens that may head a term candidate.
    _HEAD_POS = frozenset({"PROPN", "NOUN"})
    # Dependency labels of children that get merged into their head's term.
    _MODIFIER_DEPS = frozenset({"compound", "amod"})

    def __init__(self, doc):
        """Store the document whose sentences will be scanned."""
        self.doc = doc

    def __iter__(self):
        """Yield every term candidate of every sentence, in document order."""
        for sent in self.doc.sents:
            yield from self._get_candidates_in_sent(sent, self.doc)

    def _get_candidates_in_sent(self, sent, doc):
        """Collect the term candidates of a single sentence.

        Args:
            sent: Iterable of the sentence's tokens.
            doc: The document the tokens belong to; sliced by token index to
                build the spans stored in each candidate.

        Returns:
            A list of ``TermCandidate`` objects, ordered by a pre-order walk
            of the dependency tree starting at the sentence root. The list is
            empty when the sentence has no token whose ``dep_`` is ``"ROOT"``
            (for example an empty or unparsed sentence).
        """
        root = next((token for token in sent if token.dep_ == "ROOT"), None)
        if root is None:
            # Nothing to walk: no candidates rather than an IndexError.
            return []

        candidates = []
        # Explicit stack instead of recursion so that very deep parse trees
        # cannot exhaust the interpreter's recursion limit.
        pending = [root]
        while pending:
            node = pending.pop()
            merged = set()
            if node.pos_ in self._HEAD_POS:
                candidate, merged = self._build_candidate(node, doc)
                candidates.append(candidate)
            # Push in reverse so children are popped in their original order,
            # which keeps the pre-order of a recursive walk.
            pending.extend(
                child
                for child in reversed(list(node.children))
                if child.i not in merged
            )
        return candidates

    def _build_candidate(self, node, doc):
        """Build the term candidate headed by ``node``.

        Args:
            node: A token whose ``pos_`` is ``"NOUN"`` or ``"PROPN"``.
            doc: The document, sliced by token index to obtain spans.

        Returns:
            A ``(candidate, merged)`` tuple. ``candidate`` is the
            ``TermCandidate`` seeded with the single-token span of ``node``
            and extended with its multi-token variants. ``merged`` is the set
            of token indices of children folded into the candidate, which the
            caller must not visit as heads of their own.
        """
        candidate = TermCandidate(doc[node.i:node.i + 1])
        merged = set()
        for child in node.children:
            start = min(node.i, child.i)
            end = max(node.i, child.i)

            if child.dep_ in self._MODIFIER_DEPS:
                subtree = list(child.subtree)
                # A modifier made up purely of compounds is taken as a whole,
                # so the span starts at its left-most token.
                if all(token.dep_ == "compound" for token in subtree):
                    start = min(token.i for token in subtree)
                candidate.append(doc[start:end + 1])
                if child.dep_ != "amod":
                    # For compounds, also record the first token on its own.
                    candidate.append(doc[start:start + 1])
                # NOTE(review): assumes both compound and amod children are
                # excluded from the later walk -- confirm against the
                # original nesting of this statement.
                merged.add(child.i)

            if child.dep_ == "prep" and child.text == "of":
                # "X of Y": extend the span to the end of the "of" phrase.
                end = max(token.i for token in child.subtree)
                candidate.append(doc[start:end + 1])
        return candidate, merged
|