File size: 1,877 Bytes

d6504ae
8b513d0

from .TermCandidate import TermCandidate


class TermCandidateExtractor:
    """Iterate over the term candidates found in a parsed document.

    For every sentence in ``doc.sents`` the dependency tree is walked from the
    token labelled ``ROOT``. Each token tagged ``NOUN`` or ``PROPN`` yields one
    ``TermCandidate`` seeded with the single-token slice of that token and
    extended with slices covering its ``compound``/``amod`` modifiers and any
    ``of`` prepositional phrase attached to it.

    NOTE(review): the attributes used here (``sents``, ``dep_``, ``pos_``,
    ``children``, ``subtree``, ``i``) suggest ``doc`` is a spaCy ``Doc`` --
    confirm against the caller.
    """

    # Part-of-speech tags of tokens that head a term candidate.
    _HEAD_POS = ("PROPN", "NOUN")
    # Dependency labels of modifiers that are folded into the head's candidate.
    _MODIFIER_DEPS = ("compound", "amod")

    def __init__(self, doc):
        """Store the parsed document whose sentences will be scanned."""
        self.doc = doc

    def __iter__(self):
        """Yield every candidate of every sentence, in sentence order."""
        for sent in self.doc.sents:
            for candidate in self._get_candidates_in_sent(sent, self.doc):
                yield candidate

    def _get_candidates_in_sent(self, sent, doc):
        """Collect the term candidates of one sentence.

        Args:
            sent: iterable of the sentence's tokens.
            doc: the document the tokens belong to; sliced by token index to
                build the candidate spans.

        Returns:
            A list of ``TermCandidate`` objects in depth-first, pre-order
            traversal order. The list is empty when the sentence has no token
            whose ``dep_`` is ``"ROOT"`` (for example an empty sentence).
        """
        root = next((token for token in sent if token.dep_ == "ROOT"), None)
        if root is None:
            # Nothing to traverse; previously this raised IndexError.
            return []

        # Indices of modifier tokens already absorbed into their head's
        # candidate; their subtrees are not visited again.
        excluded = set()
        candidates = []

        def visit(node):
            if node.pos_ in self._HEAD_POS:
                candidates.append(self._build_candidate(node, doc, excluded))

            for child in node.children:
                if child.i not in excluded:
                    visit(child)

        visit(root)
        return candidates

    def _build_candidate(self, node, doc, excluded):
        """Build the ``TermCandidate`` headed by ``node``.

        The index of every ``compound``/``amod`` child is added to
        ``excluded`` so the caller skips that child's subtree.
        """
        candidate = TermCandidate(doc[node.i:node.i + 1])

        for child in node.children:
            start = min(node.i, child.i)
            end = max(node.i, child.i)

            if child.dep_ in self._MODIFIER_DEPS:
                subtree = list(child.subtree)
                # A subtree made only of compounds is taken as a whole, so the
                # span starts at its left-most token.
                if all(token.dep_ == "compound" for token in subtree):
                    start = min(token.i for token in subtree)
                candidate.append(doc[start:end + 1])

                # Compounds additionally contribute the span's first token on
                # its own; adjectival modifiers do not.
                if child.dep_ == "compound":
                    candidate.append(doc[start:start + 1])
                excluded.add(child.i)
            elif child.dep_ == "prep" and child.text == "of":
                # "X of Y": extend the span to the end of the "of" phrase.
                end = max(token.i for token in child.subtree)
                candidate.append(doc[start:end + 1])

        return candidate