Spaces:

sunnychenxiwang
/

EasyDetect

Sleeping

File size: 19,312 Bytes

d916065

# Natural Language Toolkit: Interface to the Stanford Parser
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Steven Xu <[email protected]>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import os
import tempfile
import warnings
from subprocess import PIPE

from nltk.internals import (
    _java_options,
    config_java,
    find_jar_iter,
    find_jars_within_path,
    java,
)
from nltk.parse.api import ParserI
from nltk.parse.dependencygraph import DependencyGraph
from nltk.tree import Tree

_stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml"


class GenericStanfordParser(ParserI):
    """Interface to the Stanford Parser"""

    _MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar"
    _JAR = r"stanford-parser\.jar"
    _MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser"

    _USE_STDIN = False
    _DOUBLE_SPACED_OUTPUT = False

    def __init__(

        self,

        path_to_jar=None,

        path_to_models_jar=None,

        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",

        encoding="utf8",

        verbose=False,

        java_options="-mx4g",

        corenlp_options="",

    ):

        # find the most recent code and model jar
        stanford_jar = max(
            find_jar_iter(
                self._JAR,
                path_to_jar,
                env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            ),
            key=lambda model_path: os.path.dirname(model_path),
        )

        model_jar = max(
            find_jar_iter(
                self._MODEL_JAR_PATTERN,
                path_to_models_jar,
                env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"),
                searchpath=(),
                url=_stanford_url,
                verbose=verbose,
                is_regex=True,
            ),
            key=lambda model_path: os.path.dirname(model_path),
        )

        # self._classpath = (stanford_jar, model_jar)

        # Adding logging jar files to classpath
        stanford_dir = os.path.split(stanford_jar)[0]
        self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir))

        self.model_path = model_path
        self._encoding = encoding
        self.corenlp_options = corenlp_options
        self.java_options = java_options

    def _parse_trees_output(self, output_):
        res = []
        cur_lines = []
        cur_trees = []
        blank = False
        for line in output_.splitlines(False):
            if line == "":
                if blank:
                    res.append(iter(cur_trees))
                    cur_trees = []
                    blank = False
                elif self._DOUBLE_SPACED_OUTPUT:
                    cur_trees.append(self._make_tree("\n".join(cur_lines)))
                    cur_lines = []
                    blank = True
                else:
                    res.append(iter([self._make_tree("\n".join(cur_lines))]))
                    cur_lines = []
            else:
                cur_lines.append(line)
                blank = False
        return iter(res)

    def parse_sents(self, sentences, verbose=False):
        """

        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a

        list where each sentence is a list of words.

        Each sentence will be automatically tagged with this StanfordParser instance's

        tagger.

        If whitespaces exists inside a token, then the token will be treated as

        separate tokens.



        :param sentences: Input sentences to parse

        :type sentences: list(list(str))

        :rtype: iter(iter(Tree))

        """
        cmd = [
            self._MAIN_CLASS,
            "-model",
            self.model_path,
            "-sentences",
            "newline",
            "-outputFormat",
            self._OUTPUT_FORMAT,
            "-tokenized",
            "-escaper",
            "edu.stanford.nlp.process.PTBEscapingProcessor",
        ]
        return self._parse_trees_output(
            self._execute(
                cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose
            )
        )

    def raw_parse(self, sentence, verbose=False):
        """

        Use StanfordParser to parse a sentence. Takes a sentence as a string;

        before parsing, it will be automatically tokenized and tagged by

        the Stanford Parser.



        :param sentence: Input sentence to parse

        :type sentence: str

        :rtype: iter(Tree)

        """
        return next(self.raw_parse_sents([sentence], verbose))

    def raw_parse_sents(self, sentences, verbose=False):
        """

        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a

        list of strings.

        Each sentence will be automatically tokenized and tagged by the Stanford Parser.



        :param sentences: Input sentences to parse

        :type sentences: list(str)

        :rtype: iter(iter(Tree))

        """
        cmd = [
            self._MAIN_CLASS,
            "-model",
            self.model_path,
            "-sentences",
            "newline",
            "-outputFormat",
            self._OUTPUT_FORMAT,
        ]
        return self._parse_trees_output(
            self._execute(cmd, "\n".join(sentences), verbose)
        )

    def tagged_parse(self, sentence, verbose=False):
        """

        Use StanfordParser to parse a sentence. Takes a sentence as a list of

        (word, tag) tuples; the sentence must have already been tokenized and

        tagged.



        :param sentence: Input sentence to parse

        :type sentence: list(tuple(str, str))

        :rtype: iter(Tree)

        """
        return next(self.tagged_parse_sents([sentence], verbose))

    def tagged_parse_sents(self, sentences, verbose=False):
        """

        Use StanfordParser to parse multiple sentences. Takes multiple sentences

        where each sentence is a list of (word, tag) tuples.

        The sentences must have already been tokenized and tagged.



        :param sentences: Input sentences to parse

        :type sentences: list(list(tuple(str, str)))

        :rtype: iter(iter(Tree))

        """
        tag_separator = "/"
        cmd = [
            self._MAIN_CLASS,
            "-model",
            self.model_path,
            "-sentences",
            "newline",
            "-outputFormat",
            self._OUTPUT_FORMAT,
            "-tokenized",
            "-tagSeparator",
            tag_separator,
            "-tokenizerFactory",
            "edu.stanford.nlp.process.WhitespaceTokenizer",
            "-tokenizerMethod",
            "newCoreLabelTokenizerFactory",
        ]
        # We don't need to escape slashes as "splitting is done on the last instance of the character in the token"
        return self._parse_trees_output(
            self._execute(
                cmd,
                "\n".join(
                    " ".join(tag_separator.join(tagged) for tagged in sentence)
                    for sentence in sentences
                ),
                verbose,
            )
        )

    def _execute(self, cmd, input_, verbose=False):
        encoding = self._encoding
        cmd.extend(["-encoding", encoding])
        if self.corenlp_options:
            cmd.extend(self.corenlp_options.split())

        default_options = " ".join(_java_options)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)

        # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
        with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
            # Write the actual sentences to the temporary input file
            if isinstance(input_, str) and encoding:
                input_ = input_.encode(encoding)
            input_file.write(input_)
            input_file.flush()

            # Run the tagger and get the output.
            if self._USE_STDIN:
                input_file.seek(0)
                stdout, stderr = java(
                    cmd,
                    classpath=self._classpath,
                    stdin=input_file,
                    stdout=PIPE,
                    stderr=PIPE,
                )
            else:
                cmd.append(input_file.name)
                stdout, stderr = java(
                    cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
                )

            stdout = stdout.replace(b"\xc2\xa0", b" ")
            stdout = stdout.replace(b"\x00\xa0", b" ")
            stdout = stdout.decode(encoding)

        os.unlink(input_file.name)

        # Return java configurations to their default values.
        config_java(options=default_options, verbose=False)

        return stdout


class StanfordParser(GenericStanfordParser):
    """

    >>> parser=StanfordParser(

    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"

    ... ) # doctest: +SKIP



    >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE +SKIP

    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),

    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),

    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]



    >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((

    ...     "the quick brown fox jumps over the lazy dog",

    ...     "the quick grey wolf jumps over the lazy fox"

    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP

    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),

    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),

    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',

    [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP',

    [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),

    Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]



    >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((

    ...     "I 'm a dog".split(),

    ...     "This is my friends ' cat ( the tabby )".split(),

    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP

    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),

    Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',

    [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),

    Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', [Tree('', []),

    Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', [])])])])])])])]



    >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((

    ...     (

    ...         ("The", "DT"),

    ...         ("quick", "JJ"),

    ...         ("brown", "JJ"),

    ...         ("fox", "NN"),

    ...         ("jumped", "VBD"),

    ...         ("over", "IN"),

    ...         ("the", "DT"),

    ...         ("lazy", "JJ"),

    ...         ("dog", "NN"),

    ...         (".", "."),

    ...     ),

    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP

    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),

    Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',

    [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]

    """

    _OUTPUT_FORMAT = "penn"

    def __init__(self, *args, **kwargs):
        warnings.warn(
            "The StanfordParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.",
            DeprecationWarning,
            stacklevel=2,
        )

        super().__init__(*args, **kwargs)

    def _make_tree(self, result):
        return Tree.fromstring(result)


class StanfordDependencyParser(GenericStanfordParser):

    """

    >>> dep_parser=StanfordDependencyParser(

    ...     model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"

    ... ) # doctest: +SKIP



    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP

    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]



    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP

    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),

    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),

    ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),

    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]



    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((

    ...     "The quick brown fox jumps over the lazy dog.",

    ...     "The quick grey wolf jumps over the lazy fox."

    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP

    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),

    Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]



    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((

    ...     "I 'm a dog".split(),

    ...     "This is my friends ' cat ( the tabby )".split(),

    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP

    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]



    >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents((

    ...     (

    ...         ("The", "DT"),

    ...         ("quick", "JJ"),

    ...         ("brown", "JJ"),

    ...         ("fox", "NN"),

    ...         ("jumped", "VBD"),

    ...         ("over", "IN"),

    ...         ("the", "DT"),

    ...         ("lazy", "JJ"),

    ...         ("dog", "NN"),

    ...         (".", "."),

    ...     ),

    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP

    [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),

    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),

    ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),

    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]



    """

    _OUTPUT_FORMAT = "conll2007"

    def __init__(self, *args, **kwargs):
        warnings.warn(
            "The StanfordDependencyParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
            DeprecationWarning,
            stacklevel=2,
        )

        super().__init__(*args, **kwargs)

    def _make_tree(self, result):
        return DependencyGraph(result, top_relation_label="root")


class StanfordNeuralDependencyParser(GenericStanfordParser):
    """

    >>> from nltk.parse.stanford import StanfordNeuralDependencyParser # doctest: +SKIP

    >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')# doctest: +SKIP



    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP

    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]



    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP

    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det',

    (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'),

    u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')),

    ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det',

    (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'),

    u'punct', (u'.', u'.'))]]



    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((

    ...     "The quick brown fox jumps over the lazy dog.",

    ...     "The quick grey wolf jumps over the lazy fox."

    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP

    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over',

    'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']),

    Tree('fox', ['over', 'the', 'lazy']), '.'])]



    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((

    ...     "I 'm a dog".split(),

    ...     "This is my friends ' cat ( the tabby )".split(),

    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP

    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',

    ['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]

    """

    _OUTPUT_FORMAT = "conll"
    _MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
    _JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar"
    _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar"
    _USE_STDIN = True
    _DOUBLE_SPACED_OUTPUT = True

    def __init__(self, *args, **kwargs):
        warnings.warn(
            "The StanfordNeuralDependencyParser will be deprecated\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
            DeprecationWarning,
            stacklevel=2,
        )

        super().__init__(*args, **kwargs)
        self.corenlp_options += "-annotators tokenize,ssplit,pos,depparse"

    def tagged_parse_sents(self, sentences, verbose=False):
        """

        Currently unimplemented because the neural dependency parser (and

        the StanfordCoreNLP pipeline class) doesn't support passing in pre-

        tagged tokens.

        """
        raise NotImplementedError(
            "tagged_parse[_sents] is not supported by "
            "StanfordNeuralDependencyParser; use "
            "parse[_sents] or raw_parse[_sents] instead."
        )

    def _make_tree(self, result):
        return DependencyGraph(result, top_relation_label="ROOT")