Spaces:

sunnychenxiwang
/

EasyDetect

Sleeping

App Files Files Community

EasyDetect / pipeline /nltk /parse /stanford.py

sunnychenxiwang

update nltk

d916065 over 1 year ago

raw

history blame

19.3 kB

	# Natural Language Toolkit: Interface to the Stanford Parser
	#
	# Copyright (C) 2001-2023 NLTK Project
	# Author: Steven Xu <[email protected]>
	#
	# URL: <https://www.nltk.org/>
	# For license information, see LICENSE.TXT

	import os
	import tempfile
	import warnings
	from subprocess import PIPE

	from nltk.internals import (
	_java_options,
	config_java,
	find_jar_iter,
	find_jars_within_path,
	java,
	)
	from nltk.parse.api import ParserI
	from nltk.parse.dependencygraph import DependencyGraph
	from nltk.tree import Tree

	# Stanford Parser download page; handed to find_jar_iter() as its ``url``
	# argument whenever the parser or model jars are looked up.
	_stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml"


	class GenericStanfordParser(ParserI):
	    """Interface to the Stanford Parser.

	    Shared machinery for the concrete Stanford parser wrappers: it locates
	    the parser and model jars, builds the Java command line, runs it through
	    ``nltk.internals.java`` and splits the textual output into parses.

	    Subclasses are expected to supply ``_OUTPUT_FORMAT`` (the value passed to
	    ``-outputFormat``) and ``_make_tree(result)`` (turns the text of a single
	    parse into a parse object); neither is defined on this base class.
	    """

	    _MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar"
	    _JAR = r"stanford-parser\.jar"
	    _MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser"

	    # When true, the temporary input file is handed to Java as stdin instead
	    # of being appended to the command line as a file name.
	    _USE_STDIN = False
	    # When true, one blank line ends a parse and a second consecutive blank
	    # line ends the group of parses (see _parse_trees_output).
	    _DOUBLE_SPACED_OUTPUT = False

	    def __init__(
	        self,
	        path_to_jar=None,
	        path_to_models_jar=None,
	        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
	        encoding="utf8",
	        verbose=False,
	        java_options="-mx4g",
	        corenlp_options="",
	    ):
	        """
	        Locate the parser and model jars and remember the run-time settings.

	        :param path_to_jar: Passed to ``find_jar_iter`` when searching for
	            the jar matching ``_JAR``.
	        :param path_to_models_jar: Passed to ``find_jar_iter`` when searching
	            for the jar matching ``_MODEL_JAR_PATTERN``.
	        :param model_path: Value given to the parser's ``-model`` option.
	        :param encoding: Used to encode the input, decode the output, and as
	            the value of the ``-encoding`` option.
	        :param verbose: Passed to ``find_jar_iter``.
	        :param java_options: Passed to ``config_java`` before each run.
	        :param corenlp_options: Extra command-line options; split on
	            whitespace and appended to the Java command.
	        """
	        # Of all matching jars, keep the one whose directory sorts last
	        # (presumably the most recent release -- verify against the
	        # installation layout).
	        stanford_jar = max(
	            find_jar_iter(
	                self._JAR,
	                path_to_jar,
	                env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"),
	                searchpath=(),
	                url=_stanford_url,
	                verbose=verbose,
	                is_regex=True,
	            ),
	            key=os.path.dirname,
	        )

	        model_jar = max(
	            find_jar_iter(
	                self._MODEL_JAR_PATTERN,
	                path_to_models_jar,
	                env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"),
	                searchpath=(),
	                url=_stanford_url,
	                verbose=verbose,
	                is_regex=True,
	            ),
	            key=os.path.dirname,
	        )

	        # Adding logging jar files to classpath: every jar that sits next to
	        # the parser jar is included alongside the model jar.
	        stanford_dir = os.path.split(stanford_jar)[0]
	        self._classpath = tuple([model_jar] + find_jars_within_path(stanford_dir))

	        self.model_path = model_path
	        self._encoding = encoding
	        self.corenlp_options = corenlp_options
	        self.java_options = java_options

	    def _parse_trees_output(self, output_):
	        """
	        Split raw parser output into groups of parses.

	        Non-blank lines are accumulated; a blank line ends the current parse,
	        which is converted with ``self._make_tree``. Without
	        ``_DOUBLE_SPACED_OUTPUT`` every parse becomes its own one-element
	        group. With it, parses are collected until a second consecutive
	        blank line closes the group.

	        :param output_: Decoded standard output of the parser.
	        :type output_: str
	        :rtype: iter(iter(parse))
	        """
	        res = []
	        cur_lines = []
	        cur_trees = []
	        blank = False
	        for line in output_.splitlines(False):
	            if line == "":
	                if blank:
	                    # Second blank line in a row: the group is complete.
	                    res.append(iter(cur_trees))
	                    cur_trees = []
	                    blank = False
	                elif self._DOUBLE_SPACED_OUTPUT:
	                    # First blank line: one parse is complete, the group may
	                    # still continue.
	                    cur_trees.append(self._make_tree("\n".join(cur_lines)))
	                    cur_lines = []
	                    blank = True
	                else:
	                    res.append(iter([self._make_tree("\n".join(cur_lines))]))
	                    cur_lines = []
	            else:
	                cur_lines.append(line)
	                blank = False
	        return iter(res)

	    def parse_sents(self, sentences, verbose=False):
	        """
	        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
	        list where each sentence is a list of words.
	        Each sentence will be automatically tagged with this StanfordParser instance's
	        tagger.
	        If whitespace exists inside a token, then the token will be treated as
	        separate tokens.

	        :param sentences: Input sentences to parse
	        :type sentences: list(list(str))
	        :param verbose: Passed on to ``config_java``
	        :rtype: iter(iter(Tree))
	        """
	        cmd = [
	            self._MAIN_CLASS,
	            "-model",
	            self.model_path,
	            "-sentences",
	            "newline",
	            "-outputFormat",
	            self._OUTPUT_FORMAT,
	            "-tokenized",
	            "-escaper",
	            "edu.stanford.nlp.process.PTBEscapingProcessor",
	        ]
	        return self._parse_trees_output(
	            self._execute(
	                cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose
	            )
	        )

	    def raw_parse(self, sentence, verbose=False):
	        """
	        Use StanfordParser to parse a sentence. Takes a sentence as a string;
	        before parsing, it will be automatically tokenized and tagged by
	        the Stanford Parser.

	        :param sentence: Input sentence to parse
	        :type sentence: str
	        :param verbose: Passed on to ``config_java``
	        :rtype: iter(Tree)
	        """
	        return next(self.raw_parse_sents([sentence], verbose))

	    def raw_parse_sents(self, sentences, verbose=False):
	        """
	        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
	        list of strings.
	        Each sentence will be automatically tokenized and tagged by the Stanford Parser.

	        :param sentences: Input sentences to parse
	        :type sentences: list(str)
	        :param verbose: Passed on to ``config_java``
	        :rtype: iter(iter(Tree))
	        """
	        cmd = [
	            self._MAIN_CLASS,
	            "-model",
	            self.model_path,
	            "-sentences",
	            "newline",
	            "-outputFormat",
	            self._OUTPUT_FORMAT,
	        ]
	        return self._parse_trees_output(
	            self._execute(cmd, "\n".join(sentences), verbose)
	        )

	    def tagged_parse(self, sentence, verbose=False):
	        """
	        Use StanfordParser to parse a sentence. Takes a sentence as a list of
	        (word, tag) tuples; the sentence must have already been tokenized and
	        tagged.

	        :param sentence: Input sentence to parse
	        :type sentence: list(tuple(str, str))
	        :param verbose: Passed on to ``config_java``
	        :rtype: iter(Tree)
	        """
	        return next(self.tagged_parse_sents([sentence], verbose))

	    def tagged_parse_sents(self, sentences, verbose=False):
	        """
	        Use StanfordParser to parse multiple sentences. Takes multiple sentences
	        where each sentence is a list of (word, tag) tuples.
	        The sentences must have already been tokenized and tagged.

	        :param sentences: Input sentences to parse
	        :type sentences: list(list(tuple(str, str)))
	        :param verbose: Passed on to ``config_java``
	        :rtype: iter(iter(Tree))
	        """
	        tag_separator = "/"
	        cmd = [
	            self._MAIN_CLASS,
	            "-model",
	            self.model_path,
	            "-sentences",
	            "newline",
	            "-outputFormat",
	            self._OUTPUT_FORMAT,
	            "-tokenized",
	            "-tagSeparator",
	            tag_separator,
	            "-tokenizerFactory",
	            "edu.stanford.nlp.process.WhitespaceTokenizer",
	            "-tokenizerMethod",
	            "newCoreLabelTokenizerFactory",
	        ]
	        # We don't need to escape slashes as "splitting is done on the last instance of the character in the token"
	        return self._parse_trees_output(
	            self._execute(
	                cmd,
	                "\n".join(
	                    " ".join(tag_separator.join(tagged) for tagged in sentence)
	                    for sentence in sentences
	                ),
	                verbose,
	            )
	        )

	    def _execute(self, cmd, input_, verbose=False):
	        """
	        Run the Java parser on ``input_`` and return its decoded stdout.

	        The input is written to a temporary file that is either passed on
	        stdin (``_USE_STDIN``) or appended to the command line. The temporary
	        file is removed and the Java options are restored even when the Java
	        call fails.

	        :param cmd: Java main class followed by its arguments; not modified.
	        :type cmd: list(str)
	        :param input_: Text to parse; a ``str`` is encoded with the
	            instance encoding before being written.
	        :param verbose: Passed on to ``config_java``
	        :return: Standard output of the Java process, decoded with the
	            instance encoding.
	        :rtype: str
	        """
	        encoding = self._encoding
	        # Work on a copy so the caller's list is left untouched.
	        cmd = list(cmd)
	        cmd.extend(["-encoding", encoding])
	        if self.corenlp_options:
	            cmd.extend(self.corenlp_options.split())

	        # Remember the current options so they can be put back afterwards.
	        default_options = " ".join(_java_options)

	        # Configure java.
	        config_java(options=self.java_options, verbose=verbose)

	        input_path = None
	        try:
	            # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
	            with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
	                input_path = input_file.name

	                # Write the actual sentences to the temporary input file
	                if isinstance(input_, str) and encoding:
	                    input_ = input_.encode(encoding)
	                input_file.write(input_)
	                input_file.flush()

	                # Run the tagger and get the output.
	                if self._USE_STDIN:
	                    # Rewind so the child process reads from the start.
	                    input_file.seek(0)
	                    stdout, _stderr = java(
	                        cmd,
	                        classpath=self._classpath,
	                        stdin=input_file,
	                        stdout=PIPE,
	                        stderr=PIPE,
	                    )
	                else:
	                    cmd.append(input_file.name)
	                    stdout, _stderr = java(
	                        cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE
	                    )
	        finally:
	            # Clean up on success and on failure alike. The file is removed
	            # only after the ``with`` block has closed it.
	            try:
	                if input_path is not None:
	                    os.unlink(input_path)
	            finally:
	                # Return java configurations to their default values.
	                config_java(options=default_options, verbose=False)

	        # Replace no-break spaces (UTF-8 bytes C2 A0, and the byte pair 00 A0)
	        # with plain spaces before decoding.
	        stdout = stdout.replace(b"\xc2\xa0", b" ")
	        stdout = stdout.replace(b"\x00\xa0", b" ")
	        return stdout.decode(encoding)


	class StanfordParser(GenericStanfordParser):
	    """
	    Stanford constituency parser wrapper: requests ``penn`` output and turns
	    each parse into a ``Tree``.

	    >>> parser=StanfordParser(
	    ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
	    ... ) # doctest: +SKIP

	    >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE +SKIP
	    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
	    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
	    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])]

	    >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents((
	    ... "the quick brown fox jumps over the lazy dog",
	    ... "the quick grey wolf jumps over the lazy fox"
	    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
	    [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
	    Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']),
	    Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP',
	    [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP',
	    [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']),
	    Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])]

	    >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents((
	    ... "I 'm a dog".split(),
	    ... "This is my friends ' cat ( the tabby )".split(),
	    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
	    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]),
	    Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP',
	    [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']),
	    Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', [Tree('', []),
	    Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', [])])])])])])])]

	    >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents((
	    ... (
	    ... ("The", "DT"),
	    ... ("quick", "JJ"),
	    ... ("brown", "JJ"),
	    ... ("fox", "NN"),
	    ... ("jumped", "VBD"),
	    ... ("over", "IN"),
	    ... ("the", "DT"),
	    ... ("lazy", "JJ"),
	    ... ("dog", "NN"),
	    ... (".", "."),
	    ... ),
	    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP
	    [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']),
	    Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP',
	    [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])]
	    """

	    _OUTPUT_FORMAT = "penn"

	    def __init__(self, *args, **kwargs):
	        """
	        Emit a ``DeprecationWarning`` and forward every positional and
	        keyword argument unchanged to ``GenericStanfordParser.__init__``.
	        """
	        warnings.warn(
	            "The StanfordParser will be deprecated\n"
	            "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.",
	            DeprecationWarning,
	            # Attribute the warning to the code that instantiates the parser.
	            stacklevel=2,
	        )

	        super().__init__(*args, **kwargs)

	    def _make_tree(self, result):
	        """Build a ``Tree`` from the bracketed text of one parse."""
	        return Tree.fromstring(result)


	class StanfordDependencyParser(GenericStanfordParser):
	    """
	    Stanford dependency parser wrapper: requests ``conll2007`` output and
	    turns each parse into a ``DependencyGraph``.

	    >>> dep_parser=StanfordDependencyParser(
	    ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
	    ... ) # doctest: +SKIP

	    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
	    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])]

	    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
	    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
	    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
	    ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
	    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]

	    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
	    ... "The quick brown fox jumps over the lazy dog.",
	    ... "The quick grey wolf jumps over the lazy fox."
	    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
	    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]),
	    Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])]

	    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
	    ... "I 'm a dog".split(),
	    ... "This is my friends ' cat ( the tabby )".split(),
	    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
	    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])]

	    >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents((
	    ... (
	    ... ("The", "DT"),
	    ... ("quick", "JJ"),
	    ... ("brown", "JJ"),
	    ... ("fox", "NN"),
	    ... ("jumped", "VBD"),
	    ... ("over", "IN"),
	    ... ("the", "DT"),
	    ... ("lazy", "JJ"),
	    ... ("dog", "NN"),
	    ... (".", "."),
	    ... ),
	    ... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP
	    [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')),
	    ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')),
	    ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')),
	    ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]]

	    """

	    _OUTPUT_FORMAT = "conll2007"

	    def __init__(self, *args, **kwargs):
	        """
	        Emit a ``DeprecationWarning`` and forward every positional and
	        keyword argument unchanged to ``GenericStanfordParser.__init__``.
	        """
	        warnings.warn(
	            "The StanfordDependencyParser will be deprecated\n"
	            "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
	            DeprecationWarning,
	            # Attribute the warning to the code that instantiates the parser.
	            stacklevel=2,
	        )

	        super().__init__(*args, **kwargs)

	    def _make_tree(self, result):
	        """Build a ``DependencyGraph`` (top relation ``root``) from one parse."""
	        return DependencyGraph(result, top_relation_label="root")


	class StanfordNeuralDependencyParser(GenericStanfordParser):
	    """
	    Dependency parser wrapper that runs the ``StanfordCoreNLP`` pipeline
	    class with the ``depparse`` annotator, reads ``conll`` output and turns
	    each parse into a ``DependencyGraph``.

	    >>> from nltk.parse.stanford import StanfordNeuralDependencyParser # doctest: +SKIP
	    >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')# doctest: +SKIP

	    >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
	    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])]

	    >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP
	    [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det',
	    (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'),
	    u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')),
	    ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det',
	    (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'),
	    u'punct', (u'.', u'.'))]]

	    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents((
	    ... "The quick brown fox jumps over the lazy dog.",
	    ... "The quick grey wolf jumps over the lazy fox."
	    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
	    [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over',
	    'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']),
	    Tree('fox', ['over', 'the', 'lazy']), '.'])]

	    >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents((
	    ... "I 'm a dog".split(),
	    ... "This is my friends ' cat ( the tabby )".split(),
	    ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP
	    [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends',
	    ['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])]
	    """

	    _OUTPUT_FORMAT = "conll"
	    _MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
	    _JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar"
	    _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar"
	    _USE_STDIN = True
	    _DOUBLE_SPACED_OUTPUT = True

	    def __init__(self, *args, **kwargs):
	        """
	        Emit a ``DeprecationWarning``, forward every positional and keyword
	        argument unchanged to ``GenericStanfordParser.__init__``, then append
	        the ``-annotators`` option to ``corenlp_options``.
	        """
	        warnings.warn(
	            "The StanfordNeuralDependencyParser will be deprecated\n"
	            "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.",
	            DeprecationWarning,
	            # Attribute the warning to the code that instantiates the parser.
	            stacklevel=2,
	        )

	        super().__init__(*args, **kwargs)
	        # The options string is later split on whitespace, so the annotator
	        # flag must be separated from any caller-supplied options by a space;
	        # strip() keeps the value unchanged when no options were given.
	        annotators = "-annotators tokenize,ssplit,pos,depparse"
	        self.corenlp_options = (self.corenlp_options + " " + annotators).strip()

	    def tagged_parse_sents(self, sentences, verbose=False):
	        """
	        Currently unimplemented because the neural dependency parser (and
	        the StanfordCoreNLP pipeline class) doesn't support passing in pre-
	        tagged tokens.

	        :raises NotImplementedError: always
	        """
	        raise NotImplementedError(
	            "tagged_parse[_sents] is not supported by "
	            "StanfordNeuralDependencyParser; use "
	            "parse[_sents] or raw_parse[_sents] instead."
	        )

	    def _make_tree(self, result):
	        """Build a ``DependencyGraph`` (top relation ``ROOT``) from one parse."""
	        return DependencyGraph(result, top_relation_label="ROOT")