Spaces:
Sleeping
Sleeping
# Natural Language Toolkit: Interface to MaltParser | |
# | |
# Author: Dan Garrette <[email protected]> | |
# Contributor: Liling Tan, Mustufain, osamamukhtar11 | |
# | |
# Copyright (C) 2001-2023 NLTK Project | |
# URL: <https://www.nltk.org/> | |
# For license information, see LICENSE.TXT | |
import inspect | |
import os | |
import subprocess | |
import sys | |
import tempfile | |
from nltk.data import ZipFilePathPointer | |
from nltk.internals import find_dir, find_file, find_jars_within_path | |
from nltk.parse.api import ParserI | |
from nltk.parse.dependencygraph import DependencyGraph | |
from nltk.parse.util import taggedsents_to_conll | |
def malt_regex_tagger():
    """
    Build the fallback regular-expression POS tagger.

    The tagger is assembled from groups of ``(pattern, tag)`` pairs covering
    punctuation, numbers, a handful of function words, prepositions and a
    few suffix heuristics, ending with a catch-all noun rule.

    :return: the bound ``tag`` method of the constructed ``RegexpTagger``
    """
    from nltk.tag import RegexpTagger

    # Full stop, comma, question mark, round and square brackets.
    punctuation_rules = [
        (r"\.$", "."),
        (r"\,$", ","),
        (r"\?$", "?"),
        (r"\($", "("),
        (r"\)$", ")"),
        (r"\[$", "["),
        (r"\]$", "]"),
    ]

    # Cardinal numbers, articles, pronouns and possessives.
    closed_class_rules = [
        (r"^-?[0-9]+(\.[0-9]+)?$", "CD"),
        (r"(The|the|A|a|An|an)$", "DT"),
        (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"),
        (r"(His|his|Her|her|Its|its)$", "PRP$"),
        (r"(my|Your|your|Yours|yours)$", "PRP$"),
    ]

    # Time prepositions first, then space prepositions; all share one tag.
    preposition_patterns = (
        r"(on|On|in|In|at|At|since|Since)$",
        r"(for|For|ago|Ago|before|Before)$",
        r"(till|Till|until|Until)$",
        r"(by|By|beside|Beside)$",
        r"(under|Under|below|Below)$",
        r"(over|Over|above|Above)$",
        r"(across|Across|through|Through)$",
        r"(into|Into|towards|Towards)$",
        r"(onto|Onto|from|From)$",
    )
    preposition_rules = [(pattern, "IN") for pattern in preposition_patterns]

    # Suffix heuristics; the last rule is the default (noun) for any token.
    suffix_rules = [
        (r".*able$", "JJ"),  # adjectives
        (r".*ness$", "NN"),  # nouns formed from adjectives
        (r".*ly$", "RB"),  # adverbs
        (r".*s$", "NNS"),  # plural nouns
        (r".*ing$", "VBG"),  # gerunds
        (r".*ed$", "VBD"),  # past tense verbs
        (r".*", "NN"),  # nouns (default)
    ]

    # The relative order of the rules is kept exactly as listed above.
    all_rules = (
        punctuation_rules + closed_class_rules + preposition_rules + suffix_rules
    )
    return RegexpTagger(all_rules).tag
def find_maltparser(parser_dirname):
    """
    Find the MaltParser .jar file and its dependencies.

    :param parser_dirname: an existing path to the MaltParser directory, or
        a directory name that is resolved with ``find_dir`` using the
        ``MALT_PARSER`` environment variable
    :type parser_dirname: str
    :return: the paths of all jar files found within the MaltParser directory
    :rtype: list(str)
    :raise AssertionError: if one of the required dependency jars, or the
        ``maltparser-*.jar`` file itself, is not among the jars found
    """
    if os.path.exists(parser_dirname):  # If a full path is given.
        malt_dir = parser_dirname
    else:  # Try to find path to maltparser directory in environment variables.
        malt_dir = find_dir(parser_dirname, env_vars=("MALT_PARSER",))

    jar_paths = set(find_jars_within_path(malt_dir))
    jar_names = {os.path.basename(jar) for jar in jar_paths}

    # Check that the found directory contains all the necessary .jar files.
    # The checks raise explicitly (instead of using ``assert``) so that they
    # still run under ``python -O`` and report what is missing.
    required_jars = {"log4j.jar", "libsvm.jar", "liblinear-1.8.jar"}
    missing_jars = required_jars - jar_names
    if missing_jars:
        raise AssertionError(
            "MaltParser dependencies not found in %r: %s"
            % (malt_dir, ", ".join(sorted(missing_jars)))
        )
    if not any(
        name.startswith("maltparser-") and name.endswith(".jar")
        for name in jar_names
    ):
        raise AssertionError("No maltparser-*.jar file found in %r" % (malt_dir,))
    return list(jar_paths)
def find_malt_model(model_filename):
    """
    Resolve the location of a pre-trained MaltParser model.

    :param model_filename: path or name of the model file, or None
    :return: ``"malt_temp.mco"`` when no model name is given; the argument
        itself when it is an existing path; otherwise whatever ``find_file``
        returns when searching with the ``MALT_MODEL`` environment variable
    """
    # No model supplied: use the placeholder name for a yet-to-be-trained model.
    if model_filename is None:
        return "malt_temp.mco"

    # A path that already exists is used as given.
    if os.path.exists(model_filename):
        return model_filename

    # Otherwise try to find the model through the environment variables.
    return find_file(model_filename, env_vars=("MALT_MODEL",), verbose=False)
class MaltParser(ParserI):
    """
    A class for dependency parsing with MaltParser. The input is the paths to:
    - (optionally) a maltparser directory
    - (optionally) the path to a pre-trained MaltParser .mco model file
    - (optionally) the tagger to use for POS tagging before parsing
    - (optionally) additional Java arguments

    Example:
        >>> from nltk.parse import malt
        >>> # With MALT_PARSER and MALT_MODEL environment set.
        >>> mp = malt.MaltParser(model_filename='engmalt.linear-1.7.mco') # doctest: +SKIP
        >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
        (shot I (elephant an) (in (pajamas my)) .)
        >>> # Without MALT_PARSER and MALT_MODEL environment.
        >>> mp = malt.MaltParser('/home/user/maltparser-1.9.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP
        >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
        (shot I (elephant an) (in (pajamas my)) .)
    """

    def __init__(
        self,
        parser_dirname="",
        model_filename=None,
        tagger=None,
        additional_java_args=None,
    ):
        """
        An interface for parsing with the Malt Parser.

        :param parser_dirname: The path to the maltparser directory that
            contains the maltparser-1.x.jar
        :type parser_dirname: str
        :param model_filename: The name of the pre-trained model with .mco file
            extension. If provided, training will not be required.
            (see http://www.maltparser.org/mco/mco.html and
            see http://www.patful.com/chalk/node/185)
        :type model_filename: str
        :param tagger: The tagger used to POS tag the raw string before
            formatting to CONLL format. It should behave like `nltk.pos_tag`
        :type tagger: function
        :param additional_java_args: This is the additional Java arguments that
            one can use when calling Maltparser, usually this is the heapsize
            limits, e.g. `additional_java_args=['-Xmx1024m']`
            (see https://goo.gl/mpDBvQ)
        :type additional_java_args: list
        """
        # Find all the necessary jar files for MaltParser.
        self.malt_jars = find_maltparser(parser_dirname)
        # Initialize additional java arguments.
        self.additional_java_args = (
            additional_java_args if additional_java_args is not None else []
        )
        # Initialize model.
        self.model = find_malt_model(model_filename)
        # "malt_temp.mco" is the placeholder name returned when no model
        # was supplied, i.e. the parser still has to be trained.
        self._trained = self.model != "malt_temp.mco"
        # Set the working_dir parameters i.e. `-w` from MaltParser's option.
        self.working_dir = tempfile.gettempdir()
        # Initialize POS tagger.
        self.tagger = tagger if tagger is not None else malt_regex_tagger()

    def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"):
        """
        Use MaltParser to parse multiple POS tagged sentences. Takes multiple
        sentences where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        This is a generator: nothing is executed until it is first advanced.
        The temporary input/output files are removed when the generator
        finishes, fails, or is closed before being exhausted.

        :param sentences: Input sentences to parse
        :type sentences: list(list(tuple(str, str)))
        :param verbose: if True, MaltParser's output is not captured
        :type verbose: bool
        :param top_relation_label: passed on to each ``DependencyGraph``
        :type top_relation_label: str
        :return: iter(iter(``DependencyGraph``)) the dependency graph
            representation of each sentence
        :raise Exception: if the parser has not been trained, or if
            MaltParser exits with a non-zero code
        """
        if not self._trained:
            raise Exception("Parser has not been trained. Call train() first.")

        input_path = None
        output_path = None
        try:
            # Convert list of sentences to CONLL format.
            with tempfile.NamedTemporaryFile(
                prefix="malt_input.conll.", dir=self.working_dir, mode="w", delete=False
            ) as input_file:
                input_path = input_file.name
                for line in taggedsents_to_conll(sentences):
                    input_file.write(str(line))

            # Reserve a name for the output; the handle is closed right away
            # so that nothing holds the file open while MaltParser writes it.
            with tempfile.NamedTemporaryFile(
                prefix="malt_output.conll.",
                dir=self.working_dir,
                mode="w",
                delete=False,
            ) as output_file:
                output_path = output_file.name

            # Generate command to run maltparser.
            cmd = self.generate_malt_command(input_path, output_path, mode="parse")
            ret = self._run_in_model_dir(cmd, verbose)
            if ret != 0:
                raise Exception(
                    "MaltParser parsing (%s) failed with exit "
                    "code %d" % (" ".join(cmd), ret)
                )

            with open(output_path) as infile:
                parsed_output = infile.read()

            # Must return iter(iter(Tree))
            for tree_str in parsed_output.split("\n\n"):
                yield (
                    iter(
                        [
                            DependencyGraph(
                                tree_str, top_relation_label=top_relation_label
                            )
                        ]
                    )
                )
        finally:
            # Runs on normal exhaustion, on errors and when the generator is
            # closed early, so the temporary files are never left behind.
            for path in (input_path, output_path):
                if path is not None and os.path.exists(path):
                    os.remove(path)

    def _run_in_model_dir(self, cmd, verbose):
        """
        Run ``cmd`` from the directory of the model file and restore the
        previous working directory afterwards, even if execution fails.

        :param cmd: the command, as generated by ``generate_malt_command``
        :type cmd: list(str)
        :param verbose: forwarded to ``_execute``
        :type verbose: bool
        :return: the exit code of the command
        :rtype: int
        """
        # This is a maltparser quirk, it needs to be run
        # where the model file is. otherwise it goes into an awkward
        # missing .jars or strange -w working_dir problem.
        previous_dir = os.getcwd()  # Remembers the current path.
        model_dir = os.path.dirname(self.model)
        try:
            if model_dir:
                try:  # Change to modelfile path
                    os.chdir(model_dir)
                except OSError:
                    # Best effort: stay in the current directory when the
                    # model directory cannot be entered.
                    pass
            return self._execute(cmd, verbose)  # Run command.
        finally:
            os.chdir(previous_dir)  # Change back to current path.

    def parse_sents(self, sentences, verbose=False, top_relation_label="null"):
        """
        Use MaltParser to parse multiple sentences.
        Takes a list of sentences, where each sentence is a list of words.
        Each sentence will be automatically tagged with this
        MaltParser instance's tagger.

        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
        :param verbose: forwarded to ``parse_tagged_sents``
        :type verbose: bool
        :param top_relation_label: forwarded to ``parse_tagged_sents``
        :type top_relation_label: str
        :return: iter(DependencyGraph)
        """
        tagged_sentences = (self.tagger(sentence) for sentence in sentences)
        return self.parse_tagged_sents(
            tagged_sentences, verbose, top_relation_label=top_relation_label
        )

    def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
        """
        This function generates the maltparser command use at the terminal.

        :param inputfilename: path to the input file
        :type inputfilename: str
        :param outputfilename: path to the output file; only added to the
            command when ``mode`` is ``"parse"``
        :type outputfilename: str
        :param mode: value passed to MaltParser's ``-m`` option; this class
            uses ``"parse"`` and ``"learn"``
        :type mode: str
        :return: the command as a list of arguments
        :rtype: list
        """
        cmd = ["java"]
        cmd += self.additional_java_args  # Adds additional java arguments
        # Joins classpaths with ";" if on Windows and on Linux/Mac use ":"
        classpaths_separator = ";" if sys.platform.startswith("win") else ":"
        cmd += [
            "-cp",
            classpaths_separator.join(self.malt_jars),
        ]  # Adds classpaths for jars
        cmd += ["org.maltparser.Malt"]  # Adds the main function.

        # Adds the model file.
        if os.path.exists(self.model):  # when parsing
            # Only the file name is passed; parsing runs from the model's
            # own directory (see ``_run_in_model_dir``).
            cmd += ["-c", os.path.basename(self.model)]
        else:  # when learning
            cmd += ["-c", self.model]

        cmd += ["-i", inputfilename]
        if mode == "parse":
            cmd += ["-o", outputfilename]
        cmd += ["-m", mode]  # mode use to generate parses.
        return cmd

    @staticmethod
    def _execute(cmd, verbose=False):
        """
        Run ``cmd`` as a subprocess and wait for it to finish.

        :param cmd: the command and its arguments
        :type cmd: list(str)
        :param verbose: if True the child's stdout/stderr are inherited;
            otherwise they are captured and discarded
        :type verbose: bool
        :return: the exit code of the process
        :rtype: int
        """
        output = None if verbose else subprocess.PIPE
        p = subprocess.Popen(cmd, stdout=output, stderr=output)
        # communicate() drains the pipes while waiting; a plain wait() can
        # block forever once the child fills an OS pipe buffer.
        p.communicate()
        return p.returncode

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of ``DependencyGraph`` objects

        :param depgraphs: list of ``DependencyGraph`` objects for training input data
        :type depgraphs: DependencyGraph
        :param verbose: forwarded to ``train_from_file``
        :type verbose: bool
        :raise Exception: if MaltParser exits with a non-zero code
        """
        input_path = None
        try:
            # Write the conll_str to a malt_train.conll file in the working dir.
            with tempfile.NamedTemporaryFile(
                prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False
            ) as input_file:
                input_path = input_file.name
                input_file.write("\n".join(dg.to_conll(10) for dg in depgraphs))
            # Trains the model with the malt_train.conll
            self.train_from_file(input_path, verbose=verbose)
        finally:
            # Removes the malt_train.conll whether or not training succeeded.
            if input_path is not None and os.path.exists(input_path):
                os.remove(input_path)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file

        :param conll_file: str for the filename of the training input data,
            or a ``ZipFilePathPointer`` whose content is first copied to a
            temporary file
        :type conll_file: str
        :param verbose: forwarded to ``_execute``
        :type verbose: bool
        :raise Exception: if MaltParser exits with a non-zero code
        """
        # If conll_file is a ZipFilePathPointer,
        # then we need to do some extra massaging
        if isinstance(conll_file, ZipFilePathPointer):
            with conll_file.open() as conll_input_file:
                conll_data = conll_input_file.read()
            # The stream may hand back bytes or text; write it unchanged in
            # the matching mode rather than stringifying it.
            write_mode = "wb" if isinstance(conll_data, bytes) else "w"
            temp_path = None
            try:
                with tempfile.NamedTemporaryFile(
                    prefix="malt_train.conll.",
                    dir=self.working_dir,
                    mode=write_mode,
                    delete=False,
                ) as input_file:
                    temp_path = input_file.name
                    input_file.write(conll_data)
                # The temp file is closed (and therefore flushed) before
                # MaltParser is asked to read it.
                return self.train_from_file(temp_path, verbose=verbose)
            finally:
                if temp_path is not None and os.path.exists(temp_path):
                    os.remove(temp_path)

        # Generate command to run maltparser.
        cmd = self.generate_malt_command(conll_file, mode="learn")
        ret = self._execute(cmd, verbose)
        if ret != 0:
            raise Exception(
                "MaltParser training (%s) failed with exit "
                "code %d" % (" ".join(cmd), ret)
            )
        self._trained = True
# Script entry point: runs the doctest examples of this module.
if __name__ == "__main__":
    # NOTE(review): the triple-quoted string below is a bare expression
    # statement inside this ``if`` block, not the docstring of a module,
    # class or function, so doctest.testmod() presumably does not collect
    # the examples in it -- confirm whether that is intended. The examples
    # also expect MALT_PARSER / MALT_MODEL to be set in the environment.
    """
    A demonstration function to show how NLTK users can use the malt parser API.
    >>> from nltk import pos_tag
    >>> assert 'MALT_PARSER' in os.environ, str(
    ... "Please set MALT_PARSER in your global environment, e.g.:\n"
    ... "$ export MALT_PARSER='/home/user/maltparser-1.9.2/'")
    >>>
    >>> assert 'MALT_MODEL' in os.environ, str(
    ... "Please set MALT_MODEL in your global environment, e.g.:\n"
    ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'")
    >>>
    >>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
    ... "2 sees _ VB _ _ 0 ROOT _ _\n"
    ... "3 a _ DT _ _ 4 SPEC _ _\n"
    ... "4 dog _ NN _ _ 2 OBJ _ _\n"
    ... "5 . _ . _ _ 2 PUNCT _ _\n")
    >>>
    >>>
    >>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
    ... "2 walks _ VB _ _ 0 ROOT _ _\n"
    ... "3 . _ . _ _ 2 PUNCT _ _\n")
    >>> dg1 = DependencyGraph(_dg1_str)
    >>> dg2 = DependencyGraph(_dg2_str)
    >>> # Initialize a MaltParser object
    >>> mp = MaltParser()
    >>>
    >>> # Trains a model.
    >>> mp.train([dg1,dg2], verbose=False)
    >>> sent1 = ['John','sees','Mary', '.']
    >>> sent2 = ['John', 'walks', 'a', 'dog', '.']
    >>>
    >>> # Parse a single sentence.
    >>> parsed_sent1 = mp.parse_one(sent1)
    >>> parsed_sent2 = mp.parse_one(sent2)
    >>> print(parsed_sent1.tree())
    (sees John Mary .)
    >>> print(parsed_sent2.tree())
    (walks John (dog a) .)
    >>>
    >>> # Parsing multiple sentences.
    >>> sentences = [sent1,sent2]
    >>> parsed_sents = mp.parse_sents(sentences)
    >>> print(next(next(parsed_sents)).tree())
    (sees John Mary .)
    >>> print(next(next(parsed_sents)).tree())
    (walks John (dog a) .)
    >>>
    >>> # Initialize a MaltParser object with an English pre-trained model.
    >>> parser_dirname = 'maltparser-1.9.2'
    >>> model_name = 'engmalt.linear-1.7.mco'
    >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)
    >>> sent1 = 'I shot an elephant in my pajamas .'.split()
    >>> sent2 = 'Time flies like banana .'.split()
    >>> # Parse a single sentence.
    >>> print(mp.parse_one(sent1).tree())
    (shot I (elephant an) (in (pajamas my)) .)
    # Parsing multiple sentences
    >>> sentences = [sent1,sent2]
    >>> parsed_sents = mp.parse_sents(sentences)
    >>> print(next(next(parsed_sents)).tree())
    (shot I (elephant an) (in (pajamas my)) .)
    >>> print(next(next(parsed_sents)).tree())
    (flies Time (like banana) .)
    """
    # Imported here so that doctest is only loaded when run as a script.
    import doctest

    doctest.testmod()