Spaces:
Sleeping
Sleeping
# Natural Language Toolkit: Chart Parser for Feature-Based Grammars
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Rob Speer <[email protected]>
#         Peter Ljunglöf <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Extension of chart parsing implementation to handle grammars with
feature structures as nodes.
"""
from time import perf_counter | |
from nltk.featstruct import TYPE, FeatStruct, find_variables, unify | |
from nltk.grammar import ( | |
CFG, | |
FeatStructNonterminal, | |
Nonterminal, | |
Production, | |
is_nonterminal, | |
is_terminal, | |
) | |
from nltk.parse.chart import ( | |
BottomUpPredictCombineRule, | |
BottomUpPredictRule, | |
CachedTopDownPredictRule, | |
Chart, | |
ChartParser, | |
EdgeI, | |
EmptyPredictRule, | |
FundamentalRule, | |
LeafInitRule, | |
SingleEdgeFundamentalRule, | |
TopDownInitRule, | |
TreeEdge, | |
) | |
from nltk.sem import logic | |
from nltk.tree import Tree | |
# ////////////////////////////////////////////////////////////
# Tree Edge
# ////////////////////////////////////////////////////////////
class FeatureTreeEdge(TreeEdge):
    """
    A specialized tree edge that allows shared variable bindings
    between nonterminals on the left-hand side and right-hand side.

    Each ``FeatureTreeEdge`` contains a set of ``bindings``, i.e., a
    dictionary mapping from variables to values.  If the edge is not
    complete, then these bindings are simply stored.  However, if the
    edge is complete, then the constructor applies these bindings to
    every ``FeatStructNonterminal`` in the edge (see ``_bind``) and
    then discards them.
    """

    def __init__(self, span, lhs, rhs, dot=0, bindings=None):
        """
        Construct a new edge.  If the edge is incomplete (i.e., if
        ``dot<len(rhs)``), then store the bindings as-is.  If the edge
        is complete (i.e., if ``dot==len(rhs)``), then apply the
        bindings to all nonterminals in ``lhs`` and ``rhs``, and then
        clear the bindings.  See ``TreeEdge`` for a description of
        the other arguments.

        :param bindings: Variable bindings for this edge; ``None`` is
            treated as an empty dictionary.
        :type bindings: dict or None
        """
        if bindings is None:
            bindings = {}

        # If the edge is complete, then substitute in the bindings,
        # and then throw them away.  (If we didn't throw them away, we
        # might think that 2 complete edges are different just because
        # they have different bindings, even though all bindings have
        # already been applied.)
        if dot == len(rhs) and bindings:
            lhs = self._bind(lhs, bindings)
            rhs = [self._bind(elt, bindings) for elt in rhs]
            bindings = {}

        # Initialize the edge.
        TreeEdge.__init__(self, span, lhs, rhs, dot)
        self._bindings = bindings
        # Extend the key set up by ``TreeEdge.__init__`` so that edges
        # differing only in their (unapplied) bindings compare as distinct.
        self._comparison_key = (self._comparison_key, tuple(sorted(bindings.items())))

    @staticmethod
    def from_production(production, index):
        """
        :return: A new ``FeatureTreeEdge`` formed from the given
            production.  The new edge's left-hand side and right-hand
            side will be taken from ``production``; its span will be
            ``(index,index)``; and its dot position will be ``0``.
        :rtype: FeatureTreeEdge
        """
        # Declared static: the function takes no ``self``, so without the
        # decorator it could only be called through the class, never
        # through an instance.
        return FeatureTreeEdge(
            span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0
        )

    def move_dot_forward(self, new_end, bindings=None):
        """
        :return: A new ``FeatureTreeEdge`` formed from this edge.
            The new edge's dot position is increased by ``1``,
            and its end index will be replaced by ``new_end``.
        :rtype: FeatureTreeEdge
        :param new_end: The new end index.
        :type new_end: int
        :param bindings: Bindings for the new edge.
        :type bindings: dict
        """
        return FeatureTreeEdge(
            span=(self._span[0], new_end),
            lhs=self._lhs,
            rhs=self._rhs,
            dot=self._dot + 1,
            bindings=bindings,
        )

    def _bind(self, nt, bindings):
        """
        Apply ``bindings`` to ``nt`` if it is a ``FeatStructNonterminal``;
        any other symbol is returned unchanged.
        """
        if not isinstance(nt, FeatStructNonterminal):
            return nt
        return nt.substitute_bindings(bindings)

    def next_with_bindings(self):
        """
        Return the symbol after the dot with this edge's bindings applied.
        """
        return self._bind(self.nextsym(), self._bindings)

    def bindings(self):
        """
        Return a copy of this edge's bindings dictionary.
        """
        return self._bindings.copy()

    def variables(self):
        """
        :return: The set of variables used by this edge.
        :rtype: set(Variable)
        """
        # Variables are collected from the lhs, the rhs, and both the
        # keys and the values of the stored bindings.
        return find_variables(
            [self._lhs]
            + list(self._rhs)
            + list(self._bindings.keys())
            + list(self._bindings.values()),
            fs_class=FeatStruct,
        )

    def __str__(self):
        """
        Return the ``TreeEdge`` string form; for an incomplete edge the
        sorted bindings are appended in braces.
        """
        if self.is_complete():
            return super().__str__()
        else:
            bindings = "{%s}" % ", ".join(
                "%s: %r" % item for item in sorted(self._bindings.items())
            )
            return f"{super().__str__()} {bindings}"
# ////////////////////////////////////////////////////////////
# A specialized Chart for feature grammars
# ////////////////////////////////////////////////////////////

# TODO: subsumes check when adding new edges
class FeatureChart(Chart):
    """
    A Chart for feature grammars.

    Index lookups are keyed on the ``TYPE`` feature of dict-like values
    (see ``_get_type_if_possible``) rather than on the full value.

    :see: ``Chart`` for more information.
    """

    def select(self, **restrictions):
        """
        Return an iterator over the edges in this chart that match the
        given ``restrictions``.  See ``Chart.select`` for more
        information about the ``restrictions`` on the edges.
        """
        # With nothing to filter on, every edge qualifies.
        if not restrictions:
            return iter(self._edges)

        # Indexes are keyed by the sorted tuple of restriction names;
        # build the index lazily the first time it is requested.
        index_key = tuple(sorted(restrictions.keys()))
        if index_key not in self._indexes:
            self._add_index(index_key)

        wanted = tuple(
            self._get_type_if_possible(restrictions[name]) for name in index_key
        )
        return iter(self._indexes[index_key].get(wanted, []))

    def _add_index(self, restr_keys):
        """
        A helper function for ``select``, which creates a new index for
        a given set of attributes (aka restriction keys).

        :raise ValueError: If a key is not an attribute of ``EdgeI``.
        """
        # Reject restriction names that are not part of the edge interface.
        for key in restr_keys:
            if not hasattr(EdgeI, key):
                raise ValueError("Bad restriction: %s" % key)

        # Register the (initially empty) index, then file every edge
        # already in the chart under its value tuple.
        index = {}
        self._indexes[restr_keys] = index
        for edge in self._edges:
            bucket_key = tuple(
                self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
            )
            index.setdefault(bucket_key, []).append(edge)

    def _register_with_indexes(self, edge):
        """
        A helper function for ``insert``, which registers the new
        edge with all existing indexes.
        """
        for restr_keys, index in self._indexes.items():
            bucket_key = tuple(
                self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
            )
            index.setdefault(bucket_key, []).append(edge)

    def _get_type_if_possible(self, item):
        """
        Helper function which returns the ``TYPE`` feature of the ``item``,
        if it exists, otherwise it returns the ``item`` itself.
        """
        if isinstance(item, dict) and TYPE in item:
            return item[TYPE]
        return item

    def parses(self, start, tree_class=Tree):
        """
        Yield the trees of every complete ``FeatureTreeEdge`` that spans
        all leaves and whose left-hand side has the same ``TYPE`` as,
        and unifies with, ``start``.
        """
        for edge in self.select(start=0, end=self._num_leaves):
            if not isinstance(edge, FeatureTreeEdge):
                continue
            if edge.lhs()[TYPE] != start[TYPE]:
                continue
            if not unify(edge.lhs(), start, rename_vars=True):
                continue
            yield from self.trees(edge, complete=True, tree_class=tree_class)
# ////////////////////////////////////////////////////////////
# Fundamental Rule
# ////////////////////////////////////////////////////////////
class FeatureFundamentalRule(FundamentalRule):
    r"""
    A specialized version of the fundamental rule that operates on
    nonterminals whose symbols are ``FeatStructNonterminal``s.  Rather
    than simply comparing the nonterminals for equality, they are
    unified.  Variable bindings from these unifications are collected
    and stored in the chart using a ``FeatureTreeEdge``.  When a
    complete edge is generated, these bindings are applied to all
    nonterminals in the edge.

    The fundamental rule states that:

    - ``[A -> alpha \* B1 beta][i:j]``
    - ``[B2 -> gamma \*][j:k]``

    licenses the edge:

    - ``[A -> alpha B3 \* beta][i:k]``

    assuming that B1 and B2 can be unified to generate B3.
    """

    def apply(self, chart, grammar, left_edge, right_edge):
        """
        Combine an incomplete ``left_edge`` with an adjacent complete
        ``right_edge``, yielding the new edge if the chart accepted it.
        """
        # Applicability checks, in the same order as a chained ``and``.
        if left_edge.end() != right_edge.start():
            return
        if not left_edge.is_incomplete():
            return
        if not right_edge.is_complete():
            return
        if not isinstance(left_edge, FeatureTreeEdge):
            return

        found = right_edge.lhs()
        nextsym = left_edge.nextsym()

        if not isinstance(right_edge, FeatureTreeEdge):
            # Non-feature right edge (e.g. a leaf): plain equality test.
            if nextsym != found:
                return
            # Work on a copy of the left edge's bindings.
            bindings = left_edge.bindings()
        else:
            if not is_nonterminal(nextsym):
                return
            # Cheap type comparison before attempting unification.
            if nextsym[TYPE] != found[TYPE]:
                return
            # Work on a copy of the left edge's bindings.
            bindings = left_edge.bindings()
            # Rename variables so that variables coming from the two
            # different productions cannot accidentally match.
            found = found.rename_variables(used_vars=left_edge.variables())
            # Unify B1 (the left edge's next symbol) with B2 (the right
            # edge's lhs); ``bindings`` is updated by the unification.
            if unify(nextsym, found, bindings, rename_vars=False) is None:
                return

        # Advance the dot over the matched symbol and record the
        # child pointers in the chart.
        advanced = left_edge.move_dot_forward(right_edge.end(), bindings)
        if chart.insert_with_backpointer(advanced, left_edge, right_edge):
            yield advanced
class FeatureSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
    """
    A specialized version of the completer / single edge fundamental rule
    that operates on nonterminals whose symbols are ``FeatStructNonterminal``.
    Rather than simply comparing the nonterminals for equality, they are
    unified.
    """

    # The two-edge rule that performs the actual unification.
    _fundamental_rule = FeatureFundamentalRule()

    def _apply_complete(self, chart, grammar, right_edge):
        """
        Pair the complete ``right_edge`` with every incomplete edge that
        ends where it starts and is selected by its left-hand side.
        """
        rule = self._fundamental_rule
        candidates = chart.select(
            end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
        )
        for left_edge in candidates:
            yield from rule.apply(chart, grammar, left_edge, right_edge)

    def _apply_incomplete(self, chart, grammar, left_edge):
        """
        Pair the incomplete ``left_edge`` with every complete edge that
        starts where it ends and is selected by its next symbol.
        """
        rule = self._fundamental_rule
        candidates = chart.select(
            start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
        )
        for right_edge in candidates:
            yield from rule.apply(chart, grammar, left_edge, right_edge)
# ////////////////////////////////////////////////////////////
# Top-Down Prediction
# ////////////////////////////////////////////////////////////
class FeatureTopDownInitRule(TopDownInitRule):
    """
    Top-down initialization that seeds the chart with a
    ``FeatureTreeEdge`` at position 0 for each production of the
    grammar's start symbol.
    """

    def apply(self, chart, grammar):
        """Yield each start-symbol edge that the chart newly accepted."""
        start_productions = grammar.productions(lhs=grammar.start())
        for production in start_productions:
            seed = FeatureTreeEdge.from_production(production, 0)
            if chart.insert(seed, ()):
                yield seed
class FeatureTopDownPredictRule(CachedTopDownPredictRule):
    r"""
    A specialized version of the (cached) top down predict rule that operates
    on nonterminals whose symbols are ``FeatStructNonterminal``.  Rather
    than simply comparing the nonterminals for equality, they are
    unified.

    The top down expand rule states that:

    - ``[A -> alpha \* B1 beta][i:j]``

    licenses the edge:

    - ``[B2 -> \* gamma][j:j]``

    for each grammar production ``B2 -> gamma``, assuming that B1
    and B2 can be unified.
    """

    def apply(self, chart, grammar, edge):
        """
        Predict new zero-width edges at the end of the incomplete
        ``edge`` for productions whose lhs unifies with its next symbol.
        """
        if edge.is_complete():
            return
        nextsym = edge.nextsym()
        index = edge.end()
        if not is_nonterminal(nextsym):
            return

        # If this rule has already been applied for the same (bound)
        # next symbol and end position, with the very same chart and
        # grammar objects, there is nothing new to add.
        nextsym_with_bindings = edge.next_with_bindings()
        cache_key = (nextsym_with_bindings, index)
        seen_chart, seen_grammar = self._done.get(cache_key, (None, None))
        if seen_chart is chart and seen_grammar is grammar:
            return

        for prod in grammar.productions(lhs=nextsym):
            # If the left corner of the predicted production is a
            # leaf, it must match the input token at this position.
            rhs = prod.rhs()
            if rhs and is_terminal(rhs[0]):
                if index >= chart.num_leaves():
                    continue
                if rhs[0] != chart.leaf(index):
                    continue

            # Variables are renamed during unification so that variables
            # from the two different productions cannot match.
            if unify(prod.lhs(), nextsym_with_bindings, rename_vars=True):
                predicted = FeatureTreeEdge.from_production(prod, index)
                if chart.insert(predicted, ()):
                    yield predicted

        # Record the fact that this rule has been applied.
        self._done[cache_key] = (chart, grammar)
# ////////////////////////////////////////////////////////////
# Bottom-Up Prediction
# ////////////////////////////////////////////////////////////
class FeatureBottomUpPredictRule(BottomUpPredictRule):
    """
    Bottom-up prediction that, for a complete edge, adds a zero-width
    ``FeatureTreeEdge`` for each production returned by
    ``grammar.productions(rhs=edge.lhs())``.
    """

    def apply(self, chart, grammar, edge):
        """Yield each predicted edge that the chart newly accepted."""
        if edge.is_incomplete():
            return

        is_feature_edge = isinstance(edge, FeatureTreeEdge)
        for prod in grammar.productions(rhs=edge.lhs()):
            # A feature edge can only fill a nonterminal left corner.
            if is_feature_edge and not is_nonterminal(prod.rhs()[0]):
                continue

            predicted = FeatureTreeEdge.from_production(prod, edge.start())
            if chart.insert(predicted, ()):
                yield predicted
class FeatureBottomUpPredictCombineRule(BottomUpPredictCombineRule):
    """
    Bottom-up left-corner prediction: for a complete edge, predict each
    production returned by ``grammar.productions(rhs=edge.lhs())`` and
    immediately advance its dot over that edge.
    """

    def apply(self, chart, grammar, edge):
        """Yield each predicted-and-combined edge newly accepted by the chart."""
        if edge.is_incomplete():
            return

        found = edge.lhs()
        is_feature_edge = isinstance(edge, FeatureTreeEdge)
        for prod in grammar.productions(rhs=found):
            bindings = {}
            if is_feature_edge:
                left_corner = prod.rhs()[0]
                if not is_nonterminal(left_corner):
                    continue

                # Rename variables so that variables from the two
                # different productions cannot match.
                # NOTE(review): ``found`` is rebound here, so the renamed
                # value carries over into later iterations of this loop.
                prod_vars = find_variables(
                    (prod.lhs(),) + prod.rhs(), fs_class=FeatStruct
                )
                found = found.rename_variables(used_vars=prod_vars)

                # ``bindings`` is filled in by the unification.
                if unify(left_corner, found, bindings, rename_vars=False) is None:
                    continue

            predicted = FeatureTreeEdge.from_production(prod, edge.start())
            combined = predicted.move_dot_forward(edge.end(), bindings)
            if chart.insert(combined, (edge,)):
                yield combined
class FeatureEmptyPredictRule(EmptyPredictRule):
    """
    Adds a zero-width ``FeatureTreeEdge`` for every empty production of
    the grammar at every position from 0 to ``chart.num_leaves()``.
    """

    def apply(self, chart, grammar):
        """Yield each empty-production edge that the chart newly accepted."""
        for empty_prod in grammar.productions(empty=True):
            # Positions run from 0 through num_leaves inclusive.
            for position in range(chart.num_leaves() + 1):
                candidate = FeatureTreeEdge.from_production(empty_prod, position)
                if chart.insert(candidate, ()):
                    yield candidate
# ////////////////////////////////////////////////////////////
# Feature Chart Parser
# ////////////////////////////////////////////////////////////
# Rule sets ("strategies") for feature-based chart parsing.  Each list
# gets its own rule instances.

# Top-down strategy.
TD_FEATURE_STRATEGY = [
    LeafInitRule(),
    FeatureTopDownInitRule(),
    FeatureTopDownPredictRule(),
    FeatureSingleEdgeFundamentalRule(),
]

# Bottom-up strategy.
BU_FEATURE_STRATEGY = [
    LeafInitRule(),
    FeatureEmptyPredictRule(),
    FeatureBottomUpPredictRule(),
    FeatureSingleEdgeFundamentalRule(),
]

# Bottom-up left-corner strategy (prediction and combination in one step).
BU_LC_FEATURE_STRATEGY = [
    LeafInitRule(),
    FeatureEmptyPredictRule(),
    FeatureBottomUpPredictCombineRule(),
    FeatureSingleEdgeFundamentalRule(),
]
class FeatureChartParser(ChartParser):
    """
    A ``ChartParser`` preconfigured for feature grammars: it defaults to
    ``BU_LC_FEATURE_STRATEGY``, a trace chart width of 20 and
    ``FeatureChart`` as the chart class.
    """

    def __init__(
        self,
        grammar,
        strategy=BU_LC_FEATURE_STRATEGY,
        trace_chart_width=20,
        chart_class=FeatureChart,
        **parser_args,
    ):
        """
        :param grammar: The grammar to parse with.
        :param strategy: The list of chart rules to apply.
        :param trace_chart_width: Passed through to ``ChartParser``.
        :param chart_class: The chart class to use.
        :param parser_args: Any further keyword arguments, forwarded
            unchanged to ``ChartParser.__init__``.
        """
        # Delegate to the base constructor with the feature defaults.
        ChartParser.__init__(
            self,
            grammar,
            strategy=strategy,
            trace_chart_width=trace_chart_width,
            chart_class=chart_class,
            **parser_args,
        )
class FeatureTopDownChartParser(FeatureChartParser):
    """A ``FeatureChartParser`` fixed to ``TD_FEATURE_STRATEGY``."""

    def __init__(self, grammar, **parser_args):
        FeatureChartParser.__init__(
            self, grammar, strategy=TD_FEATURE_STRATEGY, **parser_args
        )
class FeatureBottomUpChartParser(FeatureChartParser):
    """A ``FeatureChartParser`` fixed to ``BU_FEATURE_STRATEGY``."""

    def __init__(self, grammar, **parser_args):
        FeatureChartParser.__init__(
            self, grammar, strategy=BU_FEATURE_STRATEGY, **parser_args
        )
class FeatureBottomUpLeftCornerChartParser(FeatureChartParser):
    """A ``FeatureChartParser`` fixed to ``BU_LC_FEATURE_STRATEGY``."""

    def __init__(self, grammar, **parser_args):
        FeatureChartParser.__init__(
            self, grammar, strategy=BU_LC_FEATURE_STRATEGY, **parser_args
        )
# ////////////////////////////////////////////////////////////
# Instantiate Variable Chart
# ////////////////////////////////////////////////////////////
class InstantiateVarsChart(FeatureChart):
    """
    A specialized chart that 'instantiates' variables whose names
    start with '@', by replacing them with unique new variables.
    In particular, whenever a complete edge is added to the chart, any
    variables in the edge's ``lhs`` whose names start with '@' will be
    replaced by unique new ``Variable``.
    """

    def __init__(self, tokens):
        FeatureChart.__init__(self, tokens)

    def initialize(self):
        """Reset the set of instantiated edges, then initialize the chart."""
        self._instantiated = set()
        FeatureChart.initialize(self)

    def insert(self, edge, child_pointer_list):
        """
        Insert ``edge`` after instantiating it; an edge already recorded
        as instantiated is refused and ``False`` is returned.
        """
        if edge in self._instantiated:
            return False
        self.instantiate_edge(edge)
        return FeatureChart.insert(self, edge, child_pointer_list)

    def instantiate_edge(self, edge):
        """
        If the edge is a ``FeatureTreeEdge``, and it is complete,
        then instantiate all variables whose names start with '@',
        by replacing them with unique new variables.

        Note that instantiation is done in-place, since the
        parsing algorithms might already hold a reference to
        the edge for future use.
        """
        # Leave the edge alone if it is not a feature edge, is not
        # complete, or is already in the chart.
        if (
            not isinstance(edge, FeatureTreeEdge)
            or not edge.is_complete()
            or edge in self._edge_to_cpls
        ):
            return

        # Work out which variables need replacing; nothing to do if none.
        replacements = self.inst_vars(edge)
        if replacements:
            # Record the edge first, then rewrite its lhs in place.
            self._instantiated.add(edge)
            edge._lhs = edge.lhs().substitute_bindings(replacements)

    def inst_vars(self, edge):
        """
        Return a dict mapping each variable of ``edge.lhs()`` whose name
        starts with '@' to a new ``logic.unique_variable()``.
        """
        fresh = {}
        for var in edge.lhs().variables():
            if var.name.startswith("@"):
                fresh[var] = logic.unique_variable()
        return fresh
# ////////////////////////////////////////////////////////////
# Demo
# ////////////////////////////////////////////////////////////
def demo_grammar():
    """
    Build the small demonstration ``FeatureGrammar`` used by ``demo``.

    :return: The grammar parsed from the inline grammar text.
    """
    from nltk.grammar import FeatureGrammar

    grammar_text = """
S -> NP VP
PP -> Prep NP
NP -> NP PP
VP -> VP PP
VP -> Verb NP
VP -> Verb
NP -> Det[pl=?x] Noun[pl=?x]
NP -> "John"
NP -> "I"
Det -> "the"
Det -> "my"
Det[-pl] -> "a"
Noun[-pl] -> "dog"
Noun[-pl] -> "cookie"
Verb -> "ate"
Verb -> "saw"
Prep -> "with"
Prep -> "under"
"""
    return FeatureGrammar.fromstring(grammar_text)
def demo(
    print_times=True,
    print_grammar=True,
    print_trees=True,
    print_sentence=True,
    trace=1,
    parser=FeatureChartParser,
    sent="I saw John with a dog with my cookie",
):
    """
    Parse a sample sentence with the demo grammar and print the results.

    :param print_times: If true, print the elapsed parsing time.
    :param print_grammar: If true, print the grammar before parsing.
    :param print_trees: If true, print every parse tree; otherwise
        print only the number of trees found.
    :param print_sentence: If true, print the sentence being parsed.
    :param trace: Trace level passed to the parser constructor.
    :param parser: The parser class to instantiate; it is called as
        ``parser(grammar, trace=trace)``.
    :param sent: The sentence to parse; it is split on whitespace.
    """
    # The unused local imports of ``sys`` and ``time`` are gone; timing
    # relies on the module-level ``perf_counter`` import.
    print()
    grammar = demo_grammar()
    if print_grammar:
        print(grammar)
        print()
    print("*", parser.__name__)
    if print_sentence:
        print("Sentence:", sent)

    tokens = sent.split()
    # The timed region covers parser construction, chart parsing and
    # tree extraction.
    t = perf_counter()
    cp = parser(grammar, trace=trace)
    chart = cp.chart_parse(tokens)
    trees = list(chart.parses(grammar.start()))
    if print_times:
        print("Time: %s" % (perf_counter() - t))

    if print_trees:
        for tree in trees:
            print(tree)
    else:
        print("Nr trees:", len(trees))
def run_profile(stats_path="/tmp/profile.out"):
    """
    Profile a single run of ``demo`` and print the top 60 entries twice:
    first sorted by internal time, then by cumulative time.

    :param stats_path: File the raw profile data is written to and read
        back from.  The default keeps the original location; pass another
        path on platforms without ``/tmp``.
    """
    import profile
    import pstats

    # ``demo`` is referenced by name in the profiled statement, so it must
    # be resolvable in the namespace ``profile.run`` executes in.
    profile.run("for i in range(1): demo()", stats_path)

    stats = pstats.Stats(stats_path)
    # ``strip_dirs`` modifies the Stats object in place, so one call is
    # enough for both reports.
    stats.strip_dirs()
    stats.sort_stats("time", "cum").print_stats(60)
    stats.sort_stats("cum", "time").print_stats(60)
if __name__ == "__main__":
    from nltk.data import load

    # Run the built-in demo first.
    demo()
    print()

    # Then parse a short sentence with one of the book grammars,
    # tracing at level 2.
    grammar = load("grammars/book_grammars/feat0.fcfg")
    cp = FeatureChartParser(grammar, trace=2)
    sent = "Kim likes children"
    for tree in cp.parse(sent.split()):
        print(tree)