|
|
|
|
|
|
|
"""This module defines the data structures used to represent a grammar. |
|
|
|
These are a bit arcane because they are derived from the data |
|
structures used by Python's 'pgen' parser generator. |
|
|
|
There's also a table here mapping operators to their names in the |
|
token module; the Python tokenize module reports all operators as the |
|
fallback token code OP, but the parser needs the actual token code. |
|
|
|
""" |
|
|
|
|
|
import pickle |
|
|
|
|
|
from . import token |
|
|
|
|
|
class Grammar(object):
    """Container for the parsing tables generated by pgen.

    A fresh instance holds empty tables.  This class does not fill them
    in itself: subclasses do that (see the conv and pgen modules), or
    the tables are restored with load()/loads() from a pickle written
    earlier by dump().  Loading the pickle is much faster than the
    other ways offered by subclasses.  The parsing engine implemented
    by parse.py reads the instance variables directly.  report() prints
    a readable rendering of the tables to stdout, for debugging.

    Instance variables:

    symbol2number -- dict: symbol name -> symbol number.  Symbol numbers
                     are always 256 or higher so they cannot be confused
                     with token numbers, which lie in 0..255 inclusive.

    number2symbol -- dict: symbol number -> symbol name; the inverse of
                     symbol2number.

    states        -- list of DFAs (despite the name).  A DFA is a list
                     of states, a state is a list of arcs, and an arc is
                     an (i, j) pair with i a label and j a state number.
                     A DFA's number is its index in this list.  A final
                     state carries a special arc (0, j) where j is the
                     state's own number.

    dfas          -- dict: symbol number -> (DFA, first).  DFA is an
                     item of the states list; first is the set of tokens
                     that can begin the rule, stored as a dict whose
                     values are always 1.

    labels        -- list of (x, y) pairs.  x is a token number or a
                     symbol number; y is None or a keyword string.  A
                     label's number is its index in this list, and label
                     numbers are what mark the arcs in the DFAs.

    start         -- number of the grammar's start symbol.

    keywords      -- dict: keyword string -> arc label.

    tokens        -- dict: token number -> arc label.

    symbol2label  -- dict, created empty here and never filled in by
                     this class.  NOTE(review): presumably maps symbol
                     names to label numbers -- confirm against the
                     subclasses that populate it.

    """

    def __init__(self):
        # Attribute creation order is kept stable on purpose: dump()
        # pickles the instance dict, so this order is what gets written.
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        # Label 0 always exists and is the "EMPTY" label.
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        self.start = 256

    def dump(self, filename):
        """Write every instance variable to *filename* as one pickled dict."""
        with open(filename, "wb") as stream:
            pickle.dump(vars(self), stream, pickle.HIGHEST_PROTOCOL)

    def load(self, filename):
        """Restore the instance variables from the pickle file *filename*.

        Unpickling can execute arbitrary code, so only load files that
        come from a trusted source.
        """
        with open(filename, "rb") as stream:
            tables = pickle.load(stream)
        # Merge rather than replace: attributes absent from the pickle
        # keep whatever value they currently have.
        vars(self).update(tables)

    def loads(self, pkl):
        """Restore the instance variables from the pickle bytes *pkl*.

        Unpickling can execute arbitrary code, so only pass bytes that
        come from a trusted source.
        """
        tables = pickle.loads(pkl)
        vars(self).update(tables)

    def copy(self):
        """Return a new grammar of the same class with copied tables.

        Each dict and list is copied one level deep only; the objects
        stored inside them are shared with the original.
        """
        clone = self.__class__()
        clone.symbol2number = self.symbol2number.copy()
        clone.number2symbol = self.number2symbol.copy()
        clone.dfas = self.dfas.copy()
        clone.keywords = self.keywords.copy()
        clone.tokens = self.tokens.copy()
        clone.symbol2label = self.symbol2label.copy()
        clone.labels = self.labels[:]
        clone.states = self.states[:]
        clone.start = self.start
        return clone

    def report(self):
        """Pretty-print the main tables to standard output, for debugging."""
        from pprint import pprint

        sections = (
            ("s2n", self.symbol2number),
            ("n2s", self.number2symbol),
            ("states", self.states),
            ("dfas", self.dfas),
            ("labels", self.labels),
        )
        for heading, table in sections:
            print(heading)
            pprint(table)
        print("start", self.start)
|
|
|
|
|
|
|
|
|
# Operator table: each non-empty line is "<operator text> <token name>",
# where the name is looked up as an attribute of the token module.
# Note that both "!=" and "<>" are listed under NOTEQUAL.
opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
:= COLONEQUAL
"""

# Map each operator string to the value of the matching token-module
# attribute, so an operator's text can be turned into its token code.
opmap = {}
for line in opmap_raw.splitlines():
    if not line:
        # The literal opens with a newline, so the first line is empty.
        continue
    op, name = line.split()
    opmap[op] = getattr(token, name)
|
|