from parso.python import tree
from parso.python.token import PythonTokenTypes
from parso.parser import BaseParser


NAME = PythonTokenTypes.NAME
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT


class Parser(BaseParser):
    """
    This class is used to parse a Python file and divide it into a class
    structure of different scopes.

    :param pgen_grammar: The grammar object of pgen2, loaded by load_grammar.
    """

    node_map = {
        'expr_stmt': tree.ExprStmt,
        'classdef': tree.Class,
        'funcdef': tree.Function,
        'file_input': tree.Module,
        'import_name': tree.ImportName,
        'import_from': tree.ImportFrom,
        'break_stmt': tree.KeywordStatement,
        'continue_stmt': tree.KeywordStatement,
        'return_stmt': tree.ReturnStmt,
        'raise_stmt': tree.KeywordStatement,
        'yield_expr': tree.YieldExpr,
        'del_stmt': tree.KeywordStatement,
        'pass_stmt': tree.KeywordStatement,
        'global_stmt': tree.GlobalStmt,
        'nonlocal_stmt': tree.KeywordStatement,
        'print_stmt': tree.KeywordStatement,
        'assert_stmt': tree.AssertStmt,
        'if_stmt': tree.IfStmt,
        'with_stmt': tree.WithStmt,
        'for_stmt': tree.ForStmt,
        'while_stmt': tree.WhileStmt,
        'try_stmt': tree.TryStmt,
        'sync_comp_for': tree.SyncCompFor,
        # Not sure if this is the best idea, but IMO it's the easiest way to
        # avoid extreme amounts of work around the subtle difference of the
        # Python 2/3 grammar in list comprehensions.
        'decorator': tree.Decorator,
        'lambdef': tree.Lambda,
        'lambdef_nocond': tree.Lambda,
        'namedexpr_test': tree.NamedExpr,
    }
    default_node = tree.PythonNode

    # Names/Keywords are handled separately
    _leaf_map = {
        PythonTokenTypes.STRING: tree.String,
        PythonTokenTypes.NUMBER: tree.Number,
        PythonTokenTypes.NEWLINE: tree.Newline,
        PythonTokenTypes.ENDMARKER: tree.EndMarker,
        PythonTokenTypes.FSTRING_STRING: tree.FStringString,
        PythonTokenTypes.FSTRING_START: tree.FStringStart,
        PythonTokenTypes.FSTRING_END: tree.FStringEnd,
    }

    def __init__(self, pgen_grammar, error_recovery=True, start_nonterminal='file_input'):
        super().__init__(pgen_grammar, start_nonterminal,
                         error_recovery=error_recovery)

        self.syntax_errors = []
        self._omit_dedent_list = []
        self._indent_counter = 0

    def parse(self, tokens):
        if self._error_recovery:
            if self._start_nonterminal != 'file_input':
                raise NotImplementedError

            tokens = self._recovery_tokenize(tokens)

        return super().parse(tokens)

    def convert_node(self, nonterminal, children):
        """
        Convert raw node information to a PythonBaseNode instance.

        This is passed to the parser driver, which calls it whenever a
        reduction of a grammar rule produces a new complete node, so that the
        tree is built strictly bottom-up.
        """
        try:
            node = self.node_map[nonterminal](children)
        except KeyError:
            if nonterminal == 'suite':
                # We don't want the INDENT/DEDENT leaves in our parser tree.
                # They are virtual leaves rather than real ones, so they have
                # pseudo start/end positions and no prefixes. Just drop them.
                children = [children[0]] + children[2:-1]
            node = self.default_node(nonterminal, children)
        return node
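
    # For example: reducing an 'if_stmt' rule calls
    # convert_node('if_stmt', children) and yields tree.IfStmt(children),
    # while a rule missing from node_map, such as 'arith_expr', falls through
    # to tree.PythonNode('arith_expr', children).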

    def convert_leaf(self, type, value, prefix, start_pos):
        # print('leaf', repr(value), token.tok_name[type])
        if type == NAME:
            if value in self._pgen_grammar.reserved_syntax_strings:
                return tree.Keyword(value, start_pos, prefix)
            else:
                return tree.Name(value, start_pos, prefix)

        return self._leaf_map.get(type, tree.Operator)(value, start_pos, prefix)
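
    # For example: a NAME token whose value is a reserved syntax string
    # ('if', 'return', ...) becomes tree.Keyword, any other NAME becomes
    # tree.Name, and token types missing from _leaf_map (e.g. operators)
    # fall back to tree.Operator.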

    def error_recovery(self, token):
        tos_nodes = self.stack[-1].nodes
        if tos_nodes:
            last_leaf = tos_nodes[-1].get_last_leaf()
        else:
            last_leaf = None

        if self._start_nonterminal == 'file_input' and \
                (token.type == PythonTokenTypes.ENDMARKER
                 or token.type == DEDENT and not last_leaf.value.endswith('\n')
                 and not last_leaf.value.endswith('\r')):
            # In Python, statements need to end with a newline. But since it's
            # possible (and valid in Python) that there's no newline at the
            # end of a file, we have to recover even if the user doesn't want
            # error recovery.
            if self.stack[-1].dfa.from_rule == 'simple_stmt':
                try:
                    plan = self.stack[-1].dfa.transitions[PythonTokenTypes.NEWLINE]
                except KeyError:
                    pass
                else:
                    if plan.next_dfa.is_final and not plan.dfa_pushes:
                        # We are ignoring here that the newline would be
                        # required for a simple_stmt.
                        self.stack[-1].dfa = plan.next_dfa
                        self._add_token(token)
                        return
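
        # For example (a sketch): a file ending in `x = 1` with no trailing
        # newline delivers ENDMARKER while the simple_stmt DFA still expects a
        # NEWLINE; the branch above advances the DFA as if the newline were
        # present and then shifts the token normally.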

        if not self._error_recovery:
            return super().error_recovery(token)

        def current_suite(stack):
            # For now, if we detect an error, just discard everything that is
            # not a suite or file_input.
            for until_index, stack_node in reversed(list(enumerate(stack))):
                # `suite` can sometimes be only simple_stmt, not stmt.
                if stack_node.nonterminal == 'file_input':
                    break
                elif stack_node.nonterminal == 'suite':
                    # If the suite so far contains only a newline, we don't
                    # want to do error recovery here; in all other cases we do.
                    if len(stack_node.nodes) != 1:
                        break
            return until_index

        until_index = current_suite(self.stack)

        if self._stack_removal(until_index + 1):
            self._add_token(token)
        else:
            typ, value, start_pos, prefix = token
            if typ == INDENT:
                # For every deleted INDENT we have to delete a DEDENT as well.
                # Otherwise the parser will get into trouble and DEDENT too early.
                self._omit_dedent_list.append(self._indent_counter)

            error_leaf = tree.PythonErrorLeaf(typ.name, value, start_pos, prefix)
            self.stack[-1].nodes.append(error_leaf)

        tos = self.stack[-1]
        if tos.nonterminal == 'suite':
            # We need at least one statement in the suite. This can happen as
            # a result of the error recovery above.
            try:
                tos.dfa = tos.dfa.arcs['stmt']
            except KeyError:
                # We're already in a final state.
                pass
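
    # Observable effect of this error recovery (a hedged sketch, via parso's
    # public load_grammar() entry point):
    #
    #   import parso
    #   module = parso.load_grammar().parse('def f(:\n    pass\n')
    #   # unparsable pieces stay in the tree as PythonErrorNode /
    #   # PythonErrorLeaf instances instead of raising an exception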

    def _stack_removal(self, start_index):
        all_nodes = [node for stack_node in self.stack[start_index:] for node in stack_node.nodes]

        if all_nodes:
            node = tree.PythonErrorNode(all_nodes)
            self.stack[start_index - 1].nodes.append(node)

        self.stack[start_index:] = []
        return bool(all_nodes)
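
    # For example: with a stack like [file_input, if_stmt, suite] and
    # start_index=1, the nodes collected so far under if_stmt and suite are
    # wrapped in a single PythonErrorNode, appended to file_input's frame,
    # and both removed frames are dropped from the stack.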

    def _recovery_tokenize(self, tokens):
        for token in tokens:
            typ = token[0]
            if typ == DEDENT:
                # We need to count indents, because if we just omit any DEDENT,
                # we might omit them in the wrong place.
                o = self._omit_dedent_list
                if o and o[-1] == self._indent_counter:
                    o.pop()
                    self._indent_counter -= 1
                    continue

                self._indent_counter -= 1
            elif typ == INDENT:
                self._indent_counter += 1
            yield token
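
# A note on the bookkeeping above (hedged summary): when error recovery drops
# an INDENT, error_recovery() records the current _indent_counter in
# _omit_dedent_list; _recovery_tokenize() later skips exactly one DEDENT once
# the counter is back at that recorded depth, so the INDENT/DEDENT pairs the
# parser actually sees stay balanced.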