Spaces:
Running
Running
Update parser.py
Browse files
parser.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
# parser.py
|
2 |
import ast
|
3 |
|
4 |
-
def get_category(node):
|
5 |
-
"""Determine the category of an AST node or variable context."""
|
6 |
if isinstance(node, (ast.Import, ast.ImportFrom)):
|
7 |
return 'import'
|
8 |
elif isinstance(node, ast.FunctionDef) or isinstance(node, ast.AsyncFunctionDef):
|
@@ -24,32 +24,19 @@ def get_category(node):
|
|
24 |
elif isinstance(node, ast.ExceptHandler):
|
25 |
return 'except'
|
26 |
elif isinstance(node, (ast.Assign, ast.AnnAssign, ast.AugAssign)):
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
def get_variable_role(node, parent):
|
32 |
-
"""Determine the role of a variable (input, assigned, returned) based on context."""
|
33 |
-
if isinstance(parent, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
34 |
-
if isinstance(node, ast.arg):
|
35 |
return 'input_variable'
|
36 |
-
|
|
|
37 |
return 'returned_variable'
|
38 |
-
|
39 |
-
|
40 |
-
# Handle different target structures
|
41 |
-
if isinstance(parent, ast.Assign):
|
42 |
-
for target in parent.targets:
|
43 |
-
if isinstance(target, ast.Name) and target.id == node.id:
|
44 |
-
return 'assigned_variable'
|
45 |
-
elif isinstance(parent, (ast.AnnAssign, ast.AugAssign)):
|
46 |
-
target = parent.target
|
47 |
-
if isinstance(target, ast.Name) and target.id == node.id:
|
48 |
-
return 'assigned_variable'
|
49 |
-
return None
|
50 |
|
51 |
-
def create_vector(category, level, location, total_lines, parent_path
|
52 |
-
"""Create a vector optimized for role similarity,
|
53 |
category_map = {
|
54 |
'import': 1, 'function': 2, 'async_function': 3, 'class': 4,
|
55 |
'if': 5, 'while': 6, 'for': 7, 'try': 8, 'expression': 9, 'spacer': 10,
|
@@ -63,16 +50,7 @@ def create_vector(category, level, location, total_lines, parent_path, variable_
|
|
63 |
parent_depth = len(parent_path)
|
64 |
parent_weight = sum(category_map.get(parent.split('[')[0].lower(), 0) * (1 / (i + 1))
|
65 |
for i, parent in enumerate(parent_path)) / max(1, len(category_map))
|
66 |
-
|
67 |
-
# Extend vector with variable-specific info if applicable
|
68 |
-
variable_role_id = 0
|
69 |
-
variable_name = ''
|
70 |
-
if variable_info:
|
71 |
-
role_map = {'input_variable': 1, 'assigned_variable': 2, 'returned_variable': 3}
|
72 |
-
variable_role_id = role_map.get(variable_info['role'], 0)
|
73 |
-
variable_name = variable_info['name'][:10] # Truncate for brevity, hash if needed
|
74 |
-
|
75 |
-
return [category_id, level, center_pos, span, parent_depth, parent_weight, variable_role_id, hash(variable_name) % 1000 / 1000]
|
76 |
|
77 |
def is_blank_or_comment(line):
|
78 |
"""Check if a line is blank or a comment."""
|
@@ -142,13 +120,12 @@ def parse_node(node, lines, prev_end, level=0, total_lines=None, parent_path=Non
|
|
142 |
if var_start not in processed_lines:
|
143 |
counters['input_variable'] += 1
|
144 |
var_node_id = f"InputVariable[{counters['input_variable']}]"
|
145 |
-
var_info = {'role': 'input_variable', 'name': arg.arg}
|
146 |
parts.append({
|
147 |
'category': 'input_variable',
|
148 |
'source': f" {arg.arg},", # Indented as part of function
|
149 |
'location': (var_start, var_start),
|
150 |
'level': level + 1,
|
151 |
-
'vector': create_vector('input_variable', level + 1, (var_start, var_start), total_lines, current_path
|
152 |
'parent_path': f"{current_path[0]} -> {var_node_id}",
|
153 |
'node_id': var_node_id
|
154 |
})
|
@@ -216,42 +193,42 @@ def parse_node(node, lines, prev_end, level=0, total_lines=None, parent_path=Non
|
|
216 |
nested_prev_end = max(nested_prev_end, child_parts[-1]['location'][1] if child_parts else child_start)
|
217 |
else:
|
218 |
# Handle assignments and returns for variable detection
|
219 |
-
if isinstance(child, ast.Assign):
|
220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
if isinstance(target, ast.Name):
|
222 |
var_start = child.lineno
|
223 |
if var_start not in processed_lines and not is_blank_or_comment(lines[var_start - 1]):
|
224 |
counters['assigned_variable'] += 1
|
225 |
var_node_id = f"AssignedVariable[{counters['assigned_variable']}]"
|
226 |
-
var_info = {'role': 'assigned_variable', 'name': target.id}
|
227 |
parts.append({
|
228 |
'category': 'assigned_variable',
|
229 |
'source': lines[var_start - 1],
|
230 |
'location': (var_start, var_start),
|
231 |
'level': level + 1,
|
232 |
-
'vector': create_vector('assigned_variable', level + 1, (var_start, var_start), total_lines, current_path
|
233 |
'parent_path': f"{current_path[0]} -> {var_node_id}",
|
234 |
'node_id': var_node_id
|
235 |
})
|
236 |
processed_lines.add(var_start)
|
237 |
-
elif isinstance(child, ast.AnnAssign) or isinstance(child, ast.AugAssign):
|
238 |
-
target = child.target
|
239 |
-
if isinstance(target, ast.Name):
|
240 |
-
var_start = child.lineno
|
241 |
-
if var_start not in processed_lines and not is_blank_or_comment(lines[var_start - 1]):
|
242 |
-
counters['assigned_variable'] += 1
|
243 |
-
var_node_id = f"AssignedVariable[{counters['assigned_variable']}]"
|
244 |
-
var_info = {'role': 'assigned_variable', 'name': target.id}
|
245 |
-
parts.append({
|
246 |
-
'category': 'assigned_variable',
|
247 |
-
'source': lines[var_start - 1],
|
248 |
-
'location': (var_start, var_start),
|
249 |
-
'level': level + 1,
|
250 |
-
'vector': create_vector('assigned_variable', level + 1, (var_start, var_start), total_lines, current_path, var_info),
|
251 |
-
'parent_path': f"{current_path[0]} -> {var_node_id}",
|
252 |
-
'node_id': var_node_id
|
253 |
-
})
|
254 |
-
processed_lines.add(var_start)
|
255 |
elif isinstance(child, ast.Return):
|
256 |
for value in ast.walk(child):
|
257 |
if isinstance(value, ast.Name):
|
@@ -259,13 +236,12 @@ def parse_node(node, lines, prev_end, level=0, total_lines=None, parent_path=Non
|
|
259 |
if var_start not in processed_lines and not is_blank_or_comment(lines[var_start - 1]):
|
260 |
counters['returned_variable'] += 1
|
261 |
var_node_id = f"ReturnedVariable[{counters['returned_variable']}]"
|
262 |
-
var_info = {'role': 'returned_variable', 'name': value.id}
|
263 |
parts.append({
|
264 |
'category': 'returned_variable',
|
265 |
'source': lines[var_start - 1],
|
266 |
'location': (var_start, var_start),
|
267 |
'level': level + 1,
|
268 |
-
'vector': create_vector('returned_variable', level + 1, (var_start, var_start), total_lines, current_path
|
269 |
'parent_path': f"{current_path[0]} -> {var_node_id}",
|
270 |
'node_id': var_node_id
|
271 |
})
|
@@ -291,7 +267,7 @@ def parse_python_code(code):
|
|
291 |
try:
|
292 |
tree = ast.parse(code)
|
293 |
except SyntaxError:
|
294 |
-
return [{'category': 'error', 'source': 'Invalid Python code', 'location': (1, 1), 'level': 0, 'vector': [0, 0, 1.0, 0.0, 0, 0
|
295 |
|
296 |
parts = []
|
297 |
prev_end = 0
|
|
|
1 |
# parser.py
|
2 |
import ast
|
3 |
|
4 |
+
def get_category(node, parent=None):
|
5 |
+
"""Determine the category of an AST node or variable context, including variable roles."""
|
6 |
if isinstance(node, (ast.Import, ast.ImportFrom)):
|
7 |
return 'import'
|
8 |
elif isinstance(node, ast.FunctionDef) or isinstance(node, ast.AsyncFunctionDef):
|
|
|
24 |
elif isinstance(node, ast.ExceptHandler):
|
25 |
return 'except'
|
26 |
elif isinstance(node, (ast.Assign, ast.AnnAssign, ast.AugAssign)):
|
27 |
+
if parent and isinstance(parent, (ast.FunctionDef, ast.AsyncFunctionDef, ast.If, ast.Try, ast.While, ast.For)):
|
28 |
+
return 'assigned_variable'
|
29 |
+
elif isinstance(node, ast.arg): # Input variables in function definitions
|
30 |
+
if parent and isinstance(parent, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
|
|
|
|
|
|
|
|
31 |
return 'input_variable'
|
32 |
+
elif isinstance(node, ast.Name): # Returned variables in return statements
|
33 |
+
if parent and isinstance(parent, ast.Return):
|
34 |
return 'returned_variable'
|
35 |
+
else:
|
36 |
+
return 'other'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
+
def create_vector(category, level, location, total_lines, parent_path):
|
39 |
+
"""Create a 6D vector optimized for role similarity, integrating variable roles into category_id."""
|
40 |
category_map = {
|
41 |
'import': 1, 'function': 2, 'async_function': 3, 'class': 4,
|
42 |
'if': 5, 'while': 6, 'for': 7, 'try': 8, 'expression': 9, 'spacer': 10,
|
|
|
50 |
parent_depth = len(parent_path)
|
51 |
parent_weight = sum(category_map.get(parent.split('[')[0].lower(), 0) * (1 / (i + 1))
|
52 |
for i, parent in enumerate(parent_path)) / max(1, len(category_map))
|
53 |
+
return [category_id, level, center_pos, span, parent_depth, parent_weight]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
def is_blank_or_comment(line):
|
56 |
"""Check if a line is blank or a comment."""
|
|
|
120 |
if var_start not in processed_lines:
|
121 |
counters['input_variable'] += 1
|
122 |
var_node_id = f"InputVariable[{counters['input_variable']}]"
|
|
|
123 |
parts.append({
|
124 |
'category': 'input_variable',
|
125 |
'source': f" {arg.arg},", # Indented as part of function
|
126 |
'location': (var_start, var_start),
|
127 |
'level': level + 1,
|
128 |
+
'vector': create_vector('input_variable', level + 1, (var_start, var_start), total_lines, current_path),
|
129 |
'parent_path': f"{current_path[0]} -> {var_node_id}",
|
130 |
'node_id': var_node_id
|
131 |
})
|
|
|
193 |
nested_prev_end = max(nested_prev_end, child_parts[-1]['location'][1] if child_parts else child_start)
|
194 |
else:
|
195 |
# Handle assignments and returns for variable detection
|
196 |
+
if isinstance(child, (ast.Assign, ast.AnnAssign, ast.AugAssign)):
|
197 |
+
# Handle different target structures
|
198 |
+
if isinstance(child, ast.Assign):
|
199 |
+
for target in child.targets:
|
200 |
+
if isinstance(target, ast.Name):
|
201 |
+
var_start = child.lineno
|
202 |
+
if var_start not in processed_lines and not is_blank_or_comment(lines[var_start - 1]):
|
203 |
+
counters['assigned_variable'] += 1
|
204 |
+
var_node_id = f"AssignedVariable[{counters['assigned_variable']}]"
|
205 |
+
parts.append({
|
206 |
+
'category': 'assigned_variable',
|
207 |
+
'source': lines[var_start - 1],
|
208 |
+
'location': (var_start, var_start),
|
209 |
+
'level': level + 1,
|
210 |
+
'vector': create_vector('assigned_variable', level + 1, (var_start, var_start), total_lines, current_path),
|
211 |
+
'parent_path': f"{current_path[0]} -> {var_node_id}",
|
212 |
+
'node_id': var_node_id
|
213 |
+
})
|
214 |
+
processed_lines.add(var_start)
|
215 |
+
else: # AnnAssign or AugAssign
|
216 |
+
target = child.target
|
217 |
if isinstance(target, ast.Name):
|
218 |
var_start = child.lineno
|
219 |
if var_start not in processed_lines and not is_blank_or_comment(lines[var_start - 1]):
|
220 |
counters['assigned_variable'] += 1
|
221 |
var_node_id = f"AssignedVariable[{counters['assigned_variable']}]"
|
|
|
222 |
parts.append({
|
223 |
'category': 'assigned_variable',
|
224 |
'source': lines[var_start - 1],
|
225 |
'location': (var_start, var_start),
|
226 |
'level': level + 1,
|
227 |
+
'vector': create_vector('assigned_variable', level + 1, (var_start, var_start), total_lines, current_path),
|
228 |
'parent_path': f"{current_path[0]} -> {var_node_id}",
|
229 |
'node_id': var_node_id
|
230 |
})
|
231 |
processed_lines.add(var_start)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
elif isinstance(child, ast.Return):
|
233 |
for value in ast.walk(child):
|
234 |
if isinstance(value, ast.Name):
|
|
|
236 |
if var_start not in processed_lines and not is_blank_or_comment(lines[var_start - 1]):
|
237 |
counters['returned_variable'] += 1
|
238 |
var_node_id = f"ReturnedVariable[{counters['returned_variable']}]"
|
|
|
239 |
parts.append({
|
240 |
'category': 'returned_variable',
|
241 |
'source': lines[var_start - 1],
|
242 |
'location': (var_start, var_start),
|
243 |
'level': level + 1,
|
244 |
+
'vector': create_vector('returned_variable', level + 1, (var_start, var_start), total_lines, current_path),
|
245 |
'parent_path': f"{current_path[0]} -> {var_node_id}",
|
246 |
'node_id': var_node_id
|
247 |
})
|
|
|
267 |
try:
|
268 |
tree = ast.parse(code)
|
269 |
except SyntaxError:
|
270 |
+
return [{'category': 'error', 'source': 'Invalid Python code', 'location': (1, 1), 'level': 0, 'vector': [0, 0, 1.0, 0.0, 0, 0], 'parent_path': 'Top-Level', 'node_id': 'Error[1]'}]
|
271 |
|
272 |
parts = []
|
273 |
prev_end = 0
|