Spaces:
Running
Running
Update parser.py
Browse files
parser.py
CHANGED
@@ -2,15 +2,11 @@
|
|
2 |
import ast
|
3 |
|
4 |
def get_category(node):
|
5 |
-
"""Determine the category of an AST node."""
|
6 |
if isinstance(node, (ast.Import, ast.ImportFrom)):
|
7 |
return 'import'
|
8 |
-
elif isinstance(node,
|
9 |
-
return 'assignment'
|
10 |
-
elif isinstance(node, ast.FunctionDef):
|
11 |
return 'function'
|
12 |
-
elif isinstance(node, ast.AsyncFunctionDef):
|
13 |
-
return 'async_function'
|
14 |
elif isinstance(node, ast.ClassDef):
|
15 |
return 'class'
|
16 |
elif isinstance(node, ast.If):
|
@@ -27,15 +23,30 @@ def get_category(node):
|
|
27 |
return 'expression'
|
28 |
elif isinstance(node, ast.ExceptHandler):
|
29 |
return 'except'
|
|
|
|
|
30 |
else:
|
31 |
return 'other'
|
32 |
|
33 |
-
def
|
34 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
category_map = {
|
36 |
-
'import': 1, '
|
37 |
-
'if':
|
38 |
-
'other':
|
|
|
39 |
}
|
40 |
category_id = category_map.get(category, 0)
|
41 |
start_line, end_line = location
|
@@ -44,7 +55,16 @@ def create_vector(category, level, location, total_lines, parent_path):
|
|
44 |
parent_depth = len(parent_path)
|
45 |
parent_weight = sum(category_map.get(parent.split('[')[0].lower(), 0) * (1 / (i + 1))
|
46 |
for i, parent in enumerate(parent_path)) / max(1, len(category_map))
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
def is_blank_or_comment(line):
|
50 |
"""Check if a line is blank or a comment."""
|
@@ -57,7 +77,7 @@ def parse_node(node, lines, prev_end, level=0, total_lines=None, parent_path=Non
|
|
57 |
if parent_path is None:
|
58 |
parent_path = []
|
59 |
if counters is None:
|
60 |
-
counters = {cat: 0 for cat in ['import', '
|
61 |
if processed_lines is None:
|
62 |
processed_lines = set()
|
63 |
|
@@ -107,6 +127,25 @@ def parse_node(node, lines, prev_end, level=0, total_lines=None, parent_path=Non
|
|
107 |
})
|
108 |
processed_lines.add(start_line)
|
109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
# Process nested bodies
|
111 |
nested_prev_end = start_line
|
112 |
for attr in ('body', 'orelse', 'handlers', 'finalbody'):
|
@@ -168,6 +207,43 @@ def parse_node(node, lines, prev_end, level=0, total_lines=None, parent_path=Non
|
|
168 |
parts.extend(child_parts)
|
169 |
nested_prev_end = max(nested_prev_end, child_parts[-1]['location'][1] if child_parts else child_start)
|
170 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
child_parts = parse_node(child, lines, nested_prev_end, level + 1, total_lines, current_path, counters, processed_lines)
|
172 |
parts.extend(child_parts)
|
173 |
nested_prev_end = child_parts[-1]['location'][1] if child_parts else nested_prev_end
|
|
|
2 |
import ast
|
3 |
|
4 |
def get_category(node):
|
5 |
+
"""Determine the category of an AST node or variable context."""
|
6 |
if isinstance(node, (ast.Import, ast.ImportFrom)):
|
7 |
return 'import'
|
8 |
+
elif isinstance(node, ast.FunctionDef) or isinstance(node, ast.AsyncFunctionDef):
|
|
|
|
|
9 |
return 'function'
|
|
|
|
|
10 |
elif isinstance(node, ast.ClassDef):
|
11 |
return 'class'
|
12 |
elif isinstance(node, ast.If):
|
|
|
23 |
return 'expression'
|
24 |
elif isinstance(node, ast.ExceptHandler):
|
25 |
return 'except'
|
26 |
+
elif isinstance(node, ast.Assign) or isinstance(node, ast.AnnAssign) or isinstance(node, ast.AugAssign):
|
27 |
+
return 'assigned_variable'
|
28 |
else:
|
29 |
return 'other'
|
30 |
|
31 |
+
def get_variable_role(node, parent):
    """Classify a variable node by the role it plays relative to ``parent``.

    Args:
        node: The AST node being classified (an ``ast.arg`` or ``ast.Name``
            for any non-``None`` result).
        parent: The AST node supplying the context for ``node``.

    Returns:
        ``'input_variable'`` when ``parent`` is a (possibly async) function
        definition and ``node`` is an ``ast.arg``;
        ``'returned_variable'`` when ``parent`` is an ``ast.Return`` and
        ``node`` is an ``ast.Name``;
        ``'assigned_variable'`` when ``parent`` is an assignment statement and
        ``node`` is an ``ast.Name`` whose ``id`` equals the ``id`` of one of
        the statement's direct ``ast.Name`` targets;
        ``None`` in every other case.
    """
    if isinstance(parent, (ast.FunctionDef, ast.AsyncFunctionDef)):
        if isinstance(node, ast.arg):
            return 'input_variable'
    elif isinstance(parent, ast.Return):
        if isinstance(node, ast.Name):
            return 'returned_variable'
    elif isinstance(parent, (ast.Assign, ast.AnnAssign, ast.AugAssign)):
        # Only ast.Assign carries a ``targets`` list; AnnAssign and AugAssign
        # expose a single ``target``. Normalize so all three are handled
        # instead of raising AttributeError on the latter two.
        if isinstance(parent, ast.Assign):
            targets = parent.targets
        else:
            targets = [parent.target]
        # Matching is by identifier, and only direct Name targets are
        # considered (tuple/attribute/subscript targets are not descended).
        if isinstance(node, ast.Name) and any(
            isinstance(target, ast.Name) and target.id == node.id
            for target in targets
        ):
            return 'assigned_variable'
    return None
|
42 |
+
|
43 |
+
def create_vector(category, level, location, total_lines, parent_path, variable_info=None):
|
44 |
+
"""Create a vector optimized for role similarity, including variable info if applicable."""
|
45 |
category_map = {
|
46 |
+
'import': 1, 'function': 2, 'async_function': 3, 'class': 4,
|
47 |
+
'if': 5, 'while': 6, 'for': 7, 'try': 8, 'expression': 9, 'spacer': 10,
|
48 |
+
'other': 11, 'elif': 12, 'else': 13, 'except': 14, 'finally': 15, 'return': 16,
|
49 |
+
'assigned_variable': 17, 'input_variable': 18, 'returned_variable': 19
|
50 |
}
|
51 |
category_id = category_map.get(category, 0)
|
52 |
start_line, end_line = location
|
|
|
55 |
parent_depth = len(parent_path)
|
56 |
parent_weight = sum(category_map.get(parent.split('[')[0].lower(), 0) * (1 / (i + 1))
|
57 |
for i, parent in enumerate(parent_path)) / max(1, len(category_map))
|
58 |
+
|
59 |
+
# Extend vector with variable-specific info if applicable
|
60 |
+
variable_role_id = 0
|
61 |
+
variable_name = ''
|
62 |
+
if variable_info:
|
63 |
+
role_map = {'input_variable': 1, 'assigned_variable': 2, 'returned_variable': 3}
|
64 |
+
variable_role_id = role_map.get(variable_info['role'], 0)
|
65 |
+
variable_name = variable_info['name'][:10] # Truncate for brevity, hash if needed
|
66 |
+
|
67 |
+
return [category_id, level, center_pos, span, parent_depth, parent_weight, variable_role_id, hash(variable_name) % 1000 / 1000]
|
68 |
|
69 |
def is_blank_or_comment(line):
|
70 |
"""Check if a line is blank or a comment."""
|
|
|
77 |
if parent_path is None:
|
78 |
parent_path = []
|
79 |
if counters is None:
|
80 |
+
counters = {cat: 0 for cat in ['import', 'function', 'async_function', 'class', 'if', 'while', 'for', 'try', 'return', 'expression', 'other', 'spacer', 'elif', 'else', 'except', 'finally', 'assigned_variable', 'input_variable', 'returned_variable']}
|
81 |
if processed_lines is None:
|
82 |
processed_lines = set()
|
83 |
|
|
|
127 |
})
|
128 |
processed_lines.add(start_line)
|
129 |
|
130 |
+
# Handle variables in function definitions (input variables)
|
131 |
+
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.args.args:
|
132 |
+
for arg in node.args.args:
|
133 |
+
var_start = start_line # Assume args are on the same line as function def for simplicity
|
134 |
+
if var_start not in processed_lines:
|
135 |
+
counters['input_variable'] += 1
|
136 |
+
var_node_id = f"InputVariable[{counters['input_variable']}]"
|
137 |
+
var_info = {'role': 'input_variable', 'name': arg.arg}
|
138 |
+
parts.append({
|
139 |
+
'category': 'input_variable',
|
140 |
+
'source': f" {arg.arg},", # Indented as part of function
|
141 |
+
'location': (var_start, var_start),
|
142 |
+
'level': level + 1,
|
143 |
+
'vector': create_vector('input_variable', level + 1, (var_start, var_start), total_lines, current_path, var_info),
|
144 |
+
'parent_path': f"{current_path[0]} -> {var_node_id}",
|
145 |
+
'node_id': var_node_id
|
146 |
+
})
|
147 |
+
processed_lines.add(var_start)
|
148 |
+
|
149 |
# Process nested bodies
|
150 |
nested_prev_end = start_line
|
151 |
for attr in ('body', 'orelse', 'handlers', 'finalbody'):
|
|
|
207 |
parts.extend(child_parts)
|
208 |
nested_prev_end = max(nested_prev_end, child_parts[-1]['location'][1] if child_parts else child_start)
|
209 |
else:
|
210 |
+
# Handle assignments and returns for variable detection
|
211 |
+
if isinstance(child, ast.Assign) or isinstance(child, ast.AnnAssign) or isinstance(child, ast.AugAssign):
|
212 |
+
for target in child.targets:
|
213 |
+
if isinstance(target, ast.Name):
|
214 |
+
var_start = child.lineno
|
215 |
+
if var_start not in processed_lines and not is_blank_or_comment(lines[var_start - 1]):
|
216 |
+
counters['assigned_variable'] += 1
|
217 |
+
var_node_id = f"AssignedVariable[{counters['assigned_variable']}]"
|
218 |
+
var_info = {'role': 'assigned_variable', 'name': target.id}
|
219 |
+
parts.append({
|
220 |
+
'category': 'assigned_variable',
|
221 |
+
'source': lines[var_start - 1],
|
222 |
+
'location': (var_start, var_start),
|
223 |
+
'level': level + 1,
|
224 |
+
'vector': create_vector('assigned_variable', level + 1, (var_start, var_start), total_lines, current_path, var_info),
|
225 |
+
'parent_path': f"{current_path[0]} -> {var_node_id}",
|
226 |
+
'node_id': var_node_id
|
227 |
+
})
|
228 |
+
processed_lines.add(var_start)
|
229 |
+
elif isinstance(child, ast.Return):
|
230 |
+
for value in ast.walk(child):
|
231 |
+
if isinstance(value, ast.Name):
|
232 |
+
var_start = child.lineno
|
233 |
+
if var_start not in processed_lines and not is_blank_or_comment(lines[var_start - 1]):
|
234 |
+
counters['returned_variable'] += 1
|
235 |
+
var_node_id = f"ReturnedVariable[{counters['returned_variable']}]"
|
236 |
+
var_info = {'role': 'returned_variable', 'name': value.id}
|
237 |
+
parts.append({
|
238 |
+
'category': 'returned_variable',
|
239 |
+
'source': lines[var_start - 1],
|
240 |
+
'location': (var_start, var_start),
|
241 |
+
'level': level + 1,
|
242 |
+
'vector': create_vector('returned_variable', level + 1, (var_start, var_start), total_lines, current_path, var_info),
|
243 |
+
'parent_path': f"{current_path[0]} -> {var_node_id}",
|
244 |
+
'node_id': var_node_id
|
245 |
+
})
|
246 |
+
processed_lines.add(var_start)
|
247 |
child_parts = parse_node(child, lines, nested_prev_end, level + 1, total_lines, current_path, counters, processed_lines)
|
248 |
parts.extend(child_parts)
|
249 |
nested_prev_end = child_parts[-1]['location'][1] if child_parts else nested_prev_end
|