Spaces:
Running
Running
Update parser.py
Browse files
parser.py
CHANGED
@@ -24,69 +24,82 @@ def get_category(node):
|
|
24 |
else:
|
25 |
return 'other'
|
26 |
|
27 |
-
def create_vector(category, level, location, total_lines):
|
28 |
-
"""Create a vector representation
|
29 |
-
# Vector: [category_id, level, start_line_normalized, end_line_normalized]
|
30 |
category_map = {
|
31 |
'import': 1, 'assignment': 2, 'function': 3, 'async_function': 4, 'class': 5,
|
32 |
'if': 6, 'while': 7, 'for': 8, 'expression': 9, 'spacer': 10, 'other': 11
|
33 |
}
|
34 |
category_id = category_map.get(category, 0)
|
35 |
start_line, end_line = location
|
|
|
|
|
|
|
36 |
return [
|
37 |
category_id,
|
38 |
level,
|
39 |
-
start_line / total_lines,
|
40 |
-
end_line / total_lines
|
|
|
|
|
41 |
]
|
42 |
|
43 |
-
def parse_node(node, lines, prev_end, level=0, total_lines=None):
|
44 |
-
"""Recursively parse an AST node
|
45 |
if total_lines is None:
|
46 |
total_lines = len(lines)
|
|
|
|
|
|
|
|
|
47 |
|
48 |
parts = []
|
49 |
start_line = getattr(node, 'lineno', prev_end + 1)
|
50 |
end_line = getattr(node, 'end_lineno', start_line)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
# Handle spacers before the node
|
53 |
if start_line > prev_end + 1:
|
54 |
spacer_lines = lines[prev_end:start_line - 1]
|
55 |
-
spacer_vector = create_vector('spacer', level, (prev_end + 1, start_line - 1), total_lines)
|
56 |
parts.append({
|
57 |
'category': 'spacer',
|
58 |
'source': ''.join(spacer_lines),
|
59 |
'location': (prev_end + 1, start_line - 1),
|
60 |
'level': level,
|
61 |
-
'vector': spacer_vector
|
|
|
|
|
62 |
})
|
63 |
|
64 |
# Capture the node's source
|
65 |
stmt_lines = lines[start_line - 1:end_line]
|
66 |
-
|
|
|
67 |
parts.append({
|
68 |
-
'category':
|
69 |
'source': ''.join(stmt_lines),
|
70 |
'location': (start_line, end_line),
|
71 |
'level': level,
|
72 |
-
'vector': node_vector
|
|
|
|
|
73 |
})
|
74 |
|
75 |
-
# Process nested nodes (
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
# Handle additional bodies (e.g., else, elif, orelse for loops)
|
84 |
-
if hasattr(node, 'orelse') and node.orelse:
|
85 |
-
orelse_prev_end = parts[-1]['location'][1]
|
86 |
-
for child in node.orelse:
|
87 |
-
child_parts = parse_node(child, lines, orelse_prev_end, level + 1, total_lines)
|
88 |
-
parts.extend(child_parts)
|
89 |
-
orelse_prev_end = child_parts[-1]['location'][1]
|
90 |
|
91 |
return parts
|
92 |
|
@@ -97,7 +110,7 @@ def parse_python_code(code):
|
|
97 |
try:
|
98 |
tree = ast.parse(code)
|
99 |
except SyntaxError:
|
100 |
-
return [{'category': 'error', 'source': 'Invalid Python code', 'location': (1, 1), 'level': 0, 'vector': [0, 0, 1.0, 1.0]}]
|
101 |
|
102 |
parts = []
|
103 |
prev_end = 0
|
@@ -107,16 +120,17 @@ def parse_python_code(code):
|
|
107 |
parts.extend(stmt_parts)
|
108 |
prev_end = stmt_parts[-1]['location'][1]
|
109 |
|
110 |
-
# Capture trailing spacers
|
111 |
if prev_end < total_lines:
|
112 |
remaining_lines = lines[prev_end:]
|
113 |
-
spacer_vector = create_vector('spacer', 0, (prev_end + 1, total_lines + 1), total_lines)
|
114 |
parts.append({
|
115 |
'category': 'spacer',
|
116 |
'source': ''.join(remaining_lines),
|
117 |
'location': (prev_end + 1, total_lines + 1),
|
118 |
'level': 0,
|
119 |
-
'vector': spacer_vector
|
|
|
|
|
120 |
})
|
121 |
|
122 |
return parts
|
|
|
24 |
else:
|
25 |
return 'other'
|
26 |
|
27 |
+
def create_vector(category, level, location, total_lines, parent_path=None):
    """Create a vector representation including hierarchy info.

    Args:
        category: Category name of the part (e.g. 'import', 'function',
            'spacer'). Names missing from the category map get id 0.
        level: Nesting level of the part.
        location: ``(start_line, end_line)`` tuple.
        total_lines: Line count used to normalize both line positions.
        parent_path: Sequence of ancestor node-id strings. ``None`` is
            treated as an empty path.

    Returns:
        A six-element list: ``[category_id, level, start_line / total_lines,
        end_line / total_lines, parent_depth, path_hash]`` where
        ``path_hash`` lies in ``[0.0, 1.0)``.
    """
    # Local import keeps this block self-contained without touching the
    # file's import section.
    import zlib

    category_map = {
        'import': 1, 'assignment': 2, 'function': 3, 'async_function': 4, 'class': 5,
        'if': 6, 'while': 7, 'for': 8, 'expression': 9, 'spacer': 10, 'other': 11
    }
    category_id = category_map.get(category, 0)
    start_line, end_line = location

    path = list(parent_path) if parent_path is not None else []
    parent_depth = len(path)

    # The built-in hash() of a str is salted per interpreter process, so it
    # would yield a different vector component on every run. CRC32 is stable
    # across runs and machines, and (like hash('')) is 0 for the empty path.
    path_bytes = ' -> '.join(path).encode('utf-8')
    parent_hash = zlib.crc32(path_bytes) % 1000

    return [
        category_id,
        level,
        start_line / total_lines,
        end_line / total_lines,
        parent_depth,
        parent_hash / 1000,  # Normalized hash (0.0 to <1.0)
    ]
|
46 |
|
47 |
+
def parse_node(node, lines, prev_end, level=0, total_lines=None, parent_path=None, counters=None):
    """Recursively flatten an AST node into a list of part dictionaries.

    Each part carries 'category', 'source', 'location', 'level', 'vector',
    'parent_path' and 'node_id'. A 'spacer' part is emitted first when there
    is a gap between ``prev_end`` and the node's first line; the node's own
    part follows, then the parts of every child found in ``body`` and
    ``orelse`` (one level deeper).

    Args:
        node: AST node to process.
        lines: Source lines; joined with '' to build each part's source.
        prev_end: 1-based line number where the previous part ended.
        level: Nesting level recorded on the emitted parts.
        total_lines: Line count used for normalization; defaults to len(lines).
        parent_path: List of ancestor node ids; defaults to an empty list.
        counters: Per-category counters shared across the recursion, used to
            number 'if', 'while', 'for', 'function' and 'class' nodes.

    Returns:
        The list of part dictionaries, in emission order.
    """
    total_lines = len(lines) if total_lines is None else total_lines
    parent_path = [] if parent_path is None else parent_path
    if counters is None:
        # Track counts per type
        counters = {'if': 0, 'while': 0, 'for': 0, 'function': 0, 'class': 0}

    first_line = getattr(node, 'lineno', prev_end + 1)
    last_line = getattr(node, 'end_lineno', first_line)
    kind = get_category(node)

    # Only counted categories receive an id such as "If[2]".
    if kind in counters:
        counters[kind] += 1
        node_id = f"{kind.capitalize()}[{counters[kind]}]"
    else:
        node_id = ''

    parent_label = ' -> '.join(parent_path) if parent_path else 'Top-Level'

    def build_part(part_kind, span, vector_path, part_id):
        # span is a 1-based inclusive (first, last) pair of line numbers.
        return {
            'category': part_kind,
            'source': ''.join(lines[span[0] - 1:span[1]]),
            'location': span,
            'level': level,
            'vector': create_vector(part_kind, level, span, total_lines, vector_path),
            'parent_path': parent_label,
            'node_id': part_id,
        }

    collected = []

    # Lines skipped between the previous part and this node become a spacer.
    if first_line - prev_end > 1:
        collected.append(build_part('spacer', (prev_end + 1, first_line - 1), parent_path, ''))

    # The node's own vector is built from the path that includes the node itself.
    own_path = parent_path + [node_id] if node_id else parent_path + []
    collected.append(build_part(kind, (first_line, last_line), own_path, node_id))

    # Walk body, then orelse, threading the end line of the last emitted child.
    cursor = last_line - 1
    for field in ('body', 'orelse'):
        for child in getattr(node, field, None) or ():
            nested = parse_node(child, lines, cursor, level + 1, total_lines, own_path, counters)
            collected.extend(nested)
            cursor = nested[-1]['location'][1]

    return collected
|
105 |
|
|
|
110 |
try:
|
111 |
tree = ast.parse(code)
|
112 |
except SyntaxError:
|
113 |
+
return [{'category': 'error', 'source': 'Invalid Python code', 'location': (1, 1), 'level': 0, 'vector': [0, 0, 1.0, 1.0, 0, 0], 'parent_path': 'Top-Level', 'node_id': ''}]
|
114 |
|
115 |
parts = []
|
116 |
prev_end = 0
|
|
|
120 |
parts.extend(stmt_parts)
|
121 |
prev_end = stmt_parts[-1]['location'][1]
|
122 |
|
|
|
123 |
if prev_end < total_lines:
|
124 |
remaining_lines = lines[prev_end:]
|
125 |
+
spacer_vector = create_vector('spacer', 0, (prev_end + 1, total_lines + 1), total_lines, [])
|
126 |
parts.append({
|
127 |
'category': 'spacer',
|
128 |
'source': ''.join(remaining_lines),
|
129 |
'location': (prev_end + 1, total_lines + 1),
|
130 |
'level': 0,
|
131 |
+
'vector': spacer_vector,
|
132 |
+
'parent_path': 'Top-Level',
|
133 |
+
'node_id': ''
|
134 |
})
|
135 |
|
136 |
return parts
|