broadfield-dev committed on
Commit
4039137
·
verified ·
1 Parent(s): dd6ee14

Update parser.py

Browse files
Files changed (1) hide show
  1. parser.py +45 -31
parser.py CHANGED
@@ -24,69 +24,82 @@ def get_category(node):
24
  else:
25
  return 'other'
26
 
27
- def create_vector(category, level, location, total_lines):
28
- """Create a vector representation for a code part."""
29
- # Vector: [category_id, level, start_line_normalized, end_line_normalized]
30
  category_map = {
31
  'import': 1, 'assignment': 2, 'function': 3, 'async_function': 4, 'class': 5,
32
  'if': 6, 'while': 7, 'for': 8, 'expression': 9, 'spacer': 10, 'other': 11
33
  }
34
  category_id = category_map.get(category, 0)
35
  start_line, end_line = location
 
 
 
36
  return [
37
  category_id,
38
  level,
39
- start_line / total_lines, # Normalized start position
40
- end_line / total_lines # Normalized end position
 
 
41
  ]
42
 
43
- def parse_node(node, lines, prev_end, level=0, total_lines=None):
44
- """Recursively parse an AST node and its children, assigning hierarchy levels."""
45
  if total_lines is None:
46
  total_lines = len(lines)
 
 
 
 
47
 
48
  parts = []
49
  start_line = getattr(node, 'lineno', prev_end + 1)
50
  end_line = getattr(node, 'end_lineno', start_line)
 
 
 
 
 
 
 
51
 
52
  # Handle spacers before the node
53
  if start_line > prev_end + 1:
54
  spacer_lines = lines[prev_end:start_line - 1]
55
- spacer_vector = create_vector('spacer', level, (prev_end + 1, start_line - 1), total_lines)
56
  parts.append({
57
  'category': 'spacer',
58
  'source': ''.join(spacer_lines),
59
  'location': (prev_end + 1, start_line - 1),
60
  'level': level,
61
- 'vector': spacer_vector
 
 
62
  })
63
 
64
  # Capture the node's source
65
  stmt_lines = lines[start_line - 1:end_line]
66
- node_vector = create_vector(get_category(node), level, (start_line, end_line), total_lines)
 
67
  parts.append({
68
- 'category': get_category(node),
69
  'source': ''.join(stmt_lines),
70
  'location': (start_line, end_line),
71
  'level': level,
72
- 'vector': node_vector
 
 
73
  })
74
 
75
- # Process nested nodes (e.g., class/function bodies, control structures)
76
- if hasattr(node, 'body'):
77
- nested_prev_end = end_line - 1
78
- for child in node.body:
79
- child_parts = parse_node(child, lines, nested_prev_end, level + 1, total_lines)
80
- parts.extend(child_parts)
81
- nested_prev_end = child_parts[-1]['location'][1]
82
-
83
- # Handle additional bodies (e.g., else, elif, orelse for loops)
84
- if hasattr(node, 'orelse') and node.orelse:
85
- orelse_prev_end = parts[-1]['location'][1]
86
- for child in node.orelse:
87
- child_parts = parse_node(child, lines, orelse_prev_end, level + 1, total_lines)
88
- parts.extend(child_parts)
89
- orelse_prev_end = child_parts[-1]['location'][1]
90
 
91
  return parts
92
 
@@ -97,7 +110,7 @@ def parse_python_code(code):
97
  try:
98
  tree = ast.parse(code)
99
  except SyntaxError:
100
- return [{'category': 'error', 'source': 'Invalid Python code', 'location': (1, 1), 'level': 0, 'vector': [0, 0, 1.0, 1.0]}]
101
 
102
  parts = []
103
  prev_end = 0
@@ -107,16 +120,17 @@ def parse_python_code(code):
107
  parts.extend(stmt_parts)
108
  prev_end = stmt_parts[-1]['location'][1]
109
 
110
- # Capture trailing spacers
111
  if prev_end < total_lines:
112
  remaining_lines = lines[prev_end:]
113
- spacer_vector = create_vector('spacer', 0, (prev_end + 1, total_lines + 1), total_lines)
114
  parts.append({
115
  'category': 'spacer',
116
  'source': ''.join(remaining_lines),
117
  'location': (prev_end + 1, total_lines + 1),
118
  'level': 0,
119
- 'vector': spacer_vector
 
 
120
  })
121
 
122
  return parts
 
24
  else:
25
  return 'other'
26
 
27
def create_vector(category, level, location, total_lines, parent_path):
    """Create a vector representation including hierarchy info.

    Args:
        category: Category name of the code part (e.g. 'function', 'spacer').
            Names that are not in the category map are encoded as 0.
        level: Nesting level of the part; stored in the vector unchanged.
        location: ``(start_line, end_line)`` pair for the part.
        total_lines: Line count used to normalize the two positions.
        parent_path: Sequence of ancestor node-ID strings (may be empty).

    Returns:
        A six-element list:
        ``[category_id, level, start_line / total_lines,
        end_line / total_lines, parent_depth, path_fingerprint]`` where
        ``path_fingerprint`` lies in the range 0.0 to 0.999.
    """
    # Imported locally so the function does not depend on a module-level import.
    import zlib

    category_map = {
        'import': 1, 'assignment': 2, 'function': 3, 'async_function': 4, 'class': 5,
        'if': 6, 'while': 7, 'for': 8, 'expression': 9, 'spacer': 10, 'other': 11
    }
    category_id = category_map.get(category, 0)
    start_line, end_line = location

    # Guard the normalization so an empty source (0 lines) cannot raise
    # ZeroDivisionError.
    line_count = max(total_lines, 1)

    # Extend the vector with the parent path length and a fingerprint of the path.
    # The built-in hash() of a str is salted per interpreter process, so it would
    # give a different value on every run; CRC-32 is stable across runs.
    parent_depth = len(parent_path)
    path_text = ' -> '.join(parent_path)
    parent_hash = zlib.crc32(path_text.encode('utf-8')) % 1000

    return [
        category_id,
        level,
        start_line / line_count,
        end_line / line_count,
        parent_depth,
        parent_hash / 1000  # Scaled fingerprint, 0.0 to 0.999
    ]
46
 
47
def parse_node(node, lines, prev_end, level=0, total_lines=None, parent_path=None, counters=None):
    """Recursively split an AST node into parts, recording its place in the hierarchy.

    Args:
        node: AST node to process; ``lineno`` / ``end_lineno`` are read when present.
        lines: Source lines of the whole file (joined with '' to rebuild text).
        prev_end: 1-based line number where the previously emitted part ended.
        level: Nesting depth assigned to this node; children get ``level + 1``.
        total_lines: Line count passed to ``create_vector``; defaults to ``len(lines)``.
        parent_path: List of ancestor node IDs; defaults to an empty list.
        counters: Per-category counters shared across the recursion and mutated
            in place to number 'if', 'while', 'for', 'function' and 'class' nodes.

    Returns:
        A list of part dicts with the keys 'category', 'source', 'location',
        'level', 'vector', 'parent_path' and 'node_id'. An optional leading
        spacer part comes first, then the node itself, then the parts of every
        child found in ``body`` and ``orelse``.
    """
    if total_lines is None:
        total_lines = len(lines)
    if parent_path is None:
        parent_path = []
    if counters is None:
        # One running counter per category that receives a node ID.
        counters = {'if': 0, 'while': 0, 'for': 0, 'function': 0, 'class': 0}

    first_line = getattr(node, 'lineno', prev_end + 1)
    last_line = getattr(node, 'end_lineno', first_line)
    category = get_category(node)

    # Only categories tracked in `counters` receive an ID such as "If[2]".
    if category in counters:
        counters[category] += 1
        node_id = f"{category.capitalize()}[{counters[category]}]"
    else:
        node_id = ''

    # The spacer and the node both report the path of their ancestors.
    path_label = ' -> '.join(parent_path) if parent_path else 'Top-Level'

    parts = []

    # Lines skipped between the previous part and this node become a spacer.
    if prev_end + 1 < first_line:
        gap = (prev_end + 1, first_line - 1)
        parts.append({
            'category': 'spacer',
            'source': ''.join(lines[prev_end:first_line - 1]),
            'location': gap,
            'level': level,
            'vector': create_vector('spacer', level, gap, total_lines, parent_path),
            'parent_path': path_label,
            'node_id': ''
        })

    # The node's own vector is built from a path that includes its own ID.
    current_path = parent_path + [node_id] if node_id else parent_path + []
    span = (first_line, last_line)
    parts.append({
        'category': category,
        'source': ''.join(lines[first_line - 1:last_line]),
        'location': span,
        'level': level,
        'vector': create_vector(category, level, span, total_lines, current_path),
        'parent_path': path_label,
        'node_id': node_id
    })

    # Walk `body` then `orelse`, carrying one cursor across both lists.
    # NOTE(review): the cursor starts at end_line - 1, so a spacer before the
    # first child is emitted only if that child starts after the node's own
    # end line — confirm this is intended.
    cursor = last_line - 1
    for attr in ('body', 'orelse'):
        for child in getattr(node, attr, None) or ():
            child_parts = parse_node(child, lines, cursor, level + 1, total_lines, current_path, counters)
            parts.extend(child_parts)
            cursor = child_parts[-1]['location'][1]

    return parts
105
 
 
110
  try:
111
  tree = ast.parse(code)
112
  except SyntaxError:
113
+ return [{'category': 'error', 'source': 'Invalid Python code', 'location': (1, 1), 'level': 0, 'vector': [0, 0, 1.0, 1.0, 0, 0], 'parent_path': 'Top-Level', 'node_id': ''}]
114
 
115
  parts = []
116
  prev_end = 0
 
120
  parts.extend(stmt_parts)
121
  prev_end = stmt_parts[-1]['location'][1]
122
 
 
123
  if prev_end < total_lines:
124
  remaining_lines = lines[prev_end:]
125
+ spacer_vector = create_vector('spacer', 0, (prev_end + 1, total_lines + 1), total_lines, [])
126
  parts.append({
127
  'category': 'spacer',
128
  'source': ''.join(remaining_lines),
129
  'location': (prev_end + 1, total_lines + 1),
130
  'level': 0,
131
+ 'vector': spacer_vector,
132
+ 'parent_path': 'Top-Level',
133
+ 'node_id': ''
134
  })
135
 
136
  return parts