File size: 4,430 Bytes
5a58620
 
 
 
bba9630
5a58620
 
 
 
 
 
 
 
 
 
217d002
 
 
 
 
 
5a58620
 
 
 
 
217d002
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bba9630
217d002
 
 
bba9630
 
 
 
 
 
 
217d002
bba9630
 
 
 
217d002
 
bba9630
 
 
 
217d002
bba9630
 
 
 
217d002
 
bba9630
5a58620
217d002
bba9630
 
 
217d002
bba9630
 
5a58620
217d002
 
 
 
 
 
 
 
bba9630
5a58620
bba9630
217d002
5a58620
217d002
bba9630
 
 
217d002
5a58620
 
 
 
 
217d002
bba9630
 
5a58620
bba9630
217d002
5a58620
217d002
5a58620
 
 
217d002
 
 
5a58620
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# parser.py
import ast

def get_category(node):
    """Return the category label for an AST node.

    The node is tested against a fixed, ordered list of ``ast`` node types
    and the label of the first match is returned. Anything that matches
    none of them (including non-AST objects) is labelled ``'other'``.
    """
    # (node types, label) pairs, checked top to bottom.
    label_table = (
        ((ast.Import, ast.ImportFrom), 'import'),
        ((ast.Assign, ast.AnnAssign, ast.AugAssign), 'assignment'),
        (ast.FunctionDef, 'function'),
        (ast.AsyncFunctionDef, 'async_function'),
        (ast.ClassDef, 'class'),
        (ast.If, 'if'),
        (ast.While, 'while'),
        (ast.For, 'for'),
        (ast.Expr, 'expression'),
    )
    for node_types, label in label_table:
        if isinstance(node, node_types):
            return label
    return 'other'

def create_vector(category, level, location, total_lines):
    """Build the numeric vector describing one code part.

    The result is ``[category_id, level, start / total_lines,
    end / total_lines]`` where ``(start, end)`` is ``location``.
    Categories are numbered from 1 in the order listed below; a category
    that is not listed gets id 0. ``level`` is passed through unchanged.

    Raises ZeroDivisionError if ``total_lines`` is 0.
    """
    # Position in this tuple (1-based) is the category id.
    known_categories = (
        'import', 'assignment', 'function', 'async_function', 'class',
        'if', 'while', 'for', 'expression', 'spacer', 'other',
    )
    ids_by_name = {name: number for number, name in enumerate(known_categories, start=1)}
    category_id = ids_by_name.get(category, 0)

    first_line, last_line = location
    normalized_start = first_line / total_lines
    normalized_end = last_line / total_lines
    return [category_id, level, normalized_start, normalized_end]

def parse_node(node, lines, prev_end, level=0, total_lines=None):
    """Recursively split an AST node and its nested statements into parts.

    Args:
        node: AST node to describe. Its ``lineno``/``end_lineno`` give the
            1-based line span; if ``lineno`` is missing the node is assumed
            to start right after ``prev_end``.
        lines: Source lines (with line endings kept) that the line numbers
            index into.
        prev_end: 1-based number of the last line already covered by an
            earlier part (0 if nothing has been covered yet).
        level: Nesting depth recorded on the parts emitted for ``node``;
            nested statements are emitted with ``level + 1``.
        total_lines: Line count used to normalize positions in the vectors.
            Defaults to ``len(lines)``.

    Returns:
        A non-empty list of part dicts with the keys ``category``,
        ``source``, ``location`` (``(start, end)``, inclusive), ``level``
        and ``vector``. If there is a gap between ``prev_end`` and the node,
        a ``'spacer'`` part for the gap comes first; then the part for the
        node itself; then the parts of the statements nested in its
        ``body``, ``handlers``, ``orelse`` and ``finalbody`` in that order.
        Bodies of ``match`` cases are not descended into.
    """
    if total_lines is None:
        total_lines = len(lines)

    parts = []
    start_line = getattr(node, 'lineno', prev_end + 1)
    end_line = getattr(node, 'end_lineno', start_line)

    # ``lineno`` of a decorated def/class points at the ``def``/``class``
    # keyword, so pull the start back to the first decorator. Otherwise the
    # decorator lines would be reported as a spacer instead of as part of
    # the definition.
    for decorator in getattr(node, 'decorator_list', None) or ():
        start_line = min(start_line, getattr(decorator, 'lineno', start_line))

    # Lines between the previously covered line and this node form a spacer.
    if start_line > prev_end + 1:
        spacer_location = (prev_end + 1, start_line - 1)
        parts.append({
            'category': 'spacer',
            'source': ''.join(lines[prev_end:start_line - 1]),
            'location': spacer_location,
            'level': level,
            'vector': create_vector('spacer', level, spacer_location, total_lines)
        })

    # The node itself, covering its full line span (nested statements included).
    category = get_category(node)
    node_location = (start_line, end_line)
    parts.append({
        'category': category,
        'source': ''.join(lines[start_line - 1:end_line]),
        'location': node_location,
        'level': level,
        'vector': create_vector(category, level, node_location, total_lines)
    })

    # Nested statement lists, visited in source order. ``handlers`` and
    # ``finalbody`` are included so that the last part returned for a
    # ``try`` statement ends where the statement ends; callers continue
    # from that part's end line and would otherwise re-emit the
    # except/finally text as a spacer.
    #
    # Starting from ``end_line - 1`` means no spacer can be emitted before
    # the first nested statement (the header lines already belong to the
    # node's own part); gaps between later siblings are still reported.
    child_prev_end = end_line - 1
    for field in ('body', 'handlers', 'orelse', 'finalbody'):
        children = getattr(node, field, None)
        # Expression nodes such as Lambda/IfExp carry a single node here,
        # not a list of statements.
        if not isinstance(children, list):
            continue
        for child in children:
            child_parts = parse_node(child, lines, child_prev_end, level + 1, total_lines)
            parts.extend(child_parts)
            child_prev_end = child_parts[-1]['location'][1]

    return parts

def parse_python_code(code):
    """Parse a Python source string into parts with hierarchy and vectors.

    Args:
        code: Python source text.

    Returns:
        A list of part dicts (keys ``category``, ``source``, ``location``,
        ``level``, ``vector``) produced by ``parse_node`` for every
        top-level statement, followed by one level-0 ``'spacer'`` part for
        any lines left after the last statement. If ``code`` cannot be
        parsed (``SyntaxError``), a single ``'error'`` part is returned
        instead.
    """
    lines = code.splitlines(keepends=True)
    total_lines = len(lines)
    try:
        tree = ast.parse(code)
    except SyntaxError:
        return [{'category': 'error', 'source': 'Invalid Python code', 'location': (1, 1), 'level': 0, 'vector': [0, 0, 1.0, 1.0]}]

    parts = []
    prev_end = 0

    for stmt in tree.body:
        stmt_parts = parse_node(stmt, lines, prev_end, total_lines=total_lines)
        parts.extend(stmt_parts)
        # Continue after the furthest line any returned part covers. The
        # last part is not guaranteed to end where the statement ends, and
        # using it alone can re-emit the statement's tail as a spacer.
        prev_end = max(part['location'][1] for part in stmt_parts)

    # Capture trailing lines after the last statement. They span
    # prev_end + 1 .. total_lines inclusive, matching the inclusive
    # (start, end) convention used for every other part.
    if prev_end < total_lines:
        trailing_location = (prev_end + 1, total_lines)
        parts.append({
            'category': 'spacer',
            'source': ''.join(lines[prev_end:]),
            'location': trailing_location,
            'level': 0,
            'vector': create_vector('spacer', 0, trailing_location, total_lines)
        })

    return parts