Spaces:

broadfield-dev
/

parse_py

Sleeping

File size: 4,044 Bytes

# database.py
import hashlib
import os
import textwrap

import chromadb

from parser import parse_python_code

def init_chromadb():
    """Create and return a new ChromaDB client.

    The client is built with ``chromadb.Client()`` and no arguments; per the
    original note this is the in-memory setup (persisting to disk would need
    a different client configuration).

    Returns:
        The newly created client object.
    """
    return chromadb.Client()

def create_collection(client, collection_name="python_programs"):
    """Return the named collection, creating it if it does not exist yet.

    Args:
        client: ChromaDB client (e.g. the object returned by ``init_chromadb``).
        collection_name: Name of the collection to fetch or create.

    Returns:
        The collection object handed back by the client.
    """
    # A single get-or-create call replaces the former get / bare-except /
    # create sequence. The bare ``except:`` caught everything (including
    # KeyboardInterrupt and genuine client failures) and treated it as
    # "collection missing", hiding the real error behind a create attempt.
    return client.get_or_create_collection(name=collection_name)

def store_program(client, code, sequence, vectors, collection_name="python_programs"):
    """Store one program in the collection and return its ID.

    Args:
        client: ChromaDB client used to reach the collection.
        code: Program source text; stored as the document.
        sequence: Iterable of operation-name strings; stored comma-joined
            under the ``"sequence"`` metadata key.
        vectors: Embedding data for the program; forwarded unchanged.
        collection_name: Name of the target collection.

    Returns:
        The program ID: the SHA-256 hex digest of the UTF-8 encoded ``code``.
    """
    collection = create_collection(client, collection_name)

    # The built-in hash() of a str is salted per interpreter process, so it
    # produced a different ID for identical code on every run. A content
    # digest is stable across runs and machines.
    program_id = hashlib.sha256(code.encode("utf-8")).hexdigest()

    # NOTE(review): ``vectors`` is passed as the single embedding for this
    # document, while populate_sample_db supplies a list of per-part vectors.
    # Confirm this nesting is what the collection expects.
    collection.add(
        documents=[code],
        metadatas=[{"sequence": ",".join(sequence)}],
        ids=[program_id],
        embeddings=[vectors],
    )
    return program_id

def populate_sample_db(client):
    """Seed the database with two small sample programs for testing.

    Each sample is parsed with ``parse_python_code`` and the result is stored
    through ``store_program`` in the default collection.

    Args:
        client: ChromaDB client the samples are stored with.
    """
    # The literals are indented to line up with this function body. Dedent
    # them so every sample is valid module-level Python rather than source
    # that begins with an unexpected indent.
    samples = [
        textwrap.dedent(
            """
            import os
            def add_one(x):
                y = x + 1
                return y
            """
        ),
        textwrap.dedent(
            """
            def multiply(a, b):
                c = a * b
                if c > 0:
                    return c
            """
        ),
    ]

    for code in samples:
        parts, sequence = parse_python_code(code)
        # One vector per parsed part, read from each part's 'vector' entry.
        vectors = [part['vector'] for part in parts]
        store_program(client, code, sequence, vectors)

def query_programs(client, operations, collection_name="python_programs", top_k=5):
    """Query the database for programs matching the operations sequence.

    Args:
        client: ChromaDB client used to reach the collection.
        operations: Ordered list of operation/category names to look for.
        collection_name: Name of the collection to search.
        top_k: Number of nearest neighbours requested from the collection.

    Returns:
        A list of dicts with keys ``'id'``, ``'code'`` and ``'similarity'``,
        one per returned program whose stored sequence contains
        ``operations`` as a subsequence, sorted by similarity (highest first).
    """
    collection = create_collection(client, collection_name)

    # Query vector = element-wise mean of the per-operation vectors.
    # (The previous ``sum(lists, []) / n`` concatenated the lists and then
    # tried to divide a list by an int, which raises TypeError.)
    if operations:
        op_vectors = [create_vector(op, 0, (1, 1), 100, []) for op in operations]
        query_vector = np.mean(op_vectors, axis=0).tolist()
    else:
        # Same length as the vectors create_vector returns.
        query_vector = [0, 0, 0, 0, 0, 0]

    # Embeddings are requested explicitly so similarity can be computed from
    # the stored vectors; IDs are always part of the query result.
    results = collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
        include=["documents", "metadatas", "embeddings"],
    )

    matching_programs = []
    rows = zip(
        results['ids'][0],
        results['documents'][0],
        results['metadatas'][0],
        results['embeddings'][0],
    )
    for program_id, doc, meta, embedding in rows:
        sequence = meta['sequence'].split(',')
        if not is_subsequence(operations, sequence):
            continue

        stored = np.asarray(embedding, dtype=float)
        if stored.size == 0:
            # No stored vector: compare against a zero vector instead.
            stored = np.zeros(len(query_vector))
        elif stored.ndim > 1:
            # Several per-part vectors: collapse them to their mean.
            stored = stored.mean(axis=0)

        similarity = float(
            cosine_similarity([query_vector], stored.reshape(1, -1))[0][0]
        )
        # The ID comes from the result's 'ids' list; the stored metadata only
        # carries the 'sequence' key, and the document is the raw code string.
        matching_programs.append(
            {'id': program_id, 'code': doc, 'similarity': similarity}
        )

    return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def create_vector(category, level, location, total_lines, parent_path):
    """Build the six-element feature vector used for queries.

    Per the original note, this is meant to match the parser's
    ``create_vector``.

    Args:
        category: Category name; names not in the table map to 0.
        level: Nesting level, copied into the vector unchanged.
        location: ``(start_line, end_line)`` pair.
        total_lines: Divisor used to scale the span and centre position.
        parent_path: Sequence of parent labels; any ``[...]`` suffix is
            ignored and the remainder is lower-cased before lookup.

    Returns:
        ``[category_id, level, center_pos, span, parent_depth, parent_weight]``
    """
    known_categories = (
        'import', 'function', 'async_function', 'class', 'if', 'while',
        'for', 'try', 'expression', 'spacer', 'other', 'elif', 'else',
        'except', 'finally', 'return', 'assigned_variable',
        'input_variable', 'returned_variable',
    )
    # IDs are the 1-based positions in the tuple above; 0 means "unknown".
    ids_by_name = {
        name: number for number, name in enumerate(known_categories, start=1)
    }

    def ancestor_term(rank, label):
        # Contribution of one parent: its category ID scaled by 1 / (rank + 1).
        base_name = label.split('[')[0].lower()
        return ids_by_name.get(base_name, 0) * (1 / (rank + 1))

    category_id = ids_by_name.get(category, 0)
    first_line, last_line = location
    span = (last_line - first_line + 1) / total_lines
    center_pos = ((first_line + last_line) / 2) / total_lines
    parent_depth = len(parent_path)
    weighted_total = sum(
        ancestor_term(rank, label) for rank, label in enumerate(parent_path)
    )
    parent_weight = weighted_total / max(1, len(ids_by_name))

    return [category_id, level, center_pos, span, parent_depth, parent_weight]

def is_subsequence(subseq, seq):
    """Return True when the items of subseq occur in seq in the same order.

    The items need not be adjacent in ``seq``. An empty ``subseq`` always
    matches.
    """
    remaining = iter(seq)
    for wanted in subseq:
        # Membership on an iterator consumes it up to and including the
        # match, so each later item is only searched for after this one.
        if wanted not in remaining:
            return False
    return True

if __name__ == "__main__":
    # Script entry point: create a client and load the sample programs.
    client = init_chromadb()
    populate_sample_db(client)