Spaces:
Running
Running
# database.py
import hashlib
import os
import textwrap

import chromadb

from parser import parse_python_code
def init_chromadb():
    """Create and return a ChromaDB client.

    The client is built with ``chromadb.Client()`` and no arguments
    (in-memory for now; persisting to disk is a possible later change).
    """
    return chromadb.Client()
def create_collection(client, collection_name="python_programs"):
    """Return the named collection, creating it if it does not exist yet.

    Args:
        client: ChromaDB client exposing ``get_or_create_collection``.
        collection_name: Name of the collection to fetch or create.

    Returns:
        The collection object handed back by the client.
    """
    # One get-or-create call replaces the earlier get / bare-except / create
    # sequence. The bare ``except:`` swallowed every exception (including
    # KeyboardInterrupt) and turned unrelated failures into a create attempt.
    return client.get_or_create_collection(name=collection_name)
def store_program(client, code, sequence, vectors, collection_name="python_programs"):
    """Store one program in the collection and return its ID.

    Args:
        client: ChromaDB client used to obtain the collection.
        code: Source text of the program; stored as the document.
        sequence: Iterable of strings; stored comma-joined under the
            ``"sequence"`` metadata key.
        vectors: Value stored as the embedding for this program.
        collection_name: Name of the target collection.

    Returns:
        The program ID: the hexadecimal SHA-256 digest of ``code``.
    """
    collection = create_collection(client, collection_name)

    # The built-in hash() of a str is salted per interpreter process, so it
    # yields a different ID for the same code on every run. A content digest
    # is stable across runs and machines.
    program_id = hashlib.sha256(code.encode("utf-8")).hexdigest()

    collection.add(
        documents=[code],
        metadatas=[{"sequence": ",".join(sequence)}],
        ids=[program_id],
        # NOTE(review): ``vectors`` is passed through unchanged as the single
        # embedding for this document. If callers pass a list of per-part
        # vectors, confirm the collection accepts that shape.
        embeddings=[vectors],
    )
    return program_id
def populate_sample_db(client):
    """Parse a few sample programs and store them through ``client``.

    Each sample is run through ``parse_python_code``; the ``'vector'`` entry
    of every returned part is collected and handed to ``store_program``
    together with the code and its sequence.

    Args:
        client: ChromaDB client the samples are stored with.
    """
    # The snippets are dedented so that each one is valid, top-level Python
    # with properly indented function bodies. The leading and trailing
    # newlines of the literals are kept.
    samples = [
        textwrap.dedent(
            """
            import os
            def add_one(x):
                y = x + 1
                return y
            """
        ),
        textwrap.dedent(
            """
            def multiply(a, b):
                c = a * b
                if c > 0:
                    return c
            """
        ),
    ]
    for code in samples:
        parts, sequence = parse_python_code(code)
        vectors = [part['vector'] for part in parts]
        store_program(client, code, sequence, vectors)
def query_programs(client, operations, collection_name="python_programs", top_k=5):
    """Query the database for programs matching the operations sequence.

    Args:
        client: ChromaDB client used to obtain the collection.
        operations: Sequence of operation/category names to look for.
        collection_name: Name of the collection to search.
        top_k: Maximum number of nearest neighbours to request.

    Returns:
        A list of dicts with keys ``'id'``, ``'code'`` and ``'similarity'``,
        sorted by descending similarity. Only programs whose stored sequence
        contains ``operations`` as a subsequence are included.
    """
    collection = create_collection(client, collection_name)

    # Query vector = element-wise mean of the per-operation vectors.
    # (Summing lists with ``sum(..., [])`` concatenates them, and dividing the
    # resulting list by an int raises TypeError.)
    if operations:
        op_vectors = [create_vector(op, 0, (1, 1), 100, []) for op in operations]
        query_vector = [sum(column) / len(op_vectors) for column in zip(*op_vectors)]
    else:
        query_vector = [0, 0, 0, 0, 0, 0]

    # Ask for the stored embeddings as well, so similarity is computed from
    # them instead of eval()-ing text out of the document.
    results = collection.query(
        query_embeddings=[query_vector],
        n_results=top_k,
        include=["documents", "metadatas", "embeddings"],
    )

    matching_programs = []
    rows = zip(
        results['ids'][0],
        results['documents'][0],
        results['metadatas'][0],
        results['embeddings'][0],
    )
    for program_id, doc, meta, embedding in rows:
        sequence = meta['sequence'].split(',')
        if not is_subsequence(operations, sequence):
            continue

        program_vector = np.asarray(embedding, dtype=float)
        if program_vector.size == 0:
            # No stored vector: compare against a zero vector.
            program_vector = np.zeros(len(query_vector))
        elif program_vector.ndim > 1:
            # Several per-part vectors: collapse to their mean.
            program_vector = program_vector.mean(axis=0)

        similarity = cosine_similarity([query_vector], [program_vector])[0][0]
        # The ID comes from the query result; the stored metadata only holds
        # the 'sequence' key.
        matching_programs.append(
            {'id': program_id, 'code': doc, 'similarity': float(similarity)}
        )

    return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
from sklearn.metrics.pairwise import cosine_similarity | |
import numpy as np | |
def create_vector(category, level, location, total_lines, parent_path):
    """Build the six-element feature vector used for queries.

    Args:
        category: Category name; unknown names map to 0.
        level: Value placed unchanged in the second slot.
        location: ``(start_line, end_line)`` pair.
        total_lines: Divisor used to normalise span and centre position.
        parent_path: Sequence of parent labels; the text before the first
            ``'['`` of each label is lower-cased and looked up as a category.

    Returns:
        ``[category_id, level, center_pos, span, parent_depth, parent_weight]``.
    """
    # Category IDs are the 1-based positions in this tuple.
    names = (
        'import', 'function', 'async_function', 'class',
        'if', 'while', 'for', 'try', 'expression', 'spacer',
        'other', 'elif', 'else', 'except', 'finally', 'return',
        'assigned_variable', 'input_variable', 'returned_variable',
    )
    category_map = {name: code for code, name in enumerate(names, start=1)}

    category_id = category_map.get(category, 0)
    first_line, last_line = location
    span = (last_line - first_line + 1) / total_lines
    center_pos = ((first_line + last_line) / 2) / total_lines
    parent_depth = len(parent_path)

    # Each ancestor contributes its category ID scaled by 1/rank, where the
    # first entry of parent_path has rank 1.
    weighted_total = sum(
        category_map.get(ancestor.split('[')[0].lower(), 0) * (1 / rank)
        for rank, ancestor in enumerate(parent_path, start=1)
    )
    parent_weight = weighted_total / max(1, len(category_map))

    return [category_id, level, center_pos, span, parent_depth, parent_weight]
def is_subsequence(subseq, seq):
    """Check if subseq is a subsequence of seq.

    Items of ``subseq`` must appear in ``seq`` in the same order, though not
    necessarily next to each other. An empty ``subseq`` always matches.
    """
    remaining = iter(seq)
    for wanted in subseq:
        # Consume ``remaining`` up to and including the next match, so later
        # items can only match further along in ``seq``.
        for candidate in remaining:
            if candidate is wanted or candidate == wanted:
                break
        else:
            return False
    return True
if __name__ == '__main__':
    # Script entry point: create a client and load the sample programs into it.
    populate_sample_db(init_chromadb())