# database.py
"""ChromaDB storage and lookup helpers for parsed Python programs.

Each program is stored with its source code as the document, its category
sequence and description tokens as metadata, and a fixed-size
``VECTOR_DIM``-long embedding obtained by averaging the per-part vectors.
"""
import hashlib
import os

import chromadb
import numpy as np
import torch
from datasets import Dataset, load_dataset
from dotenv import load_dotenv
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

# NOTE: ``create_vector`` is redefined further down in this module; the local
# definition shadows the one imported here.
from parser import parse_python_code, create_vector

# Load environment variables
load_dotenv()

# User-configurable variables (no HF_KEY hardcoded here)
DB_NAME = "python_programs"  # ChromaDB collection name
HF_DATASET_NAME = "python_program_vectors"  # Hugging Face Dataset name
PERSIST_DIR = "./chroma_data"  # Directory for persistent storage (optional)
USE_GPU = False  # Default to CPU, set to True for GPU if available

# Length of every part vector produced by create_vector(), and therefore of
# every embedding stored in / queried against the collection.
VECTOR_DIM = 6

# Numeric id for each code-part category used as the first vector component.
_CATEGORY_IDS = {
    'import': 1,
    'function': 2,
    'async_function': 3,
    'class': 4,
    'if': 5,
    'while': 6,
    'for': 7,
    'try': 8,
    'expression': 9,
    'spacer': 10,
    'other': 11,
    'elif': 12,
    'else': 13,
    'except': 14,
    'finally': 15,
    'return': 16,
    'assigned_variable': 17,
    'input_variable': 18,
    'returned_variable': 19,
}

# Sample programs used to seed an empty database.
_SAMPLE_PROGRAMS = (
    """
import os
def add_one(x):
    y = x + 1
    return y
""",
    """
def multiply(a, b):
    c = a * b
    if c > 0:
        return c
""",
)


def _program_id(code):
    """Return a stable ID for *code*.

    The built-in ``hash()`` of a string is salted per interpreter process, so
    it cannot be used as a persistent key; a SHA-256 digest is deterministic.
    """
    return hashlib.sha256(code.encode("utf-8")).hexdigest()


def _pool_vectors(vectors):
    """Collapse part vectors into one ``VECTOR_DIM``-long embedding.

    Accepts either a nested sequence of ``VECTOR_DIM``-long vectors or a flat
    sequence whose length is a multiple of ``VECTOR_DIM`` and returns the
    element-wise mean as a list of floats. Empty or malformed input yields a
    zero vector.
    """
    zero = [0.0] * VECTOR_DIM
    try:
        arr = np.asarray(vectors, dtype=float)
    except (TypeError, ValueError):
        return zero
    if arr.size == 0 or arr.size % VECTOR_DIM != 0:
        return zero
    return arr.reshape(-1, VECTOR_DIM).mean(axis=0).tolist()


def init_chromadb(persist_dir=PERSIST_DIR):
    """Initialize a ChromaDB client, optionally with persistent storage.

    Args:
        persist_dir: Directory holding persistent data. It is used only when
            it already exists; otherwise an in-memory client is created.

    Returns:
        A ChromaDB client. If the persistent client cannot be initialised the
        error is printed and an in-memory client is returned instead.
    """
    try:
        if os.path.exists(persist_dir):
            return chromadb.PersistentClient(path=persist_dir)
        return chromadb.Client()
    except Exception as e:
        print(f"Error initializing ChromaDB: {e}")
        return chromadb.Client()  # Fallback to in-memory


def create_collection(client, collection_name=DB_NAME):
    """Return the named ChromaDB collection, creating it if it does not exist."""
    return client.get_or_create_collection(name=collection_name)


def store_program(client, code, sequence, vectors, collection_name=DB_NAME):
    """Store a program in ChromaDB with its code, sequence, and embedding.

    Args:
        client: ChromaDB client.
        code: Program source; stored as the document.
        sequence: Category names of the program's parts, in order.
        vectors: One ``VECTOR_DIM``-long vector per part.
        collection_name: Target collection.

    Returns:
        The ID under which the program was stored (SHA-256 of *code*).
    """
    collection = create_collection(client, collection_name)

    # A collection requires every embedding to have the same dimension, and
    # queries use VECTOR_DIM-long vectors, so the per-part vectors are
    # mean-pooled instead of concatenated.
    embedding = _pool_vectors(vectors)

    program_id = _program_id(code)
    metadata = {
        "sequence": ",".join(sequence),
        "description_tokens": " ".join(generate_description_tokens(sequence, vectors)),
    }
    # upsert so that storing the same program twice replaces the old entry.
    collection.upsert(
        documents=[code],
        metadatas=[metadata],
        ids=[program_id],
        embeddings=[embedding],
    )
    return program_id


def populate_sample_db(client):
    """Populate ChromaDB with sample Python programs."""
    for code in _SAMPLE_PROGRAMS:
        parts, sequence = parse_python_code(code)
        vectors = [part['vector'] for part in parts]
        store_program(client, code, sequence, vectors)


def query_programs(client, operations, collection_name=DB_NAME, top_k=5, semantic_query=None):
    """Query ChromaDB for programs by operation sequence or textual description.

    Args:
        client: ChromaDB client.
        operations: Category names to search for; may be empty or ``None``.
        collection_name: Collection to search.
        top_k: Maximum number of results to request.
        semantic_query: Optional free-text description. When given, the query
            vector is derived from it instead of from *operations*.

    Returns:
        A list of dicts with keys ``id``, ``code``, ``similarity`` and
        ``description``, sorted by descending cosine similarity.
    """
    collection = create_collection(client, collection_name)
    operations = list(operations or [])

    if semantic_query:
        query_vector = generate_semantic_vector(semantic_query)
    elif operations:
        # Element-wise mean of one vector per requested operation.
        query_vector = np.mean(
            [create_vector(op, 0, (1, 1), 100, []) for op in operations],
            axis=0,
        ).tolist()
    else:
        query_vector = [0] * VECTOR_DIM
    query_vector = [float(value) for value in query_vector]

    stored = collection.count()
    if stored == 0 or top_k <= 0:
        return []

    results = collection.query(
        query_embeddings=[query_vector],
        n_results=min(top_k, stored),
        include=["documents", "metadatas", "embeddings"],
    )

    matching_programs = []
    rows = zip(
        results['ids'][0],
        results['documents'][0],
        results['metadatas'][0],
        results['embeddings'][0],
    )
    for program_id, doc, meta, embedding in rows:
        meta = meta or {}
        sequence = meta.get('sequence', '').split(',')
        # NOTE(review): as in the original, the subsequence filter is applied
        # only when a semantic query is given; operation-only searches are
        # ranked by vector similarity alone. Confirm this is the intent.
        if semantic_query and not is_subsequence(operations, sequence):
            continue
        program_vector = _pool_vectors(embedding)
        similarity = float(cosine_similarity([query_vector], [program_vector])[0][0])
        matching_programs.append({
            'id': program_id,
            'code': doc,
            'similarity': similarity,
            'description': meta.get('description_tokens', ''),
        })

    return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)


def create_vector(category, level, location, total_lines, parent_path):
    """Helper to create a vector for query (matches parser's create_vector).

    Args:
        category: Category name; unknown names map to id 0.
        level: Nesting level, stored unchanged.
        location: ``(start_line, end_line)`` tuple.
        total_lines: Number of lines in the program; values below 1 are
            treated as 1 to avoid division by zero.
        parent_path: Sequence of parent labels such as ``"Function[1]"``.

    Returns:
        ``[category_id, level, center_pos, span, parent_depth, parent_weight]``.
    """
    category_id = _CATEGORY_IDS.get(category, 0)
    start_line, end_line = location
    line_count = total_lines if total_lines else 1
    span = (end_line - start_line + 1) / line_count
    center_pos = ((start_line + end_line) / 2) / line_count
    parent_depth = len(parent_path)
    # Each ancestor contributes its category id, weighted by 1 / (position + 1).
    parent_weight = sum(
        _CATEGORY_IDS.get(parent.split('[')[0].lower(), 0) * (1 / (i + 1))
        for i, parent in enumerate(parent_path)
    ) / max(1, len(_CATEGORY_IDS))
    return [category_id, level, center_pos, span, parent_depth, parent_weight]


def is_subsequence(subseq, seq):
    """Check if subseq is a subsequence of seq (order kept, gaps allowed)."""
    # ``in`` on an iterator consumes it up to the match, which enforces order.
    remaining = iter(seq)
    return all(item in remaining for item in subseq)


def generate_description_tokens(sequence, vectors):
    """Generate description tokens for a program from its sequence and vectors.

    For every part whose category has a known description, three tokens are
    emitted: ``"<description>:<category>"``, ``"level:<vec[1]>"`` and
    ``"span:<vec[3]>"`` (two decimals). Other categories are skipped.
    """
    category_descriptions = {
        'import': 'imports module',
        'function': 'defines function',
        'assigned_variable': 'assigns variable',
        'input_variable': 'input parameter',
        'returned_variable': 'returns value',
        'if': 'conditional statement',
        'return': 'returns result',
        'try': 'try block',
        'except': 'exception handler',
        'expression': 'expression statement',
        'spacer': 'empty line or comment',
    }
    tokens = []
    for cat, vec in zip(sequence, vectors):
        if cat not in category_descriptions:
            continue
        tokens.append(f"{category_descriptions[cat]}:{cat}")
        # Add vector-derived features (level, span) as tokens
        tokens.append(f"level:{vec[1]}")
        tokens.append(f"span:{vec[3]:.2f}")
    return tokens


def generate_semantic_vector(description, total_lines=100):
    """Generate a 6D vector for a textual description, matching our vector format.

    The description is lower-cased and split on whitespace. For each word, the
    first keyword contained in it (substring match, in dict order) sets the
    vector; later matching words overwrite earlier ones. A description with no
    matching word yields a zero vector. *total_lines* is accepted but unused.
    """
    category_map = {
        'import': 1,
        'function': 2,
        'assign': 17,
        'input': 18,
        'return': 19,
        'if': 5,
        'try': 8,
        'except': 14,
    }

    vector = [0] * VECTOR_DIM
    for token in description.lower().split():
        for cat, cat_id in category_map.items():
            if cat in token:
                vector = [
                    cat_id,                        # category_id
                    1,                             # level (assume top-level)
                    0.5,                           # center_pos (midpoint of code)
                    0.1,                           # span (small for simplicity)
                    1,                             # parent_depth (shallow)
                    cat_id / len(category_map),    # parent_weight (normalized)
                ]
                break
    return vector


def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY"), client=None):
    """Save ChromaDB data to a Hugging Face Dataset.

    Args:
        dataset_name: Name of the dataset to push.
        token: Hugging Face token. The default is read from ``HF_KEY`` when
            this module is imported (after ``load_dotenv()`` has run).
        client: ChromaDB client to export from. When omitted, a new client is
            created with ``init_chromadb()``.
    """
    if client is None:
        client = init_chromadb()
    collection = create_collection(client)

    # Fetch all data from ChromaDB
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    metadatas = [meta or {} for meta in (results["metadatas"] or [])]
    embeddings = results.get("embeddings")
    if embeddings is None:
        embeddings = []
    data = {
        "code": results["documents"],
        "sequence": [meta.get("sequence", "") for meta in metadatas],
        # Plain float lists, whatever container type the client returned.
        "vectors": [[float(value) for value in embedding] for embedding in embeddings],
        "description_tokens": [meta.get('description_tokens', '') for meta in metadatas],
    }

    # Create a Hugging Face Dataset and push it to the Hub
    dataset = Dataset.from_dict(data)
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")


def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
    """Load ChromaDB data from a Hugging Face Dataset.

    If the dataset cannot be loaded, the database is seeded with the sample
    programs and an attempt is made to push them as a new dataset; a failed
    push is reported but does not prevent the populated client from being
    returned.

    Returns:
        The ChromaDB client holding the loaded (or sample) programs.
    """
    client = init_chromadb()
    try:
        dataset = load_dataset(dataset_name, split="train", token=token)
    except Exception as e:
        print(f"Error loading dataset from Hugging Face: {e}. Populating with samples...")
        populate_sample_db(client)
        try:
            # Export from the client that was just populated.
            save_chromadb_to_hf(dataset_name=dataset_name, token=token, client=client)
        except Exception as push_error:
            print(f"Error pushing dataset to Hugging Face: {push_error}")
        return client

    collection = create_collection(client)
    for item in dataset:
        code = item["code"]
        collection.upsert(
            documents=[code],
            metadatas=[{
                "sequence": item.get("sequence") or "",
                "description_tokens": item.get("description_tokens") or "",
            }],
            ids=[_program_id(code)],
            # Pools legacy flattened vectors down to VECTOR_DIM as well.
            embeddings=[_pool_vectors(item["vectors"])],
        )
    return client


if __name__ == '__main__':
    client = load_chromadb_from_hf()
    # Uncomment to save to Hugging Face
    # save_chromadb_to_hf()