Spaces:
Running
Running
Update database.py
Browse files- database.py +171 -130
database.py
CHANGED
@@ -8,118 +8,142 @@ from datasets import Dataset, load_dataset
|
|
8 |
from transformers import AutoTokenizer, AutoModel
|
9 |
import torch
|
10 |
from dotenv import load_dotenv
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
# Load environment variables
|
13 |
load_dotenv()
|
14 |
|
15 |
-
# User-configurable variables
|
16 |
DB_NAME = "python_programs" # ChromaDB collection name
|
17 |
HF_DATASET_NAME = "python_program_vectors" # Hugging Face Dataset name
|
18 |
PERSIST_DIR = "./chroma_data" # Directory for persistent storage (optional)
|
19 |
USE_GPU = False # Default to CPU, set to True for GPU if available
|
20 |
|
21 |
def init_chromadb(persist_dir=PERSIST_DIR):
|
22 |
-
"""Initialize ChromaDB client, optionally with persistent storage."""
|
23 |
try:
|
24 |
# Use persistent storage if directory exists, otherwise in-memory
|
25 |
if os.path.exists(persist_dir):
|
|
|
26 |
client = chromadb.PersistentClient(path=persist_dir)
|
27 |
else:
|
|
|
28 |
client = chromadb.Client()
|
29 |
return client
|
30 |
except Exception as e:
|
31 |
-
|
32 |
-
|
33 |
|
34 |
def create_collection(client, collection_name=DB_NAME):
|
35 |
-
"""Create or get a ChromaDB collection for Python programs."""
|
36 |
try:
|
37 |
-
collection = client.
|
38 |
-
|
39 |
-
collection
|
40 |
-
|
|
|
|
|
41 |
|
42 |
def store_program(client, code, sequence, vectors, collection_name=DB_NAME):
|
43 |
-
"""Store a program in ChromaDB with its code, sequence, and vectors."""
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
def populate_sample_db(client):
|
61 |
-
"""Populate ChromaDB with sample Python programs."""
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
def query_programs(client, operations, collection_name=DB_NAME, top_k=5, semantic_query=None):
|
83 |
-
"""Query ChromaDB for programs matching the operations sequence or semantic description."""
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
|
|
|
|
|
|
|
|
|
|
123 |
|
124 |
def create_vector(category, level, location, total_lines, parent_path):
|
125 |
"""Helper to create a vector for query (matches parser's create_vector)."""
|
@@ -168,13 +192,11 @@ def generate_description_tokens(sequence, vectors):
|
|
168 |
tokens.append(f"span:{vec[3]:.2f}")
|
169 |
return tokens
|
170 |
|
171 |
-
def generate_semantic_vector(description, total_lines=100, use_gpu=
|
172 |
"""Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
|
177 |
-
model = AutoModel.from_pretrained(model_name).to(device)
|
178 |
|
179 |
# Tokenize and encode the description
|
180 |
inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
|
@@ -192,54 +214,73 @@ def generate_semantic_vector(description, total_lines=100, use_gpu=USE_GPU):
|
|
192 |
elif len(vector) > 6:
|
193 |
vector = vector[:6] # Truncate to 6D
|
194 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
return vector
|
196 |
|
197 |
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
|
198 |
-
"""Save ChromaDB data to Hugging Face Dataset."""
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
|
|
|
|
|
|
|
|
218 |
|
219 |
def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
|
220 |
-
"""Load ChromaDB data from Hugging Face Dataset, handle empty dataset."""
|
221 |
try:
|
222 |
dataset = load_dataset(dataset_name, split="train", token=token)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
223 |
except Exception as e:
|
224 |
-
|
|
|
225 |
client = init_chromadb()
|
226 |
-
|
227 |
-
|
228 |
-
return init_chromadb()
|
229 |
-
|
230 |
-
client = init_chromadb()
|
231 |
-
collection = create_collection(client)
|
232 |
-
|
233 |
-
for item in dataset:
|
234 |
-
collection.add(
|
235 |
-
documents=[item["code"]],
|
236 |
-
metadatas=[{"sequence": item["sequence"], "description_tokens": item["description_tokens"], "program_vectors": str(item["program_vectors"])}],
|
237 |
-
ids=[str(hash(item["code"]))],
|
238 |
-
embeddings=[item["vectors"]] # Use semantic 6D vectors
|
239 |
-
)
|
240 |
-
return client
|
241 |
|
242 |
if __name__ == '__main__':
|
243 |
client = load_chromadb_from_hf()
|
244 |
-
|
245 |
-
# save_chromadb_to_hf()
|
|
|
8 |
from transformers import AutoTokenizer, AutoModel
|
9 |
import torch
|
10 |
from dotenv import load_dotenv
|
11 |
+
import logging
|
12 |
+
|
13 |
+
# Set up logging
|
14 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
15 |
+
logger = logging.getLogger(__name__)
|
16 |
|
17 |
# Load environment variables
|
18 |
load_dotenv()
|
19 |
|
20 |
+
# User-configurable variables
|
21 |
DB_NAME = "python_programs" # ChromaDB collection name
|
22 |
HF_DATASET_NAME = "python_program_vectors" # Hugging Face Dataset name
|
23 |
PERSIST_DIR = "./chroma_data" # Directory for persistent storage (optional)
|
24 |
USE_GPU = False # Default to CPU, set to True for GPU if available
|
25 |
|
26 |
def init_chromadb(persist_dir=PERSIST_DIR):
    """Return a ChromaDB client for ``persist_dir``.

    A ``chromadb.PersistentClient`` rooted at ``persist_dir`` is returned when
    that path already exists on disk; otherwise an in-memory
    ``chromadb.Client`` is returned. The directory is never created here.

    Raises:
        Exception: anything raised while building the client is logged and
            re-raised unchanged.
    """
    try:
        # Guard clause: no directory on disk means no persistence.
        if not os.path.exists(persist_dir):
            logger.info("Initializing ChromaDB with in-memory storage")
            return chromadb.Client()

        logger.info(f"Initializing ChromaDB with persistent storage at {persist_dir}")
        return chromadb.PersistentClient(path=persist_dir)
    except Exception as exc:
        logger.error(f"Error initializing ChromaDB: {exc}")
        raise
|
40 |
|
41 |
def create_collection(client, collection_name=DB_NAME):
    """Fetch the named ChromaDB collection, creating it when it is missing.

    Args:
        client: ChromaDB client exposing ``get_or_create_collection``.
        collection_name: name of the collection to open.

    Returns:
        The collection object returned by the client.

    Raises:
        Exception: any failure is logged and re-raised unchanged.
    """
    try:
        handle = client.get_or_create_collection(name=collection_name)
        logger.info(f"Using ChromaDB collection: {collection_name}")
        return handle
    except Exception as exc:
        logger.error(f"Error creating or getting collection {collection_name}: {exc}")
        raise
|
50 |
|
51 |
def store_program(client, code, sequence, vectors, collection_name=DB_NAME):
    """Store one program in ChromaDB and return its ID.

    The record consists of:
      * document: ``code``
      * embedding: ``vectors[0]`` when it has exactly six components,
        otherwise six zeros
      * metadata: ``sequence`` joined with commas, the description tokens
        from ``generate_description_tokens`` joined with spaces, and
        ``str(vectors)`` under ``program_vectors``

    Args:
        client: ChromaDB client.
        code: program source text; also the input to the ID digest.
        sequence: iterable of strings describing the program's operations.
        vectors: list of per-part vectors for the program.
        collection_name: target collection.

    Returns:
        The program ID, a SHA-256 hex digest of ``code``.

    Raises:
        Exception: any failure is logged and re-raised unchanged.
    """
    import hashlib  # local import so the block does not depend on file-level imports

    try:
        collection = create_collection(client, collection_name)

        # ChromaDB expects one flat embedding per record; use the first vector
        # only when it is 6D, otherwise fall back to a zero vector.
        has_6d_head = bool(vectors) and len(vectors[0]) == 6
        embedding = vectors[0] if has_6d_head else [0] * 6

        # The built-in hash() is salted per interpreter process for strings,
        # so it cannot serve as a persistent ID. A content digest is stable
        # across runs and machines.
        program_id = hashlib.sha256(code.encode("utf-8")).hexdigest()

        metadata = {
            "sequence": ",".join(sequence),
            "description_tokens": " ".join(generate_description_tokens(sequence, vectors)),
            "program_vectors": str(vectors),
        }

        # upsert keeps re-storing the same program idempotent now that IDs are
        # stable, instead of colliding on an existing ID.
        collection.upsert(
            documents=[code],
            metadatas=[metadata],
            ids=[program_id],
            embeddings=[embedding],  # Pass as 6D vector
        )
        logger.info(f"Stored program in ChromaDB: {program_id}")
        return program_id
    except Exception as e:
        logger.error(f"Error storing program in ChromaDB: {e}")
        raise
|
73 |
|
74 |
def populate_sample_db(client):
    """Populate ChromaDB with two small sample Python programs.

    Each sample is parsed with ``parse_python_code``, which is used here as
    returning ``(parts, sequence)`` where every part carries a ``'vector'``
    entry. The resulting vectors are stored through ``store_program``.

    Args:
        client: ChromaDB client passed through to ``store_program``.

    Raises:
        Exception: any failure is logged and re-raised unchanged.
    """
    import textwrap  # local import so the block does not depend on file-level imports

    try:
        # dedent() strips the indentation the literals pick up from being
        # embedded in this function, so each sample is valid module-level code.
        samples = [
            textwrap.dedent(
                """
                import os
                def add_one(x):
                    y = x + 1
                    return y
                """
            ),
            textwrap.dedent(
                """
                def multiply(a, b):
                    c = a * b
                    if c > 0:
                        return c
                """
            ),
        ]

        for code in samples:
            parts, sequence = parse_python_code(code)
            vectors = [part['vector'] for part in parts]
            store_program(client, code, sequence, vectors)
        logger.info("Populated ChromaDB with sample programs")
    except Exception as e:
        logger.error(f"Error populating sample database: {e}")
        raise
|
100 |
|
101 |
def query_programs(client, operations, collection_name=DB_NAME, top_k=5, semantic_query=None):
    """Query ChromaDB for programs by semantic description or operation sequence.

    When ``semantic_query`` is given, the query embedding comes from
    ``generate_semantic_vector``. Otherwise it is the element-wise mean of
    ``create_vector(op, 0, (1, 1), 100, [])`` over ``operations``, or six
    zeros when ``operations`` is empty.

    Args:
        client: ChromaDB client.
        operations: list of operation names.
        collection_name: collection to search.
        top_k: maximum number of nearest neighbours requested from ChromaDB.
        semantic_query: optional free-text description.

    Returns:
        A list of dicts with keys ``id``, ``code``, ``similarity``,
        ``description`` and ``program_vectors``, sorted by descending
        similarity. ``program_vectors`` is the raw metadata value.

    Raises:
        Exception: any failure is logged and re-raised unchanged.
    """
    try:
        collection = create_collection(client, collection_name)

        if semantic_query:
            # Semantic search using a 6D vector generated from the description
            query_vector = generate_semantic_vector(semantic_query)
        elif operations:
            # Average the per-operation vectors component by component.
            # Dividing a Python list by an int raises TypeError.
            per_op = [create_vector(op, 0, (1, 1), 100, []) for op in operations]
            query_vector = np.mean(per_op, axis=0).tolist()
        else:
            query_vector = [0] * 6

        # Embeddings are requested explicitly: a document is the code string
        # and carries no vector of its own.
        results = collection.query(
            query_embeddings=[query_vector],
            n_results=top_k,
            include=["documents", "metadatas", "embeddings"],
        )

        # One query embedding was sent, so each result field holds one row.
        rows = zip(
            results['ids'][0],
            results['documents'][0],
            results['metadatas'][0],
            results['embeddings'][0],
        )

        matching_programs = []
        for program_id, doc, meta, embedding in rows:
            sequence = meta['sequence'].split(',')
            # NOTE(review): the subsequence filter only applies when a semantic
            # query is given; pure operation queries are not filtered. This
            # keeps the existing condition. Confirm it is not inverted.
            if semantic_query and not is_subsequence(operations, sequence):
                continue

            # len() checks instead of truthiness: embeddings may be arrays,
            # whose truth value is ambiguous.
            if embedding is not None and len(embedding) and len(query_vector):
                similarity = cosine_similarity([query_vector], [embedding])[0][0]
            else:
                similarity = 0

            matching_programs.append({
                'id': program_id,  # the ID is a result field, not a metadata key
                'code': doc,
                'similarity': similarity,
                'description': meta.get('description_tokens', ''),
                'program_vectors': meta.get('program_vectors', '[]'),
            })

        logger.info(f"Queried {len(matching_programs)} programs from ChromaDB")
        return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
    except Exception as e:
        logger.error(f"Error querying programs from ChromaDB: {e}")
        raise
|
147 |
|
148 |
def create_vector(category, level, location, total_lines, parent_path):
|
149 |
"""Helper to create a vector for query (matches parser's create_vector)."""
|
|
|
192 |
tokens.append(f"span:{vec[3]:.2f}")
|
193 |
return tokens
|
194 |
|
195 |
+
def generate_semantic_vector(description, total_lines=100, use_gpu=False):
|
196 |
"""Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
|
197 |
+
global tokenizer, model, device
|
198 |
+
if tokenizer is None or model is None:
|
199 |
+
tokenizer, model, device = load_codebert_model(use_gpu)
|
|
|
|
|
200 |
|
201 |
# Tokenize and encode the description
|
202 |
inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
|
|
|
214 |
elif len(vector) > 6:
|
215 |
vector = vector[:6] # Truncate to 6D
|
216 |
|
217 |
+
# Ensure vector isn’t all zeros or defaults
|
218 |
+
if all(v == 0 for v in vector):
|
219 |
+
logger.warning(f"Default vector detected for description: {description}")
|
220 |
+
# Fallback: Use heuristic if CodeBERT fails to generate meaningful embeddings
|
221 |
+
category_map = {
|
222 |
+
'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
|
223 |
+
}
|
224 |
+
tokens = description.lower().split()
|
225 |
+
vector = [0] * 6
|
226 |
+
for token in tokens:
|
227 |
+
for cat, cat_id in category_map.items():
|
228 |
+
if cat in token:
|
229 |
+
vector[0] = cat_id # category_id
|
230 |
+
vector[1] = 1 # level
|
231 |
+
vector[2] = 0.5 # center_pos
|
232 |
+
vector[3] = 0.1 # span
|
233 |
+
vector[4] = 1 # parent_depth
|
234 |
+
vector[5] = cat_id / len(category_map) # parent_weight
|
235 |
+
break
|
236 |
+
|
237 |
+
logger.debug(f"Generated semantic vector for '{description}': {vector}")
|
238 |
return vector
|
239 |
|
240 |
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
    """Export the ``DB_NAME`` collection to a Hugging Face Dataset and push it.

    The dataset has the columns ``code``, ``sequence``, ``vectors`` (the
    stored embeddings), ``description_tokens`` and ``program_vectors``
    (parsed back from the metadata string).

    Args:
        dataset_name: Hub repository name to push to.
        token: Hub access token. The default is read from the ``HF_KEY``
            environment variable once, when the function is defined.

    Raises:
        Exception: any failure, including a missing collection or a metadata
            string that is not a plain literal, is logged and re-raised.
    """
    import ast  # local import so the block does not depend on file-level imports

    try:
        client = init_chromadb()
        collection = client.get_collection(DB_NAME)

        # Fetch all data from ChromaDB
        results = collection.get(include=["documents", "metadatas", "embeddings"])
        metadatas = results["metadatas"]

        # SECURITY: this used eval() on metadata strings. The store can be
        # filled from a downloaded dataset, so that would execute arbitrary
        # code. literal_eval only accepts Python literals.
        program_vectors = [
            ast.literal_eval(meta.get('program_vectors', '[]')) for meta in metadatas
        ]

        data = {
            "code": results["documents"],
            "sequence": [meta["sequence"] for meta in metadatas],
            "vectors": results["embeddings"],  # Semantic 6D vectors
            "description_tokens": [meta.get('description_tokens', '') for meta in metadatas],
            "program_vectors": program_vectors,  # Store structural vectors
        }

        # Create a Hugging Face Dataset
        dataset = Dataset.from_dict(data)

        # Push to Hugging Face Hub
        dataset.push_to_hub(dataset_name, token=token)
        logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
    except Exception as e:
        logger.error(f"Error pushing dataset to Hugging Face Hub: {e}")
        raise
|
265 |
|
266 |
def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
    """Build a ChromaDB client populated from a Hugging Face Dataset.

    Every row of the ``train`` split is stored through ``store_program``,
    using its ``code``, comma-separated ``sequence`` and ``program_vectors``
    fields. If any step fails, the error is logged and a fresh client with an
    empty collection is returned instead; nothing is re-raised.

    Args:
        dataset_name: Hub dataset to load.
        token: Hub access token. The default is read from the ``HF_KEY``
            environment variable once, when the function is defined.

    Returns:
        A ChromaDB client.
    """
    try:
        records = load_dataset(dataset_name, split="train", token=token)
        db = init_chromadb()
        # Make sure the collection exists even when the dataset has no rows.
        create_collection(db)

        for record in records:
            store_program(
                db,
                record["code"],
                record["sequence"].split(','),
                record["program_vectors"],
            )
        logger.info(f"Loaded {len(records)} entries from Hugging Face Hub into ChromaDB")
        return db
    except Exception as exc:
        logger.error(f"Error loading dataset from Hugging Face: {exc}")
        # Fallback: Create empty collection
        db = init_chromadb()
        create_collection(db)
        return db
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
|
284 |
if __name__ == "__main__":
    # Script entry point: build the client from the Hub dataset.
    # load_chromadb_from_hf falls back to an empty collection on failure.
    client = load_chromadb_from_hf()
    logger.info("Database initialized or loaded from Hugging Face Hub")
|
|