broadfield-dev committed · verified
Commit a4492a3 · 1 Parent(s): fa2db69

Update database.py

Files changed (1)
  1. database.py +171 -130
database.py CHANGED
@@ -8,118 +8,142 @@ from datasets import Dataset, load_dataset
 from transformers import AutoTokenizer, AutoModel
 import torch
 from dotenv import load_dotenv
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
 
 # Load environment variables
 load_dotenv()
 
-# User-configurable variables (no HF_KEY hardcoded here)
+# User-configurable variables
 DB_NAME = "python_programs"  # ChromaDB collection name
 HF_DATASET_NAME = "python_program_vectors"  # Hugging Face Dataset name
 PERSIST_DIR = "./chroma_data"  # Directory for persistent storage (optional)
 USE_GPU = False  # Default to CPU, set to True for GPU if available
 
 def init_chromadb(persist_dir=PERSIST_DIR):
-    """Initialize ChromaDB client, optionally with persistent storage."""
+    """Initialize ChromaDB client, optionally with persistent storage, with error handling."""
     try:
         # Use persistent storage if directory exists, otherwise in-memory
         if os.path.exists(persist_dir):
+            logger.info(f"Initializing ChromaDB with persistent storage at {persist_dir}")
             client = chromadb.PersistentClient(path=persist_dir)
         else:
+            logger.info("Initializing ChromaDB with in-memory storage")
             client = chromadb.Client()
         return client
     except Exception as e:
-        print(f"Error initializing ChromaDB: {e}")
-        return chromadb.Client()  # Fallback to in-memory
+        logger.error(f"Error initializing ChromaDB: {e}")
+        raise
 
 def create_collection(client, collection_name=DB_NAME):
-    """Create or get a ChromaDB collection for Python programs."""
+    """Create or get a ChromaDB collection for Python programs, with error handling."""
     try:
-        collection = client.get_collection(name=collection_name)
-    except:
-        collection = client.create_collection(name=collection_name)
-    return collection
+        collection = client.get_or_create_collection(name=collection_name)
+        logger.info(f"Using ChromaDB collection: {collection_name}")
+        return collection
+    except Exception as e:
+        logger.error(f"Error creating or getting collection {collection_name}: {e}")
+        raise
 
 def store_program(client, code, sequence, vectors, collection_name=DB_NAME):
-    """Store a program in ChromaDB with its code, sequence, and vectors."""
-    collection = create_collection(client, collection_name)
-
-    # Flatten vectors to ensure they are a list of numbers (ChromaDB expects flat embeddings)
-    # Use the first vector (semantic vector) for ChromaDB embedding
-    flattened_vectors = vectors[0] if vectors else [0] * 6  # Ensure 6D
-
-    # Store program data (ID, code, sequence, vectors)
-    program_id = str(hash(code))  # Use hash of code as ID for uniqueness
-    collection.add(
-        documents=[code],
-        metadatas=[{"sequence": ",".join(sequence), "description_tokens": " ".join(generate_description_tokens(sequence, vectors)), "program_vectors": str(vectors)}],
-        ids=[program_id],
-        embeddings=[flattened_vectors]  # Pass as 6D semantic vector
-    )
-    return program_id
+    """Store a program in ChromaDB with its code, sequence, and vectors, with error handling."""
+    try:
+        collection = create_collection(client, collection_name)
+
+        # Flatten vectors to ensure they are a list of numbers (ChromaDB expects flat embeddings)
+        # Use the first vector (semantic or program vector) for ChromaDB embedding, ensuring 6D
+        flattened_vectors = vectors[0] if vectors and len(vectors) > 0 and len(vectors[0]) == 6 else [0] * 6
+
+        # Store program data (ID, code, sequence, vectors)
+        program_id = str(hash(code))  # Use hash of code as ID for uniqueness
+        collection.add(
+            documents=[code],
+            metadatas=[{"sequence": ",".join(sequence), "description_tokens": " ".join(generate_description_tokens(sequence, vectors)), "program_vectors": str(vectors)}],
+            ids=[program_id],
+            embeddings=[flattened_vectors]  # Pass as 6D vector
+        )
+        logger.info(f"Stored program in ChromaDB: {program_id}")
+        return program_id
+    except Exception as e:
+        logger.error(f"Error storing program in ChromaDB: {e}")
+        raise
 
 def populate_sample_db(client):
-    """Populate ChromaDB with sample Python programs."""
-    samples = [
-        """
-import os
-def add_one(x):
-    y = x + 1
-    return y
-""",
-        """
-def multiply(a, b):
-    c = a * b
-    if c > 0:
-        return c
-"""
-    ]
-
-    for code in samples:
-        parts, sequence = parse_python_code(code)
-        vectors = [part['vector'] for part in parts]
-        store_program(client, code, sequence, vectors)
+    """Populate ChromaDB with sample Python programs, with logging."""
+    try:
+        samples = [
+            """
+import os
+def add_one(x):
+    y = x + 1
+    return y
+""",
+            """
+def multiply(a, b):
+    c = a * b
+    if c > 0:
+        return c
+"""
+        ]
+
+        for code in samples:
+            parts, sequence = parse_python_code(code)
+            vectors = [part['vector'] for part in parts]
+            store_program(client, code, sequence, vectors)
+        logger.info("Populated ChromaDB with sample programs")
+    except Exception as e:
+        logger.error(f"Error populating sample database: {e}")
+        raise
 
 def query_programs(client, operations, collection_name=DB_NAME, top_k=5, semantic_query=None):
-    """Query ChromaDB for programs matching the operations sequence or semantic description."""
-    collection = create_collection(client, collection_name)
-
-    if semantic_query:
-        # Semantic search using a 6D vector generated from the description
-        query_vector = generate_semantic_vector(semantic_query)
-        results = collection.query(
-            query_embeddings=[query_vector],
-            n_results=top_k,
-            include=["documents", "metadatas"]
-        )
-    else:
-        # Vector-based search for operations sequence
-        query_vector = sum([create_vector(op, 0, (1, 1), 100, []) for op in operations], []) / len(operations) if operations else [0] * 6
-        results = collection.query(
-            query_embeddings=[query_vector],
-            n_results=top_k,
-            include=["documents", "metadatas"]
-        )
-
-    # Process results
-    matching_programs = []
-    for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
-        sequence = meta['sequence'].split(',')
-        if not semantic_query or is_subsequence(operations, sequence):  # Ensure sequence match for operations
-            try:
-                # Reconstruct program vectors (flatten if needed)
-                doc_vectors = eval(meta['program_vectors']) if isinstance(meta['program_vectors'], str) else meta['program_vectors']
-                if isinstance(doc_vectors, (list, np.ndarray)) and len(doc_vectors) == 6:
-                    program_vector = doc_vectors  # Single flat vector
-                else:
-                    program_vector = np.mean([v for v in doc_vectors if isinstance(v, (list, np.ndarray))], axis=0).tolist()
-            except:
-                program_vector = [0] * 6  # Fallback for malformed vectors
-            # Use the semantic embedding for similarity
-            semantic_vector = eval(doc['vectors']) if isinstance(doc['vectors'], str) else doc['vectors']
-            similarity = cosine_similarity([query_vector], [semantic_vector])[0][0] if semantic_vector and query_vector else 0
-            matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity, 'description': meta.get('description_tokens', ''), 'program_vectors': meta.get('program_vectors', '[]')})
-
-    return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
+    """Query ChromaDB for programs matching the operations sequence or semantic description, with error handling."""
+    try:
+        collection = create_collection(client, collection_name)
+
+        if semantic_query:
+            # Semantic search using a 6D vector generated from the description
+            query_vector = generate_semantic_vector(semantic_query)
+            results = collection.query(
+                query_embeddings=[query_vector],
+                n_results=top_k,
+                include=["documents", "metadatas"]
+            )
+        else:
+            # Vector-based search for operations sequence
+            query_vector = sum([create_vector(op, 0, (1, 1), 100, []) for op in operations], []) / len(operations) if operations else [0] * 6
+            results = collection.query(
+                query_embeddings=[query_vector],
+                n_results=top_k,
+                include=["documents", "metadatas"]
+            )
+
+        # Process results
+        matching_programs = []
+        for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
+            sequence = meta['sequence'].split(',')
+            if not semantic_query or is_subsequence(operations, sequence):  # Ensure sequence match for operations
+                try:
+                    # Reconstruct program vectors (flatten if needed)
+                    doc_vectors = eval(meta['program_vectors']) if isinstance(meta['program_vectors'], str) else meta['program_vectors']
+                    if isinstance(doc_vectors, (list, np.ndarray)) and len(doc_vectors) == 6:
+                        program_vector = doc_vectors  # Single flat vector
+                    else:
+                        program_vector = np.mean([v for v in doc_vectors if isinstance(v, (list, np.ndarray))], axis=0).tolist()
+                except:
+                    program_vector = [0] * 6  # Fallback for malformed vectors
+                # Use the semantic embedding for similarity
+                semantic_vector = eval(doc['vectors']) if isinstance(doc['vectors'], str) else doc['vectors']
+                similarity = cosine_similarity([query_vector], [semantic_vector])[0][0] if semantic_vector and query_vector else 0
+                matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity, 'description': meta.get('description_tokens', ''), 'program_vectors': meta.get('program_vectors', '[]')})
+
+        logger.info(f"Queried {len(matching_programs)} programs from ChromaDB")
+        return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
+    except Exception as e:
+        logger.error(f"Error querying programs from ChromaDB: {e}")
+        raise
 
 def create_vector(category, level, location, total_lines, parent_path):
     """Helper to create a vector for query (matches parser's create_vector)."""
@@ -168,13 +192,11 @@ def generate_description_tokens(sequence, vectors):
         tokens.append(f"span:{vec[3]:.2f}")
     return tokens
 
-def generate_semantic_vector(description, total_lines=100, use_gpu=USE_GPU):
+def generate_semantic_vector(description, total_lines=100, use_gpu=False):
     """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
-    # Load CodeBERT model and tokenizer
-    model_name = "microsoft/codebert-base"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
-    model = AutoModel.from_pretrained(model_name).to(device)
+    global tokenizer, model, device
+    if tokenizer is None or model is None:
+        tokenizer, model, device = load_codebert_model(use_gpu)
 
     # Tokenize and encode the description
     inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
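
The rewritten `generate_semantic_vector` relies on module-level `tokenizer`, `model`, and `device` globals plus a `load_codebert_model(use_gpu)` helper that this diff does not show, so they presumably live elsewhere in `database.py`. A minimal sketch of what such a lazy loader could look like; only the names come from the diff, the body is an assumption:

```python
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer, model, device = None, None, None  # populated on first use

def load_codebert_model(use_gpu=False):
    """Hypothetical helper: load CodeBERT once so repeated calls reuse it."""
    dev = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
    tok = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    mdl = AutoModel.from_pretrained("microsoft/codebert-base").to(dev)
    mdl.eval()  # inference only
    return tok, mdl, dev
```

Compared with the old version, which re-instantiated CodeBERT on every call, caching the model is the main win of this hunk.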
@@ -192,54 +214,73 @@ def generate_semantic_vector(description, total_lines=100, use_gpu=USE_GPU):
     elif len(vector) > 6:
         vector = vector[:6]  # Truncate to 6D
 
+    # Ensure vector isn't all zeros or defaults
+    if all(v == 0 for v in vector):
+        logger.warning(f"Default vector detected for description: {description}")
+        # Fallback: Use heuristic if CodeBERT fails to generate meaningful embeddings
+        category_map = {
+            'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
+        }
+        tokens = description.lower().split()
+        vector = [0] * 6
+        for token in tokens:
+            for cat, cat_id in category_map.items():
+                if cat in token:
+                    vector[0] = cat_id  # category_id
+                    vector[1] = 1  # level
+                    vector[2] = 0.5  # center_pos
+                    vector[3] = 0.1  # span
+                    vector[4] = 1  # parent_depth
+                    vector[5] = cat_id / len(category_map)  # parent_weight
+                    break
+
+    logger.debug(f"Generated semantic vector for '{description}': {vector}")
     return vector
 
 def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
-    """Save ChromaDB data to Hugging Face Dataset."""
-    client = init_chromadb()
-    collection = create_collection(client)
-
-    # Fetch all data from ChromaDB
-    results = collection.get(include=["documents", "metadatas", "embeddings"])
-    data = {
-        "code": results["documents"],
-        "sequence": [meta["sequence"] for meta in results["metadatas"]],
-        "vectors": results["embeddings"],  # Semantic 6D vectors
-        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]],
-        "program_vectors": [eval(meta.get('program_vectors', '[]')) for meta in results["metadatas"]]  # Store structural vectors
-    }
-
-    # Create a Hugging Face Dataset
-    dataset = Dataset.from_dict(data)
-
-    # Push to Hugging Face Hub
-    dataset.push_to_hub(dataset_name, token=token)
-    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
+    """Save ChromaDB data to Hugging Face Dataset, with error handling."""
+    try:
+        client = init_chromadb()
+        collection = client.get_collection(DB_NAME)
+
+        # Fetch all data from ChromaDB
+        results = collection.get(include=["documents", "metadatas", "embeddings"])
+        data = {
+            "code": results["documents"],
+            "sequence": [meta["sequence"] for meta in results["metadatas"]],
+            "vectors": results["embeddings"],  # Semantic 6D vectors
+            "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]],
+            "program_vectors": [eval(meta.get('program_vectors', '[]')) for meta in results["metadatas"]]  # Store structural vectors
+        }
+
+        # Create a Hugging Face Dataset
+        dataset = Dataset.from_dict(data)
+
+        # Push to Hugging Face Hub
+        dataset.push_to_hub(dataset_name, token=token)
+        logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
+    except Exception as e:
+        logger.error(f"Error pushing dataset to Hugging Face Hub: {e}")
+        raise
 
 def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
-    """Load ChromaDB data from Hugging Face Dataset, handle empty dataset."""
+    """Load ChromaDB data from Hugging Face Dataset, handle empty dataset, with error handling."""
    try:
         dataset = load_dataset(dataset_name, split="train", token=token)
+        client = init_chromadb()
+        collection = create_collection(client)
+
+        for item in dataset:
+            store_program(client, item["code"], item["sequence"].split(','), item["program_vectors"])
+        logger.info(f"Loaded {len(dataset)} entries from Hugging Face Hub into ChromaDB")
+        return client
     except Exception as e:
-        print(f"Error loading dataset from Hugging Face: {e}. Populating with samples...")
+        logger.error(f"Error loading dataset from Hugging Face: {e}")
+        # Fallback: Create empty collection
         client = init_chromadb()
-        populate_sample_db(client)
-        save_chromadb_to_hf()  # Create and push a new dataset
-        return init_chromadb()
-
-    client = init_chromadb()
-    collection = create_collection(client)
-
-    for item in dataset:
-        collection.add(
-            documents=[item["code"]],
-            metadatas=[{"sequence": item["sequence"], "description_tokens": item["description_tokens"], "program_vectors": str(item["program_vectors"])}],
-            ids=[str(hash(item["code"]))],
-            embeddings=[item["vectors"]]  # Use semantic 6D vectors
-        )
-    return client
+        create_collection(client)
+        return client
 
 if __name__ == '__main__':
     client = load_chromadb_from_hf()
-    # Uncomment to save to Hugging Face
-    # save_chromadb_to_hf()
+    logger.info("Database initialized or loaded from Hugging Face Hub")
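
Review note: both `query_programs` and `save_chromadb_to_hf` still run `eval()` over the stringified `program_vectors` metadata, which executes arbitrary code if a stored string is malformed or malicious. `ast.literal_eval` is a drop-in replacement for parsing list literals — a sketch; `parse_program_vectors` is a hypothetical helper, not part of this commit:

```python
import ast

def parse_program_vectors(raw):
    """Parse stringified vector metadata without executing code."""
    if not isinstance(raw, str):
        return raw
    try:
        return ast.literal_eval(raw)  # accepts only Python literals
    except (ValueError, SyntaxError):
        return []  # malformed metadata falls back to an empty list
```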
 