broadfield-dev committed on
Commit
275730d
·
verified ·
1 Parent(s): 506d255

Update database.py

Browse files
Files changed (1) hide show
  1. database.py +15 -36
database.py CHANGED
@@ -44,15 +44,16 @@ def store_program(client, code, sequence, vectors, collection_name=DB_NAME):
44
  collection = create_collection(client, collection_name)
45
 
46
  # Flatten vectors to ensure they are a list of numbers (ChromaDB expects flat embeddings)
47
- flattened_vectors = [item for sublist in vectors for item in sublist]
 
48
 
49
  # Store program data (ID, code, sequence, vectors)
50
  program_id = str(hash(code)) # Use hash of code as ID for uniqueness
51
  collection.add(
52
  documents=[code],
53
- metadatas=[{"sequence": ",".join(sequence), "description_tokens": " ".join(generate_description_tokens(sequence, vectors))}],
54
  ids=[program_id],
55
- embeddings=[flattened_vectors] # Pass as flat list
56
  )
57
  return program_id
58
 
@@ -106,15 +107,17 @@ def query_programs(client, operations, collection_name=DB_NAME, top_k=5, semanti
106
  if not semantic_query or is_subsequence(operations, sequence): # Ensure sequence match for operations
107
  try:
108
  # Reconstruct program vectors (flatten if needed)
109
- doc_vectors = eval(doc['vectors']) if isinstance(doc['vectors'], str) else doc['vectors']
110
  if isinstance(doc_vectors, (list, np.ndarray)) and len(doc_vectors) == 6:
111
  program_vector = doc_vectors # Single flat vector
112
  else:
113
  program_vector = np.mean([v for v in doc_vectors if isinstance(v, (list, np.ndarray))], axis=0).tolist()
114
  except:
115
  program_vector = [0] * 6 # Fallback for malformed vectors
116
- similarity = cosine_similarity([query_vector], [program_vector])[0][0] if program_vector and query_vector else 0
117
- matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity, 'description': meta.get('description_tokens', '')})
 
 
118
 
119
  return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
120
 
@@ -165,32 +168,7 @@ def generate_description_tokens(sequence, vectors):
165
  tokens.append(f"span:{vec[3]:.2f}")
166
  return tokens
167
 
168
- def generate_semantic_vector_og(description, total_lines=100):
169
- """Generate a 6D semantic vector for a textual description, matching our vector format."""
170
- # Use a simplified heuristic to map description to our 6D vector format
171
- category_map = {
172
- 'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
173
- }
174
-
175
- # Parse description for key terms
176
- tokens = description.lower().split()
177
- vector = [0] * 6 # Initialize 6D vector
178
-
179
- # Map description tokens to categories and assign basic vector values
180
- for token in tokens:
181
- for cat, cat_id in category_map.items():
182
- if cat in token:
183
- vector[0] = cat_id # category_id
184
- vector[1] = 1 # level (assume top-level for simplicity)
185
- vector[2] = 0.5 # center_pos (midpoint of code)
186
- vector[3] = 0.1 # span (small for simplicity)
187
- vector[4] = 1 # parent_depth (shallow)
188
- vector[5] = cat_id / len(category_map) # parent_weight (normalized)
189
- break
190
-
191
- return vector
192
-
193
- def generate_semantic_vector(description, total_lines=100, use_gpu=False):
194
  """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
195
  # Load CodeBERT model and tokenizer
196
  model_name = "microsoft/codebert-base"
@@ -226,8 +204,9 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY"))
226
  data = {
227
  "code": results["documents"],
228
  "sequence": [meta["sequence"] for meta in results["metadatas"]],
229
- "vectors": results["embeddings"], # ChromaDB already flattens embeddings
230
- "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
 
231
  }
232
 
233
  # Create a Hugging Face Dataset
@@ -254,9 +233,9 @@ def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY"
254
  for item in dataset:
255
  collection.add(
256
  documents=[item["code"]],
257
- metadatas=[{"sequence": item["sequence"], "description_tokens": item["description_tokens"]}],
258
  ids=[str(hash(item["code"]))],
259
- embeddings=[item["vectors"]]
260
  )
261
  return client
262
 
 
44
  collection = create_collection(client, collection_name)
45
 
46
  # Flatten vectors to ensure they are a list of numbers (ChromaDB expects flat embeddings)
47
+ # Use the first vector (semantic vector) for ChromaDB embedding
48
+ flattened_vectors = vectors[0] if vectors else [0] * 6 # Ensure 6D
49
 
50
  # Store program data (ID, code, sequence, vectors)
51
  program_id = str(hash(code)) # Use hash of code as ID for uniqueness
52
  collection.add(
53
  documents=[code],
54
+ metadatas=[{"sequence": ",".join(sequence), "description_tokens": " ".join(generate_description_tokens(sequence, vectors)), "program_vectors": str(vectors)}],
55
  ids=[program_id],
56
+ embeddings=[flattened_vectors] # Pass as 6D semantic vector
57
  )
58
  return program_id
59
 
 
107
  if not semantic_query or is_subsequence(operations, sequence): # Ensure sequence match for operations
108
  try:
109
  # Reconstruct program vectors (flatten if needed)
110
+ doc_vectors = eval(meta['program_vectors']) if isinstance(meta['program_vectors'], str) else meta['program_vectors']
111
  if isinstance(doc_vectors, (list, np.ndarray)) and len(doc_vectors) == 6:
112
  program_vector = doc_vectors # Single flat vector
113
  else:
114
  program_vector = np.mean([v for v in doc_vectors if isinstance(v, (list, np.ndarray))], axis=0).tolist()
115
  except:
116
  program_vector = [0] * 6 # Fallback for malformed vectors
117
+ # Use the semantic embedding for similarity
118
+ semantic_vector = eval(doc['vectors']) if isinstance(doc['vectors'], str) else doc['vectors']
119
+ similarity = cosine_similarity([query_vector], [semantic_vector])[0][0] if semantic_vector and query_vector else 0
120
+ matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity, 'description': meta.get('description_tokens', ''), 'program_vectors': meta.get('program_vectors', '[]')})
121
 
122
  return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
123
 
 
168
  tokens.append(f"span:{vec[3]:.2f}")
169
  return tokens
170
 
171
+ def generate_semantic_vector(description, total_lines=100, use_gpu=USE_GPU):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
173
  # Load CodeBERT model and tokenizer
174
  model_name = "microsoft/codebert-base"
 
204
  data = {
205
  "code": results["documents"],
206
  "sequence": [meta["sequence"] for meta in results["metadatas"]],
207
+ "vectors": results["embeddings"], # Semantic 6D vectors
208
+ "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]],
209
+ "program_vectors": [eval(meta.get('program_vectors', '[]')) for meta in results["metadatas"]] # Store structural vectors
210
  }
211
 
212
  # Create a Hugging Face Dataset
 
233
  for item in dataset:
234
  collection.add(
235
  documents=[item["code"]],
236
+ metadatas=[{"sequence": item["sequence"], "description_tokens": item["description_tokens"], "program_vectors": str(item["program_vectors"])}],
237
  ids=[str(hash(item["code"]))],
238
+ embeddings=[item["vectors"]] # Use semantic 6D vectors
239
  )
240
  return client
241