broadfield-dev committed
Commit 065607f · verified · 1 Parent(s): 0d45c9f

Update database.py

Files changed (1): database.py (+82 -26)
database.py CHANGED
@@ -4,12 +4,12 @@ from parser import parse_python_code
 import os
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
-import dotenv
+from datasets import Dataset, load_dataset
 
 # User-configurable variables
 DB_NAME = "python_programs"  # ChromaDB collection name
-HF_DATASET_NAME = "broadfield-dev/python_program_vectors"  # Hugging Face Dataset name
-HF_TOKEN = os.getenv("HF_KEY")  # Replace with your Hugging Face API token
+HF_DATASET_NAME = "python_program_vectors"  # Hugging Face Dataset name
+HF_TOKEN = "YOUR_HUGGINGFACE_TOKEN"  # Replace with your Hugging Face API token
 PERSIST_DIR = "./chroma_data"  # Directory for persistent storage (optional)
 
 def init_chromadb(persist_dir=PERSIST_DIR):
@@ -44,7 +44,7 @@ def store_program(client, code, sequence, vectors, collection_name=DB_NAME):
     program_id = str(hash(code))  # Use hash of code as ID for uniqueness
     collection.add(
         documents=[code],
-        metadatas=[{"sequence": ",".join(sequence)}],
+        metadatas=[{"sequence": ",".join(sequence), "description_tokens": " ".join(generate_description_tokens(sequence, vectors))}],
         ids=[program_id],
         embeddings=[flattened_vectors]  # Pass as flat list
     )
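
For reference, a sketch of the metadata record this change produces, using the generate_description_tokens helper added further down in this diff (hypothetical two-part program; vector values are illustrative):

    sequence = ["import", "function"]
    vectors = [[1, 0, 0.0, 0.05, 0, 0.0], [2, 1, 0.3, 0.40, 1, 0.1]]
    metadata = {
        "sequence": ",".join(sequence),
        "description_tokens": " ".join(generate_description_tokens(sequence, vectors)),
    }
    # {'sequence': 'import,function',
    #  'description_tokens': 'imports module:import level:0 span:0.05 '
    #                        'defines function:function level:1 span:0.40'}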
@@ -72,33 +72,39 @@ def populate_sample_db(client):
         vectors = [part['vector'] for part in parts]
         store_program(client, code, sequence, vectors)
 
-def query_programs(client, operations, collection_name=DB_NAME, top_k=5):
-    """Query ChromaDB for programs matching the operations sequence."""
+def query_programs(client, operations, collection_name=DB_NAME, top_k=5, semantic_query=None):
+    """Query ChromaDB for programs matching the operations sequence or semantic description."""
     collection = create_collection(client, collection_name)
 
-    # Convert operations to a query vector (average of operation vectors)
-    query_vector = sum([create_vector(op, 0, (1, 1), 100, []) for op in operations], []) / len(operations) if operations else [0] * 6
-
-    # Perform similarity search
-    results = collection.query(
-        query_embeddings=[query_vector],
-        n_results=top_k,
-        include=["documents", "metadatas"]
-    )
+    if semantic_query:
+        # Semantic search using description tokens
+        query_vector = generate_semantic_vector(semantic_query)
+        results = collection.query(
+            query_texts=[semantic_query],
+            n_results=top_k,
+            include=["documents", "metadatas"]
+        )
+    else:
+        # Vector-based search for operations sequence
+        query_vector = sum([create_vector(op, 0, (1, 1), 100, []) for op in operations], []) / len(operations) if operations else [0] * 6
+        results = collection.query(
+            query_embeddings=[query_vector],
+            n_results=top_k,
+            include=["documents", "metadatas"]
+        )
 
     # Process results
     matching_programs = []
     for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
         sequence = meta['sequence'].split(',')
-        if is_subsequence(operations, sequence):
-            # Extract and flatten vectors from the document (assuming stored as string or list)
+        if not semantic_query or is_subsequence(operations, sequence):  # Ensure sequence match for operations
             try:
                 doc_vectors = eval(doc['vectors']) if isinstance(doc['vectors'], str) else doc['vectors']
                 program_vector = np.mean([v for v in doc_vectors if isinstance(v, (list, np.ndarray))], axis=0).tolist()
             except:
                 program_vector = [0] * 6  # Fallback for malformed vectors
             similarity = cosine_similarity([query_vector], [program_vector])[0][0] if program_vector and query_vector else 0
-            matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity})
+            matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity, 'description': meta.get('description_tokens', '')})
 
     return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
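For orientation, the two calling modes the new signature supports (a sketch with hypothetical operation names; note that the sum(...) / len(...) averaging above would need something like np.mean over the create_vector outputs to run on plain Python lists):

    # Structural mode: keep only programs containing the operation subsequence.
    hits = query_programs(client, ["import", "function", "return"], top_k=3)

    # Semantic mode: free-text description, routed through generate_semantic_vector
    # and ChromaDB's query_texts (assumes the collection can embed raw text).
    hits = query_programs(client, [], semantic_query="defines function returns value")

    for h in hits:
        print(f"{h['similarity']:.3f}", h['description'])
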
@@ -124,9 +130,52 @@ def is_subsequence(subseq, seq):
     it = iter(seq)
     return all(item in it for item in subseq)
 
+def generate_description_tokens(sequence, vectors):
+    """Generate semantic description tokens for a program based on its sequence and vectors."""
+    tokens = []
+    category_descriptions = {
+        'import': 'imports module',
+        'function': 'defines function',
+        'assigned_variable': 'assigns variable',
+        'input_variable': 'input parameter',
+        'returned_variable': 'returns value',
+        'if': 'conditional statement',
+        'return': 'returns result',
+        'try': 'try block',
+        'except': 'exception handler',
+        'expression': 'expression statement',
+        'spacer': 'empty line or comment'
+    }
+
+    for cat, vec in zip(sequence, vectors):
+        if cat in category_descriptions:
+            tokens.append(f"{category_descriptions[cat]}:{cat}")
+        # Add vector-derived features (e.g., level, span) as tokens
+        tokens.append(f"level:{vec[1]}")
+        tokens.append(f"span:{vec[3]:.2f}")
+    return tokens
+
+def generate_semantic_vector(description):
+    """Generate a semantic vector for a textual description (simplified for now)."""
+    # This is a placeholder; use an embedding model (e.g., CodeBERT, BERT) for real semantic search
+    tokens = description.lower().split()
+    category_weights = {
+        'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
+    }
+    vector = [0] * 6
+    for token in tokens:
+        for cat, weight in category_weights.items():
+            if cat in token:
+                vector[0] = weight  # Use category_id as primary feature
+                vector[1] = 1  # Assume level 1 for simplicity
+                vector[2] = 0.5  # Center position (midpoint)
+                vector[3] = 0.1  # Span (small for simplicity)
+                vector[4] = 1  # Parent depth (shallow)
+                vector[5] = weight / len(category_weights)  # Parent weight
+    return vector
+
 def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_TOKEN):
     """Save ChromaDB data to Hugging Face Dataset."""
-    from datasets import Dataset
     client = init_chromadb()
     collection = create_collection(client)
 
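To make the placeholder vectorizer concrete, a worked trace (weights as defined above; each matching token overwrites the vector, so the last match wins):

    vec = generate_semantic_vector("defines a function that returns a value")
    # "function" matches 'function' (weight 2), then "returns" matches 'return'
    # (weight 19), so the final vector is [19, 1, 0.5, 0.1, 1, 19 / 8]
    print(vec)  # [19, 1, 0.5, 0.1, 1, 2.375]
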
@@ -135,7 +184,8 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_TOKEN):
     data = {
         "code": results["documents"],
         "sequence": [meta["sequence"] for meta in results["metadatas"]],
-        "vectors": [[item for sublist in vec for item in sublist] for vec in results["embeddings"]]  # Flatten vectors
+        "vectors": [[item for sublist in vec for item in sublist] for vec in results["embeddings"]],  # Flatten vectors
+        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
     }
 
     # Create a Hugging Face Dataset
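
Each pushed row then carries four parallel fields, roughly as below (illustrative values only; the category ids and feature numbers in the flattened 6-feature vectors are hypothetical):

    row = {
        "code": "import os\n\ndef main():\n    ...",
        "sequence": "import,spacer,function",
        "vectors": [1, 0, 0.0, 0.05, 0, 0.0,  11, 0, 0.1, 0.05, 0, 0.0,  2, 1, 0.3, 0.4, 1, 0.1],
        "description_tokens": "imports module:import level:0 span:0.05 ...",
    }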
@@ -146,23 +196,29 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_TOKEN):
     print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
 
 def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=HF_TOKEN):
-    """Load ChromaDB data from Hugging Face Dataset."""
-    from datasets import load_dataset
+    """Load ChromaDB data from Hugging Face Dataset, handle empty dataset."""
+    try:
+        dataset = load_dataset(dataset_name, split="train", token=token)
+    except Exception as e:
+        print(f"Error loading dataset from Hugging Face: {e}. Populating with samples...")
+        client = init_chromadb()
+        populate_sample_db(client)
+        save_chromadb_to_hf()  # Create and push a new dataset
+        return init_chromadb()
+
     client = init_chromadb()
     collection = create_collection(client)
 
-    dataset = load_dataset(dataset_name, split="train", token=token)
     for item in dataset:
         collection.add(
             documents=[item["code"]],
-            metadatas=[{"sequence": item["sequence"]}],
+            metadatas=[{"sequence": item["sequence"], "description_tokens": item["description_tokens"]}],
             ids=[str(hash(item["code"]))],
             embeddings=[item["vectors"]]
         )
     return client
 
 if __name__ == '__main__':
-    client = init_chromadb()
-    populate_sample_db(client)
+    client = load_chromadb_from_hf()
     # Uncomment to save to Hugging Face
     # save_chromadb_to_hf()
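
Putting it together, the intended bootstrap flow (a sketch; assumes HF_TOKEN has write access to HF_DATASET_NAME):

    # First run: the pull fails, so load_chromadb_from_hf() populates the
    # sample programs and pushes a fresh dataset before returning a client.
    client = load_chromadb_from_hf()

    # Later runs rebuild the local collection from the Hub, including the
    # description_tokens metadata used for semantic matching.
    hits = query_programs(client, [], semantic_query="imports module")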
 