Update database.py

database.py CHANGED (+15 -36)
@@ -44,15 +44,16 @@ def store_program(client, code, sequence, vectors, collection_name=DB_NAME):
     collection = create_collection(client, collection_name)
 
     # Flatten vectors to ensure they are a list of numbers (ChromaDB expects flat embeddings)
-    flattened_vectors = …
+    # Use the first vector (semantic vector) for ChromaDB embedding
+    flattened_vectors = vectors[0] if vectors else [0] * 6  # Ensure 6D
 
     # Store program data (ID, code, sequence, vectors)
     program_id = str(hash(code))  # Use hash of code as ID for uniqueness
     collection.add(
         documents=[code],
-        metadatas=[{"sequence": ",".join(sequence), "description_tokens": " ".join(generate_description_tokens(sequence, vectors))}],
+        metadatas=[{"sequence": ",".join(sequence), "description_tokens": " ".join(generate_description_tokens(sequence, vectors)), "program_vectors": str(vectors)}],
         ids=[program_id],
-        embeddings=[flattened_vectors]  # Pass as …
+        embeddings=[flattened_vectors]  # Pass as 6D semantic vector
     )
     return program_id
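Note on the new "program_vectors" metadata: ChromaDB metadata values must be scalars, so serializing with str(vectors) works, but reading it back with eval (as the query hunk below does) will execute arbitrary text. A minimal sketch of a safer round-trip using ast.literal_eval; the helper names are hypothetical, not part of this repo:

import ast

def pack_program_vectors(vectors):
    # Same serialization as the diff: nested list -> plain string
    # (assumes vectors is a plain Python list, not numpy arrays)
    return str(vectors)

def unpack_program_vectors(raw):
    # literal_eval only accepts Python literals, unlike eval
    try:
        return ast.literal_eval(raw) if isinstance(raw, str) else raw
    except (ValueError, SyntaxError):
        return []  # Mirrors the diff's malformed-vector fallback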
@@ -106,15 +107,17 @@ def query_programs(client, operations, collection_name=DB_NAME, top_k=5, semantic_query=None):
         if not semantic_query or is_subsequence(operations, sequence):  # Ensure sequence match for operations
             try:
                 # Reconstruct program vectors (flatten if needed)
-                doc_vectors = eval(…)
+                doc_vectors = eval(meta['program_vectors']) if isinstance(meta['program_vectors'], str) else meta['program_vectors']
                 if isinstance(doc_vectors, (list, np.ndarray)) and len(doc_vectors) == 6:
                     program_vector = doc_vectors  # Single flat vector
                 else:
                     program_vector = np.mean([v for v in doc_vectors if isinstance(v, (list, np.ndarray))], axis=0).tolist()
             except:
                 program_vector = [0] * 6  # Fallback for malformed vectors
-            …
-            …
+            # Use the semantic embedding for similarity
+            semantic_vector = eval(doc['vectors']) if isinstance(doc['vectors'], str) else doc['vectors']
+            similarity = cosine_similarity([query_vector], [semantic_vector])[0][0] if semantic_vector and query_vector else 0
+            matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity, 'description': meta.get('description_tokens', ''), 'program_vectors': meta.get('program_vectors', '[]')})
 
     return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
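For reference on the similarity used above: scikit-learn's cosine_similarity returns a matrix, hence the [0][0] indexing. A minimal numpy equivalent for two 6D vectors (a sketch, not this project's code):

import numpy as np

def cosine_sim(a, b):
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(a @ b / denom) if denom else 0.0

# Example: query vector vs. a stored 6D semantic vector (illustrative values)
print(cosine_sim([2, 1, 0.5, 0.1, 1, 0.25], [2, 1, 0.4, 0.2, 1, 0.25]))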
@@ -165,32 +168,7 @@ def generate_description_tokens(sequence, vectors):
         tokens.append(f"span:{vec[3]:.2f}")
     return tokens
 
-def generate_semantic_vector(description):
-    """Generate a 6D semantic vector for a textual description, matching our vector format."""
-    # Use a simplified heuristic to map description to our 6D vector format
-    category_map = {
-        'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
-    }
-
-    # Parse description for key terms
-    tokens = description.lower().split()
-    vector = [0] * 6  # Initialize 6D vector
-
-    # Map description tokens to categories and assign basic vector values
-    for token in tokens:
-        for cat, cat_id in category_map.items():
-            if cat in token:
-                vector[0] = cat_id  # category_id
-                vector[1] = 1  # level (assume top-level for simplicity)
-                vector[2] = 0.5  # center_pos (midpoint of code)
-                vector[3] = 0.1  # span (small for simplicity)
-                vector[4] = 1  # parent_depth (shallow)
-                vector[5] = cat_id / len(category_map)  # parent_weight (normalized)
-                break
-
-    return vector
-
-def generate_semantic_vector(description, total_lines=100, use_gpu=False):
+def generate_semantic_vector(description, total_lines=100, use_gpu=USE_GPU):
     """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
     # Load CodeBERT model and tokenizer
     model_name = "microsoft/codebert-base"
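The retained CodeBERT path presumably embeds the description and projects it down to the 6D format; the projection itself is outside this hunk. A hedged sketch of the typical encoding step with transformers, where the mean pooling and the six-way chunked projection are assumptions, not this repo's actual code:

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

def embed_description(description, use_gpu=False):
    device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
    model.to(device)
    inputs = tokenizer(description, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state  # (1, seq_len, 768)
    pooled = hidden.mean(dim=1).squeeze(0)  # 768-dim description embedding
    # One plausible 6D projection: average six fixed 128-dim slices
    return [float(chunk.mean()) for chunk in pooled.chunk(6)]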
@@ -226,8 +204,9 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
     data = {
         "code": results["documents"],
         "sequence": [meta["sequence"] for meta in results["metadatas"]],
-        "vectors": results["embeddings"],  # …
-        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
+        "vectors": results["embeddings"],  # Semantic 6D vectors
+        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]],
+        "program_vectors": [eval(meta.get('program_vectors', '[]')) for meta in results["metadatas"]]  # Store structural vectors
     }
 
     # Create a Hugging Face Dataset
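For context, a dict like data above feeds the standard datasets flow; roughly (a sketch, assuming the rest of the function matches its name and arguments):

from datasets import Dataset

dataset = Dataset.from_dict(data)
dataset.push_to_hub(dataset_name, token=token)  # dataset_name/token from the function args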
@@ -254,9 +233,9 @@ def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
     for item in dataset:
         collection.add(
             documents=[item["code"]],
-            metadatas=[{"sequence": item["sequence"], "description_tokens": item["description_tokens"]}],
+            metadatas=[{"sequence": item["sequence"], "description_tokens": item["description_tokens"], "program_vectors": str(item["program_vectors"])}],
             ids=[str(hash(item["code"]))],
-            embeddings=[item["vectors"]]
+            embeddings=[item["vectors"]]  # Use semantic 6D vectors
         )
     return client
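The loading side presumably fetches the dataset before the collection.add loop shown above, along these lines (a sketch; the split name is an assumption, and the token keyword assumes a recent datasets version):

from datasets import load_dataset

dataset = load_dataset(dataset_name, split="train", token=token)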