Update database.py

database.py CHANGED (+82 -26)
@@ -4,12 +4,12 @@ from parser import parse_python_code
 import os
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
-import
+from datasets import Dataset, load_dataset

 # User-configurable variables
 DB_NAME = "python_programs" # ChromaDB collection name
-HF_DATASET_NAME = "
-HF_TOKEN =
+HF_DATASET_NAME = "python_program_vectors" # Hugging Face Dataset name
+HF_TOKEN = "YOUR_HUGGINGFACE_TOKEN" # Replace with your Hugging Face API token
 PERSIST_DIR = "./chroma_data" # Directory for persistent storage (optional)

 def init_chromadb(persist_dir=PERSIST_DIR):
@@ -44,7 +44,7 @@ def store_program(client, code, sequence, vectors, collection_name=DB_NAME):
     program_id = str(hash(code)) # Use hash of code as ID for uniqueness
     collection.add(
         documents=[code],
-        metadatas=[{"sequence": ",".join(sequence)}],
+        metadatas=[{"sequence": ",".join(sequence), "description_tokens": " ".join(generate_description_tokens(sequence, vectors))}],
         ids=[program_id],
         embeddings=[flattened_vectors] # Pass as flat list
     )
@@ -72,33 +72,39 @@ def populate_sample_db(client):
         vectors = [part['vector'] for part in parts]
         store_program(client, code, sequence, vectors)

-def query_programs(client, operations, collection_name=DB_NAME, top_k=5):
-    """Query ChromaDB for programs matching the operations sequence."""
+def query_programs(client, operations, collection_name=DB_NAME, top_k=5, semantic_query=None):
+    """Query ChromaDB for programs matching the operations sequence or semantic description."""
     collection = create_collection(client, collection_name)

-
-
-
-
-
-
-
-
-
+    if semantic_query:
+        # Semantic search using description tokens
+        query_vector = generate_semantic_vector(semantic_query)
+        results = collection.query(
+            query_texts=[semantic_query],
+            n_results=top_k,
+            include=["documents", "metadatas"]
+        )
+    else:
+        # Vector-based search for operations sequence
+        query_vector = sum([create_vector(op, 0, (1, 1), 100, []) for op in operations], []) / len(operations) if operations else [0] * 6
+        results = collection.query(
+            query_embeddings=[query_vector],
+            n_results=top_k,
+            include=["documents", "metadatas"]
+        )

     # Process results
     matching_programs = []
     for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
         sequence = meta['sequence'].split(',')
-        if is_subsequence(operations, sequence):
-            # Extract and flatten vectors from the document (assuming stored as string or list)
+        if not semantic_query or is_subsequence(operations, sequence): # Ensure sequence match for operations
             try:
                 doc_vectors = eval(doc['vectors']) if isinstance(doc['vectors'], str) else doc['vectors']
                 program_vector = np.mean([v for v in doc_vectors if isinstance(v, (list, np.ndarray))], axis=0).tolist()
             except:
                 program_vector = [0] * 6 # Fallback for malformed vectors
             similarity = cosine_similarity([query_vector], [program_vector])[0][0] if program_vector and query_vector else 0
-            matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity})
+            matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity, 'description': meta.get('description_tokens', '')})

     return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
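Three issues in the operations branch above are worth flagging: sum([...], []) concatenates the per-operation lists into one flat list and dividing that list by len(operations) raises a TypeError; meta['id'] raises a KeyError, since store_program never writes an 'id' key into metadata (ChromaDB returns ids separately, in results['ids'][0]); and doc is the stored code string, so doc['vectors'] always fails and the bare except silently falls back to the zero vector (eval on untrusted strings is also risky). A minimal sketch of the intended element-wise mean, assuming create_vector (imported from parser alongside parse_python_code) returns a 6-element list per operation:

import numpy as np
from parser import create_vector  # assumed importable, as used in the diff

def mean_operation_vector(operations, dim=6):
    """Element-wise mean of the per-operation vectors (hypothetical helper)."""
    if not operations:
        return [0] * dim  # same zero-vector fallback as the diff
    vectors = [create_vector(op, 0, (1, 1), 100, []) for op in operations]
    return np.mean(vectors, axis=0).tolist()  # averages component-wise, length stays dim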
@@ -124,9 +130,52 @@ def is_subsequence(subseq, seq):
     it = iter(seq)
     return all(item in it for item in subseq)

+def generate_description_tokens(sequence, vectors):
+    """Generate semantic description tokens for a program based on its sequence and vectors."""
+    tokens = []
+    category_descriptions = {
+        'import': 'imports module',
+        'function': 'defines function',
+        'assigned_variable': 'assigns variable',
+        'input_variable': 'input parameter',
+        'returned_variable': 'returns value',
+        'if': 'conditional statement',
+        'return': 'returns result',
+        'try': 'try block',
+        'except': 'exception handler',
+        'expression': 'expression statement',
+        'spacer': 'empty line or comment'
+    }
+
+    for cat, vec in zip(sequence, vectors):
+        if cat in category_descriptions:
+            tokens.append(f"{category_descriptions[cat]}:{cat}")
+        # Add vector-derived features (e.g., level, span) as tokens
+        tokens.append(f"level:{vec[1]}")
+        tokens.append(f"span:{vec[3]:.2f}")
+    return tokens
+
+def generate_semantic_vector(description):
+    """Generate a semantic vector for a textual description (simplified for now)."""
+    # This is a placeholder: use an embedding model (e.g., CodeBERT, BERT) for real semantic search
+    tokens = description.lower().split()
+    category_weights = {
+        'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
+    }
+    vector = [0] * 6
+    for token in tokens:
+        for cat, weight in category_weights.items():
+            if cat in token:
+                vector[0] = weight # Use category_id as primary feature
+                vector[1] = 1 # Assume level 1 for simplicity
+                vector[2] = 0.5 # Center position (midpoint)
+                vector[3] = 0.1 # Span (small for simplicity)
+                vector[4] = 1 # Parent depth (shallow)
+                vector[5] = weight / len(category_weights) # Parent weight
+    return vector
+
 def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_TOKEN):
     """Save ChromaDB data to Hugging Face Dataset."""
-    from datasets import Dataset
     client = init_chromadb()
     collection = create_collection(client)
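The in-code comment on generate_semantic_vector is worth taking seriously: the keyword-weight loop only ever sets six hand-picked components, so unrelated descriptions can collapse to identical vectors. A sketch of the suggested upgrade using sentence-transformers (an extra dependency, not part of this repo). Note that a model such as all-MiniLM-L6-v2 emits 384-dimensional embeddings, so the collection would have to be rebuilt in that space rather than the 6-dimensional structural vectors stored above:

# Sketch only; requires `pip install sentence-transformers`.
from sentence_transformers import SentenceTransformer

_model = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative model choice, 384-dim output

def generate_semantic_vector_embedded(description):
    """Embed a textual description with a learned model instead of keyword weights."""
    return _model.encode(description).tolist()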
@@ -135,7 +184,8 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_TOKEN):
     data = {
         "code": results["documents"],
         "sequence": [meta["sequence"] for meta in results["metadatas"]],
-        "vectors": [[item for sublist in vec for item in sublist] for vec in results["embeddings"]] # Flatten vectors
+        "vectors": [[item for sublist in vec for item in sublist] for vec in results["embeddings"]], # Flatten vectors
+        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
     }

     # Create a Hugging Face Dataset
@@ -146,23 +196,29 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_TOKEN):
     print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")

 def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=HF_TOKEN):
-    """Load ChromaDB data from Hugging Face Dataset."""
-
+    """Load ChromaDB data from Hugging Face Dataset, handle empty dataset."""
+    try:
+        dataset = load_dataset(dataset_name, split="train", token=token)
+    except Exception as e:
+        print(f"Error loading dataset from Hugging Face: {e}. Populating with samples...")
+        client = init_chromadb()
+        populate_sample_db(client)
+        save_chromadb_to_hf() # Create and push a new dataset
+        return init_chromadb()
+
     client = init_chromadb()
     collection = create_collection(client)

-    dataset = load_dataset(dataset_name, split="train", token=token)
     for item in dataset:
         collection.add(
             documents=[item["code"]],
-            metadatas=[{"sequence": item["sequence"]}],
+            metadatas=[{"sequence": item["sequence"], "description_tokens": item["description_tokens"]}],
             ids=[str(hash(item["code"]))],
             embeddings=[item["vectors"]]
         )
     return client

 if __name__ == '__main__':
-    client =
-    populate_sample_db(client)
+    client = load_chromadb_from_hf()
     # Uncomment to save to Hugging Face
     # save_chromadb_to_hf()
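For reference, an end-to-end sketch of how the updated module might be exercised once the result-processing issues flagged above are fixed. It assumes this file is importable as database and that a valid token is set in HF_TOKEN:

import database

# Load the collection from the Hub; on failure this now repopulates from samples.
client = database.load_chromadb_from_hf()

# Structural query: programs containing these operations as a subsequence.
for match in database.query_programs(client, ["import", "function", "return"], top_k=3):
    print(round(match["similarity"], 3), match["code"][:60])

# Semantic query: matches against the stored description tokens instead.
for match in database.query_programs(client, [], semantic_query="defines function returns value"):
    print(match["description"])

# database.save_chromadb_to_hf()  # push the collection back to the Hub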