Spaces:
Running
Running
Update database.py
Browse files- database.py +22 -22
database.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
# database.py
|
2 |
import chromadb
|
3 |
-
from parser import parse_python_code
|
4 |
import os
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
import numpy as np
|
@@ -83,7 +83,7 @@ def query_programs(client, operations, collection_name=DB_NAME, top_k=5, semanti
|
|
83 |
collection = create_collection(client, collection_name)
|
84 |
|
85 |
if semantic_query:
|
86 |
-
# Semantic search using
|
87 |
query_vector = generate_semantic_vector(semantic_query)
|
88 |
results = collection.query(
|
89 |
query_embeddings=[query_vector],
|
@@ -165,29 +165,29 @@ def generate_description_tokens(sequence, vectors):
|
|
165 |
tokens.append(f"span:{vec[3]:.2f}")
|
166 |
return tokens
|
167 |
|
168 |
-
def generate_semantic_vector(description,
|
169 |
-
"""Generate a semantic vector for a textual description
|
170 |
-
#
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
model = AutoModel.from_pretrained(model_name).to(device)
|
175 |
|
176 |
-
#
|
177 |
-
|
178 |
-
|
179 |
|
180 |
-
#
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
|
186 |
-
# Truncate or pad to 6D to match our vectors
|
187 |
-
if len(vector) < 6:
|
188 |
-
vector.extend([0] * (6 - len(vector)))
|
189 |
-
elif len(vector) > 6:
|
190 |
-
vector = vector[:6]
|
191 |
return vector
|
192 |
|
193 |
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
|
|
|
1 |
# database.py
|
2 |
import chromadb
|
3 |
+
from parser import parse_python_code, create_vector
|
4 |
import os
|
5 |
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
import numpy as np
|
|
|
83 |
collection = create_collection(client, collection_name)
|
84 |
|
85 |
if semantic_query:
|
86 |
+
# Semantic search using a 6D vector generated from the description
|
87 |
query_vector = generate_semantic_vector(semantic_query)
|
88 |
results = collection.query(
|
89 |
query_embeddings=[query_vector],
|
|
|
165 |
tokens.append(f"span:{vec[3]:.2f}")
|
166 |
return tokens
|
167 |
|
168 |
+
def generate_semantic_vector(description, total_lines=100):
    """Generate a 6D semantic vector for a textual description, matching our vector format.

    The description is scanned for a small set of code-category keywords and
    mapped onto the layout
    ``[category_id, level, center_pos, span, parent_depth, parent_weight]``.

    Keywords are matched at the start of an alphabetic word, so inflected
    forms such as "imports", "functions" or "assignment" still count, while
    words that merely contain a keyword ("different", "verify", "entry") do
    not. When several tokens match, the last matching token in the
    description decides the category; within one whitespace-separated token
    the first keyword in ``category_map`` order wins.

    Args:
        description: Free-text description to encode. ``None`` or an empty
            string yields the all-zero vector.
        total_lines: Accepted for interface compatibility; not used by the
            current heuristic.

    Returns:
        A list of six numbers. All zeros when no keyword is recognised.
    """
    # Use a simplified heuristic to map description to our 6D vector format
    category_map = {
        'import': 1, 'function': 2, 'assign': 17, 'input': 18,
        'return': 19, 'if': 5, 'try': 8, 'except': 14,
    }

    vector = [0] * 6  # Initialize 6D vector
    if not description:
        return vector

    for token in description.lower().split():
        # Break the token into alphabetic words so punctuation and separators
        # ("try/except", "user_input", "function:") do not hide a keyword.
        words = "".join(ch if ch.isalpha() else " " for ch in token).split()

        cat_id = next(
            (
                candidate_id
                for cat, candidate_id in category_map.items()
                if any(word.startswith(cat) for word in words)
            ),
            None,
        )
        if cat_id is None:
            continue

        vector = [
            cat_id,                        # category_id
            1,                             # level (assume top-level for simplicity)
            0.5,                           # center_pos (midpoint of code)
            0.1,                           # span (small for simplicity)
            1,                             # parent_depth (shallow)
            cat_id / len(category_map),    # parent_weight: id / number of categories (may exceed 1)
        ]

    return vector
|
192 |
|
193 |
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
|