broadfield-dev committed on
Commit
23a1178
·
verified ·
1 Parent(s): d6c93c4

Update database.py

Browse files
Files changed (1) hide show
  1. database.py +22 -22
database.py CHANGED
@@ -1,6 +1,6 @@
1
  # database.py
2
  import chromadb
3
- from parser import parse_python_code
4
  import os
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  import numpy as np
@@ -83,7 +83,7 @@ def query_programs(client, operations, collection_name=DB_NAME, top_k=5, semanti
83
  collection = create_collection(client, collection_name)
84
 
85
  if semantic_query:
86
- # Semantic search using CodeBERT embeddings
87
  query_vector = generate_semantic_vector(semantic_query)
88
  results = collection.query(
89
  query_embeddings=[query_vector],
@@ -165,29 +165,29 @@ def generate_description_tokens(sequence, vectors):
165
  tokens.append(f"span:{vec[3]:.2f}")
166
  return tokens
167
 
168
- def generate_semantic_vector(description, use_gpu=USE_GPU):
169
- """Generate a semantic vector for a textual description using CodeBERT, with CPU/GPU option."""
170
- # Load CodeBERT model and tokenizer
171
- model_name = "microsoft/codebert-base"
172
- tokenizer = AutoTokenizer.from_pretrained(model_name)
173
- device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
174
- model = AutoModel.from_pretrained(model_name).to(device)
175
 
176
- # Tokenize and encode the description
177
- inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
178
- inputs = {k: v.to(device) for k, v in inputs.items()}
179
 
180
- # Generate embeddings
181
- with torch.no_grad():
182
- outputs = model(**inputs)
183
- # Use mean pooling of the last hidden states
184
- vector = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist()
 
 
 
 
 
 
185
 
186
- # Truncate or pad to 6D to match our vectors
187
- if len(vector) < 6:
188
- vector.extend([0] * (6 - len(vector)))
189
- elif len(vector) > 6:
190
- vector = vector[:6]
191
  return vector
192
 
193
  def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
 
1
  # database.py
2
  import chromadb
3
+ from parser import parse_python_code, create_vector
4
  import os
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  import numpy as np
 
83
  collection = create_collection(client, collection_name)
84
 
85
  if semantic_query:
86
+ # Semantic search using a 6D vector generated from the description
87
  query_vector = generate_semantic_vector(semantic_query)
88
  results = collection.query(
89
  query_embeddings=[query_vector],
 
165
  tokens.append(f"span:{vec[3]:.2f}")
166
  return tokens
167
 
168
def generate_semantic_vector(description, total_lines=100):
    """Generate a 6D semantic vector for a textual description.

    Maps keywords found in *description* onto the same 6D format used for
    parsed-code vectors: [category_id, level, center_pos, span,
    parent_depth, parent_weight].

    Args:
        description: Free-text query, e.g. "a function that imports os".
        total_lines: Unused; retained for backward compatibility with
            existing callers. (NOTE(review): consider deprecating.)

    Returns:
        list: A 6-element vector; all zeros when no keyword matches.
    """
    # Keyword -> category id; checked in insertion order for each token.
    category_map = {
        'import': 1, 'function': 2, 'assign': 17, 'input': 18,
        'return': 19, 'if': 5, 'try': 8, 'except': 14,
    }

    matched_id = None
    for token in description.lower().split():
        # First keyword that is a substring of this token wins for the
        # token; a later matching token overwrites the earlier result
        # (last-match-wins, as in the original implementation).
        for keyword, cat_id in category_map.items():
            if keyword in token:
                matched_id = cat_id
                break

    if matched_id is None:
        # No keyword found: zero vector, same shape as the match case.
        return [0] * 6

    # Constant fields are set once here instead of being re-assigned on
    # every matching token inside the loop.
    return [
        matched_id,                      # category_id
        1,                               # level (assume top-level)
        0.5,                             # center_pos (midpoint of code)
        0.1,                             # span (small placeholder)
        1,                               # parent_depth (shallow)
        matched_id / len(category_map),  # parent_weight (normalized)
    ]
192
 
193
  def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):