broadfield-dev commited on
Commit
18f44de
·
verified ·
1 Parent(s): 16ea922

Update process_hf_dataset.py

Browse files
Files changed (1) hide show
  1. process_hf_dataset.py +21 -6
process_hf_dataset.py CHANGED
@@ -141,6 +141,25 @@ def generate_semantic_vector(description, total_lines=100, use_gpu=False):
141
  elif len(vector) > 6:
142
  vector = vector[:6] # Truncate to 6D
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  return vector
145
 
146
  def process_hf_dataset(batch_size=100, use_gpu=False):
@@ -154,12 +173,8 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
154
  # Initialize ChromaDB client
155
  client = init_chromadb()
156
 
157
- # Clear existing collection (fresh install) if needed
158
- try:
159
- client.delete_collection(DB_NAME)
160
- except:
161
- pass # Collection may not exist
162
- collection = client.create_collection(DB_NAME)
163
 
164
  # Process in batches with progress bar
165
  total_entries = len(dataset_list)
 
141
  elif len(vector) > 6:
142
  vector = vector[:6] # Truncate to 6D
143
 
144
+ # Ensure vector isn't all zeros or defaults
145
+ if all(v == 0 for v in vector):
146
+ # Fallback: Use heuristic if CodeBERT fails to generate meaningful embeddings
147
+ category_map = {
148
+ 'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
149
+ }
150
+ tokens = description.lower().split()
151
+ vector = [0] * 6
152
+ for token in tokens:
153
+ for cat, cat_id in category_map.items():
154
+ if cat in token:
155
+ vector[0] = cat_id # category_id
156
+ vector[1] = 1 # level
157
+ vector[2] = 0.5 # center_pos
158
+ vector[3] = 0.1 # span
159
+ vector[4] = 1 # parent_depth
160
+ vector[5] = cat_id / len(category_map) # parent_weight
161
+ break
162
+
163
  return vector
164
 
165
  def process_hf_dataset(batch_size=100, use_gpu=False):
 
173
  # Initialize ChromaDB client
174
  client = init_chromadb()
175
 
176
+ # Do not clear or populate with defaults here—let UI buttons handle this
177
+ collection = client.get_or_create_collection(DB_NAME)
 
 
 
 
178
 
179
  # Process in batches with progress bar
180
  total_entries = len(dataset_list)