Spaces:
Running
Running
Update process_hf_dataset.py
Browse files — process_hf_dataset.py (+21 −6)
process_hf_dataset.py
CHANGED
@@ -141,6 +141,25 @@ def generate_semantic_vector(description, total_lines=100, use_gpu=False):
|
|
141 |
elif len(vector) > 6:
|
142 |
vector = vector[:6] # Truncate to 6D
|
143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
return vector
|
145 |
|
146 |
def process_hf_dataset(batch_size=100, use_gpu=False):
|
@@ -154,12 +173,8 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
|
|
154 |
# Initialize ChromaDB client
|
155 |
client = init_chromadb()
|
156 |
|
157 |
-
#
|
158 |
-
|
159 |
-
client.delete_collection(DB_NAME)
|
160 |
-
except:
|
161 |
-
pass # Collection may not exist
|
162 |
-
collection = client.create_collection(DB_NAME)
|
163 |
|
164 |
# Process in batches with progress bar
|
165 |
total_entries = len(dataset_list)
|
|
|
141 |
elif len(vector) > 6:
|
142 |
vector = vector[:6] # Truncate to 6D
|
143 |
|
144 |
+
# Ensure vector isn’t all zeros or defaults
|
145 |
+
if all(v == 0 for v in vector):
|
146 |
+
# Fallback: Use heuristic if CodeBERT fails to generate meaningful embeddings
|
147 |
+
category_map = {
|
148 |
+
'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
|
149 |
+
}
|
150 |
+
tokens = description.lower().split()
|
151 |
+
vector = [0] * 6
|
152 |
+
for token in tokens:
|
153 |
+
for cat, cat_id in category_map.items():
|
154 |
+
if cat in token:
|
155 |
+
vector[0] = cat_id # category_id
|
156 |
+
vector[1] = 1 # level
|
157 |
+
vector[2] = 0.5 # center_pos
|
158 |
+
vector[3] = 0.1 # span
|
159 |
+
vector[4] = 1 # parent_depth
|
160 |
+
vector[5] = cat_id / len(category_map) # parent_weight
|
161 |
+
break
|
162 |
+
|
163 |
return vector
|
164 |
|
165 |
def process_hf_dataset(batch_size=100, use_gpu=False):
|
|
|
173 |
# Initialize ChromaDB client
|
174 |
client = init_chromadb()
|
175 |
|
176 |
+
# Do not clear or populate with defaults here—let UI buttons handle this
|
177 |
+
collection = client.get_or_create_collection(DB_NAME)
|
|
|
|
|
|
|
|
|
178 |
|
179 |
# Process in batches with progress bar
|
180 |
total_entries = len(dataset_list)
|