Spaces:

broadfield-dev
/

parse_py

Sleeping

broadfield-dev committed on Mar 5

Commit

927956e

verified ·

1 Parent(s): 23a1178

Update process_hf_dataset.py

Files changed (1) hide show

process_hf_dataset.py CHANGED Viewed

@@ -121,7 +121,7 @@ def generate_semantic_vector(description, total_lines=100):
     return vector
 def process_hf_dataset():
-    """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories."""
     # Load the dataset
     dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
@@ -145,11 +145,14 @@ def process_hf_dataset():
         description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
         description_tokens += " " + " ".join(description_tokens_list)
-        # Generate a 6D semantic vector for the instruction
         semantic_vector = generate_semantic_vector(instruction)
-        # Store in ChromaDB with description and semantic vector
-        store_program(client, processed_code, sequence, vectors, DB_NAME)
         # Update metadata with instruction and variable roles as description
         collection = client.get_collection(DB_NAME)
@@ -157,7 +160,7 @@ def process_hf_dataset():
         collection.update(
             ids=[program_id],
             metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
-            embeddings=[semantic_vector]  # Update with 6D semantic vector for semantic search
         )
     # Save to Hugging Face Dataset

     return vector
 def process_hf_dataset():
+    """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and including instruction in vectors."""
     # Load the dataset
     dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
         description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
         description_tokens += " " + " ".join(description_tokens_list)
+        # Generate a 6D semantic vector for the instruction, incorporating it into the program vector
         semantic_vector = generate_semantic_vector(instruction)
+        # Combine program vectors with instruction vector (average or concatenate, but ensure 6D)
+        combined_vector = semantic_vector  # Use semantic vector as primary for semantic search
+        # Store in ChromaDB with description and combined vector
+        store_program(client, processed_code, sequence, [combined_vector], DB_NAME)
         # Update metadata with instruction and variable roles as description
         collection = client.get_collection(DB_NAME)
         collection.update(
             ids=[program_id],
             metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
+            embeddings=[combined_vector]  # Ensure 6D embedding
         )
     # Save to Hugging Face Dataset