broadfield-dev committed on
Commit
927956e
·
verified ·
1 Parent(s): 23a1178

Update process_hf_dataset.py

Browse files
Files changed (1) hide show
  1. process_hf_dataset.py +8 -5
process_hf_dataset.py CHANGED
@@ -121,7 +121,7 @@ def generate_semantic_vector(description, total_lines=100):
121
  return vector
122
 
123
  def process_hf_dataset():
124
- """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories."""
125
  # Load the dataset
126
  dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
127
 
@@ -145,11 +145,14 @@ def process_hf_dataset():
145
  description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
146
  description_tokens += " " + " ".join(description_tokens_list)
147
 
148
- # Generate a 6D semantic vector for the instruction
149
  semantic_vector = generate_semantic_vector(instruction)
150
 
151
- # Store in ChromaDB with description and semantic vector
152
- store_program(client, processed_code, sequence, vectors, DB_NAME)
 
 
 
153
 
154
  # Update metadata with instruction and variable roles as description
155
  collection = client.get_collection(DB_NAME)
@@ -157,7 +160,7 @@ def process_hf_dataset():
157
  collection.update(
158
  ids=[program_id],
159
  metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
160
- embeddings=[semantic_vector] # Update with 6D semantic vector for semantic search
161
  )
162
 
163
  # Save to Hugging Face Dataset
 
121
  return vector
122
 
123
  def process_hf_dataset():
124
+ """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and including instruction in vectors."""
125
  # Load the dataset
126
  dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
127
 
 
145
  description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
146
  description_tokens += " " + " ".join(description_tokens_list)
147
 
148
+ # Generate a 6D semantic vector for the instruction, incorporating it into the program vector
149
  semantic_vector = generate_semantic_vector(instruction)
150
 
151
+ # No averaging or concatenation is performed: the instruction's semantic vector is used directly as the combined vector
152
+ combined_vector = semantic_vector # Use semantic vector as primary for semantic search
153
+
154
+ # Store in ChromaDB with description and combined vector
155
+ store_program(client, processed_code, sequence, [combined_vector], DB_NAME)
156
 
157
  # Update metadata with instruction and variable roles as description
158
  collection = client.get_collection(DB_NAME)
 
160
  collection.update(
161
  ids=[program_id],
162
  metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
163
+ embeddings=[combined_vector] # Ensure 6D embedding
164
  )
165
 
166
  # Save to Hugging Face Dataset