Spaces:
Running
Running
Update process_hf_dataset.py
Browse files
- process_hf_dataset.py +8 -5
process_hf_dataset.py
CHANGED
@@ -121,7 +121,7 @@ def generate_semantic_vector(description, total_lines=100):
|
|
121 |
return vector
|
122 |
|
123 |
def process_hf_dataset():
|
124 |
-
"""Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories."""
|
125 |
# Load the dataset
|
126 |
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
|
127 |
|
@@ -145,11 +145,14 @@ def process_hf_dataset():
|
|
145 |
description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
|
146 |
description_tokens += " " + " ".join(description_tokens_list)
|
147 |
|
148 |
-
# Generate a 6D semantic vector for the instruction
|
149 |
semantic_vector = generate_semantic_vector(instruction)
|
150 |
|
151 |
-
#
|
152 |
-
|
|
|
|
|
|
|
153 |
|
154 |
# Update metadata with instruction and variable roles as description
|
155 |
collection = client.get_collection(DB_NAME)
|
@@ -157,7 +160,7 @@ def process_hf_dataset():
|
|
157 |
collection.update(
|
158 |
ids=[program_id],
|
159 |
metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
|
160 |
-
embeddings=[
|
161 |
)
|
162 |
|
163 |
# Save to Hugging Face Dataset
|
|
|
121 |
return vector
|
122 |
|
123 |
def process_hf_dataset():
|
124 |
+
"""Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and including instruction in vectors."""
|
125 |
# Load the dataset
|
126 |
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
|
127 |
|
|
|
145 |
description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
|
146 |
description_tokens += " " + " ".join(description_tokens_list)
|
147 |
|
148 |
+
# Generate a 6D semantic vector for the instruction, incorporating it into the program vector
|
149 |
semantic_vector = generate_semantic_vector(instruction)
|
150 |
|
151 |
+
# No actual combining happens yet: the instruction's 6D semantic vector is used as-is (averaging or concatenating it with the program vectors is not implemented)
|
152 |
+
combined_vector = semantic_vector # Use semantic vector as primary for semantic search
|
153 |
+
|
154 |
+
# Store in ChromaDB with description and combined vector
|
155 |
+
store_program(client, processed_code, sequence, [combined_vector], DB_NAME)
|
156 |
|
157 |
# Update metadata with instruction and variable roles as description
|
158 |
collection = client.get_collection(DB_NAME)
|
|
|
160 |
collection.update(
|
161 |
ids=[program_id],
|
162 |
metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
|
163 |
+
embeddings=[combined_vector] # Ensure 6D embedding
|
164 |
)
|
165 |
|
166 |
# Save to Hugging Face Dataset
|