Spaces:
Running
Running
Update process_hf_dataset.py
Browse files — process_hf_dataset.py (+34 lines, −5 lines)
process_hf_dataset.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
# process_hf_dataset.py
|
2 |
from datasets import load_dataset
|
3 |
import re
|
4 |
-
from parser import parse_python_code
|
5 |
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
|
6 |
import chromadb
|
7 |
import os
|
@@ -95,6 +95,31 @@ def generate_description_tokens(sequence, vectors, var_map=None):
|
|
95 |
|
96 |
return tokens
|
97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
def process_hf_dataset():
|
99 |
"""Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories."""
|
100 |
# Load the dataset
|
@@ -111,16 +136,19 @@ def process_hf_dataset():
|
|
111 |
# Rename variables to align with vector categories
|
112 |
processed_code, var_map = rename_variables(output)
|
113 |
|
114 |
-
# Parse the code to get parts and sequence
|
115 |
parts, sequence = parse_python_code(processed_code)
|
116 |
-
vectors = [part['vector'] for part in parts]
|
117 |
|
118 |
# Generate description tokens including variable roles
|
119 |
description_tokens = f"task:{instruction.replace(' ', '_')}"
|
120 |
description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
|
121 |
description_tokens += " " + " ".join(description_tokens_list)
|
122 |
|
123 |
-
#
|
|
|
|
|
|
|
124 |
store_program(client, processed_code, sequence, vectors, DB_NAME)
|
125 |
|
126 |
# Update metadata with instruction and variable roles as description
|
@@ -128,7 +156,8 @@ def process_hf_dataset():
|
|
128 |
program_id = str(hash(processed_code))
|
129 |
collection.update(
|
130 |
ids=[program_id],
|
131 |
-
metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}]
|
|
|
132 |
)
|
133 |
|
134 |
# Save to Hugging Face Dataset
|
|
|
1 |
# process_hf_dataset.py
|
2 |
from datasets import load_dataset
|
3 |
import re
|
4 |
+
from parser import parse_python_code, create_vector
|
5 |
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
|
6 |
import chromadb
|
7 |
import os
|
|
|
95 |
|
96 |
return tokens
|
97 |
|
98 |
+
def generate_semantic_vector(description, total_lines=100):
    """Generate a 6D semantic vector for a textual description, matching our vector format.

    A lightweight keyword heuristic: scan the description for category
    keywords ('import', 'function', 'return', ...) and fill the 6D vector
    [category_id, level, center_pos, span, parent_depth, parent_weight]
    from the last keyword that matches. Returns the all-zero vector when
    no keyword is found.

    Args:
        description: Free-text instruction/description to map to a vector.
        total_lines: Reserved for future position scaling — currently
            unused (kept for backward compatibility with existing callers).

    Returns:
        list: A 6-element list of numbers; parent_weight is in (0, 1].
    """
    # Keyword -> category_id mapping, mirroring the parser's category ids.
    category_map = {
        'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
    }
    # Highest category id, used to keep parent_weight genuinely in (0, 1].
    # (The previous divisor, len(category_map) == 8, produced values up to
    # 19/8 = 2.375 despite the "normalized" intent.)
    max_category_id = max(category_map.values())

    # Tokenize the description; matching is case-insensitive substring search.
    tokens = description.lower().split()
    vector = [0] * 6  # Initialize 6D vector

    # Map description tokens to categories and assign basic vector values.
    # NOTE: if several tokens match, the LAST matching token wins (each
    # match overwrites the whole vector) — preserved from the original.
    for token in tokens:
        for cat, cat_id in category_map.items():
            if cat in token:
                vector[0] = cat_id             # category_id
                vector[1] = 1                  # level (assume top-level for simplicity)
                vector[2] = 0.5                # center_pos (midpoint of code)
                vector[3] = 0.1                # span (small for simplicity)
                vector[4] = 1                  # parent_depth (shallow)
                vector[5] = cat_id / max_category_id  # parent_weight, now truly in (0, 1]
                break

    return vector
123 |
def process_hf_dataset():
|
124 |
"""Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories."""
|
125 |
# Load the dataset
|
|
|
136 |
# Rename variables to align with vector categories
|
137 |
processed_code, var_map = rename_variables(output)
|
138 |
|
139 |
+
# Parse the code to get parts and sequence, generating our 6D vectors
|
140 |
parts, sequence = parse_python_code(processed_code)
|
141 |
+
vectors = [part['vector'] for part in parts] # Use parser's 6D vectors
|
142 |
|
143 |
# Generate description tokens including variable roles
|
144 |
description_tokens = f"task:{instruction.replace(' ', '_')}"
|
145 |
description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
|
146 |
description_tokens += " " + " ".join(description_tokens_list)
|
147 |
|
148 |
+
# Generate a 6D semantic vector for the instruction
|
149 |
+
semantic_vector = generate_semantic_vector(instruction)
|
150 |
+
|
151 |
+
# Store in ChromaDB with description and semantic vector
|
152 |
store_program(client, processed_code, sequence, vectors, DB_NAME)
|
153 |
|
154 |
# Update metadata with instruction and variable roles as description
|
|
|
156 |
program_id = str(hash(processed_code))
|
157 |
collection.update(
|
158 |
ids=[program_id],
|
159 |
+
metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
|
160 |
+
embeddings=[semantic_vector] # Update with 6D semantic vector for semantic search
|
161 |
)
|
162 |
|
163 |
# Save to Hugging Face Dataset
|