Spaces:
Running
Running
Update process_hf_dataset.py
Browse files- process_hf_dataset.py +83 -41
process_hf_dataset.py
CHANGED
@@ -8,10 +8,32 @@ import os
|
|
8 |
from dotenv import load_dotenv
|
9 |
from transformers import AutoTokenizer, AutoModel
|
10 |
import torch
|
|
|
|
|
11 |
|
12 |
# Load environment variables
|
13 |
load_dotenv()
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
def rename_variables(code, variable_prefixes=None):
|
16 |
"""Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
|
17 |
if variable_prefixes is None:
|
@@ -99,11 +121,9 @@ def generate_description_tokens(sequence, vectors, var_map=None):
|
|
99 |
|
100 |
def generate_semantic_vector(description, total_lines=100, use_gpu=False):
|
101 |
"""Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
|
106 |
-
model = AutoModel.from_pretrained(model_name).to(device)
|
107 |
|
108 |
# Tokenize and encode the description
|
109 |
inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
|
@@ -123,49 +143,71 @@ def generate_semantic_vector(description, total_lines=100, use_gpu=False):
|
|
123 |
|
124 |
return vector
|
125 |
|
126 |
-
def process_hf_dataset():
|
127 |
-
"""Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories
|
128 |
# Load the dataset
|
129 |
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
|
130 |
|
131 |
# Initialize ChromaDB client
|
132 |
client = init_chromadb()
|
133 |
|
134 |
-
#
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
# Generate a 6D semantic vector for the instruction
|
152 |
-
semantic_vector = generate_semantic_vector(instruction)
|
153 |
-
|
154 |
-
# Combine program vectors with semantic vector (use semantic vector for semantic search, store program vectors separately)
|
155 |
-
# Store both semantic and program vectors, but ensure ChromaDB uses 6D
|
156 |
-
combined_vector = semantic_vector # Use semantic vector for ChromaDB embedding (6D)
|
157 |
|
158 |
-
|
159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
160 |
|
161 |
-
#
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
|
|
|
|
|
|
169 |
|
170 |
# Save to Hugging Face Dataset
|
171 |
save_chromadb_to_hf()
|
@@ -193,4 +235,4 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY"))
|
|
193 |
print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
|
194 |
|
195 |
if __name__ == "__main__":
|
196 |
-
process_hf_dataset()
|
|
|
8 |
from dotenv import load_dotenv
|
9 |
from transformers import AutoTokenizer, AutoModel
|
10 |
import torch
|
11 |
+
from tqdm import tqdm # For progress bar
|
12 |
+
import time
|
13 |
|
14 |
# Load environment variables
|
15 |
load_dotenv()
|
16 |
|
17 |
+
# Cache CodeBERT model globally to avoid repeated loading
|
18 |
+
model_name = "microsoft/codebert-base"
|
19 |
+
tokenizer = None
|
20 |
+
model = None
|
21 |
+
device = None
|
22 |
+
|
23 |
+
def load_codebert_model(use_gpu=False):
    """Return the shared CodeBERT ``(tokenizer, model, device)`` triple.

    The tokenizer and model are kept in the module-level ``tokenizer``,
    ``model`` and ``device`` globals. They are loaded from the checkpoint
    named by the module-level ``model_name`` only when either the tokenizer
    or the model is still ``None``; otherwise the cached objects are returned
    unchanged.

    Args:
        use_gpu: When true and CUDA is available, place the model on
            ``cuda``; otherwise use ``cpu``. Only consulted while loading,
            so it has no effect once the objects are cached.

    Returns:
        The ``(tokenizer, model, device)`` tuple held in the module globals.

    Raises:
        Exception: Any error raised while loading is printed and re-raised.
    """
    global tokenizer, model, device

    # Fast path: both halves of the cache are already populated.
    if tokenizer is not None and model is not None:
        return tokenizer, model, device

    try:
        # CUDA is used only when requested AND actually available.
        if use_gpu and torch.cuda.is_available():
            target = "cuda"
        else:
            target = "cpu"
        device = torch.device(target)

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        loaded = AutoModel.from_pretrained(model_name)
        model = loaded.to(device)
        print(f"CodeBERT model loaded on {device}")
    except Exception as err:
        # Report the failure, then let the caller see the original exception.
        print(f"Error loading CodeBERT model: {err}")
        raise

    return tokenizer, model, device
|
36 |
+
|
37 |
def rename_variables(code, variable_prefixes=None):
|
38 |
"""Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
|
39 |
if variable_prefixes is None:
|
|
|
121 |
|
122 |
def generate_semantic_vector(description, total_lines=100, use_gpu=False):
|
123 |
"""Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
|
124 |
+
global tokenizer, model, device
|
125 |
+
if tokenizer is None or model is None:
|
126 |
+
tokenizer, model, device = load_codebert_model(use_gpu)
|
|
|
|
|
127 |
|
128 |
# Tokenize and encode the description
|
129 |
inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
|
|
|
143 |
|
144 |
return vector
|
145 |
|
146 |
+
def process_hf_dataset(batch_size=100, use_gpu=False):
|
147 |
+
"""Process the Hugging Face dataset in batches and store programs in ChromaDB, aligning with vector categories."""
|
148 |
# Load the dataset
|
149 |
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
|
150 |
|
151 |
# Initialize ChromaDB client
|
152 |
client = init_chromadb()
|
153 |
|
154 |
+
# Clear existing collection (fresh install) if needed
|
155 |
+
try:
|
156 |
+
client.delete_collection(DB_NAME)
|
157 |
+
except:
|
158 |
+
pass # Collection may not exist
|
159 |
+
collection = client.create_collection(DB_NAME)
|
160 |
+
|
161 |
+
# Process in batches with progress bar
|
162 |
+
total_entries = len(dataset)
|
163 |
+
for i in tqdm(range(0, total_entries, batch_size), desc="Processing Hugging Face Dataset"):
|
164 |
+
batch = dataset[i:i + batch_size]
|
165 |
+
batch_programs = []
|
166 |
+
batch_ids = []
|
167 |
+
batch_documents = []
|
168 |
+
batch_metadatas = []
|
169 |
+
batch_embeddings = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
|
171 |
+
for entry in batch:
|
172 |
+
instruction = entry['instruction']
|
173 |
+
output = entry['output']
|
174 |
+
|
175 |
+
# Rename variables to align with vector categories
|
176 |
+
processed_code, var_map = rename_variables(output)
|
177 |
+
|
178 |
+
# Parse the code to get parts and sequence, generating our 6D vectors
|
179 |
+
parts, sequence = parse_python_code(processed_code)
|
180 |
+
program_vectors = [part['vector'] for part in parts] # Use parser's 6D vectors for program structure
|
181 |
+
|
182 |
+
# Generate description tokens including variable roles
|
183 |
+
description_tokens = f"task:{instruction.replace(' ', '_')}"
|
184 |
+
description_tokens_list = generate_description_tokens(sequence, program_vectors, var_map)
|
185 |
+
description_tokens += " " + " ".join(description_tokens_list)
|
186 |
+
|
187 |
+
# Generate a 6D semantic vector for the instruction
|
188 |
+
semantic_vector = generate_semantic_vector(instruction, use_gpu=use_gpu)
|
189 |
+
|
190 |
+
# Store program data
|
191 |
+
program_id = str(hash(processed_code))
|
192 |
+
batch_ids.append(program_id)
|
193 |
+
batch_documents.append(processed_code)
|
194 |
+
batch_metadatas.append({"sequence": ",".join(sequence), "description_tokens": description_tokens, "program_vectors": str(program_vectors)})
|
195 |
+
batch_embeddings.append(semantic_vector)
|
196 |
+
|
197 |
+
# Add small delay to prevent freezing (optional, adjust as needed)
|
198 |
+
time.sleep(0.01)
|
199 |
|
200 |
+
# Batch add to ChromaDB
|
201 |
+
try:
|
202 |
+
collection.add(
|
203 |
+
documents=batch_documents,
|
204 |
+
metadatas=batch_metadatas,
|
205 |
+
ids=batch_ids,
|
206 |
+
embeddings=batch_embeddings
|
207 |
+
)
|
208 |
+
except Exception as e:
|
209 |
+
print(f"Error adding batch to ChromaDB: {e}")
|
210 |
+
raise
|
211 |
|
212 |
# Save to Hugging Face Dataset
|
213 |
save_chromadb_to_hf()
|
|
|
235 |
print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
|
236 |
|
237 |
if __name__ == "__main__":
|
238 |
+
process_hf_dataset(batch_size=100, use_gpu=False)
|