Spaces:
Running
Running
Update process_hf_dataset.py
Browse files - process_hf_dataset.py +54 -36
process_hf_dataset.py
CHANGED
@@ -10,6 +10,11 @@ from transformers import AutoTokenizer, AutoModel
|
|
10 |
import torch
|
11 |
from tqdm import tqdm # For progress bar
|
12 |
import time
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
# Load environment variables
|
15 |
load_dotenv()
|
@@ -28,9 +33,9 @@ def load_codebert_model(use_gpu=False):
|
|
28 |
device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
|
29 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
30 |
model = AutoModel.from_pretrained(model_name).to(device)
|
31 |
-
|
32 |
except Exception as e:
|
33 |
-
|
34 |
raise
|
35 |
return tokenizer, model, device
|
36 |
|
@@ -143,6 +148,7 @@ def generate_semantic_vector(description, total_lines=100, use_gpu=False):
|
|
143 |
|
144 |
# Ensure vector isn’t all zeros or defaults
|
145 |
if all(v == 0 for v in vector):
|
|
|
146 |
# Fallback: Use heuristic if CodeBERT fails to generate meaningful embeddings
|
147 |
category_map = {
|
148 |
'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
|
@@ -160,15 +166,19 @@ def generate_semantic_vector(description, total_lines=100, use_gpu=False):
|
|
160 |
vector[5] = cat_id / len(category_map) # parent_weight
|
161 |
break
|
162 |
|
|
|
163 |
return vector
|
164 |
|
165 |
def process_hf_dataset(batch_size=100, use_gpu=False):
|
166 |
"""Process the Hugging Face dataset in batches and store programs in ChromaDB, aligning with vector categories."""
|
167 |
# Load the dataset
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
|
|
|
|
|
|
172 |
|
173 |
# Initialize ChromaDB client
|
174 |
client = init_chromadb()
|
@@ -187,33 +197,36 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
|
|
187 |
batch_embeddings = []
|
188 |
|
189 |
for entry in batch:
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
|
|
|
|
|
|
217 |
|
218 |
# Batch add to ChromaDB
|
219 |
try:
|
@@ -223,8 +236,9 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
|
|
223 |
ids=batch_ids,
|
224 |
embeddings=batch_embeddings
|
225 |
)
|
|
|
226 |
except Exception as e:
|
227 |
-
|
228 |
raise
|
229 |
|
230 |
# Save to Hugging Face Dataset
|
@@ -249,8 +263,12 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY"))
|
|
249 |
dataset = Dataset.from_dict(data)
|
250 |
|
251 |
# Push to Hugging Face Hub
|
252 |
-
|
253 |
-
|
|
|
|
|
|
|
|
|
254 |
|
255 |
if __name__ == "__main__":
|
256 |
process_hf_dataset(batch_size=100, use_gpu=False)
|
|
|
10 |
import torch
|
11 |
from tqdm import tqdm # For progress bar
|
12 |
import time
|
13 |
+
import logging
|
14 |
+
|
15 |
+
# Set up logging
|
16 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
17 |
+
logger = logging.getLogger(__name__)
|
18 |
|
19 |
# Load environment variables
|
20 |
load_dotenv()
|
|
|
33 |
device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
|
34 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
35 |
model = AutoModel.from_pretrained(model_name).to(device)
|
36 |
+
logger.info(f"CodeBERT model loaded on {device}")
|
37 |
except Exception as e:
|
38 |
+
logger.error(f"Error loading CodeBERT model: {e}")
|
39 |
raise
|
40 |
return tokenizer, model, device
|
41 |
|
|
|
148 |
|
149 |
# Ensure vector isn’t all zeros or defaults
|
150 |
if all(v == 0 for v in vector):
|
151 |
+
logger.warning(f"Default vector detected for description: {description}")
|
152 |
# Fallback: Use heuristic if CodeBERT fails to generate meaningful embeddings
|
153 |
category_map = {
|
154 |
'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
|
|
|
166 |
vector[5] = cat_id / len(category_map) # parent_weight
|
167 |
break
|
168 |
|
169 |
+
logger.debug(f"Generated semantic vector for '{description}': {vector}")
|
170 |
return vector
|
171 |
|
172 |
def process_hf_dataset(batch_size=100, use_gpu=False):
|
173 |
"""Process the Hugging Face dataset in batches and store programs in ChromaDB, aligning with vector categories."""
|
174 |
# Load the dataset
|
175 |
+
try:
|
176 |
+
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
|
177 |
+
dataset_list = list(dataset)
|
178 |
+
logger.info(f"Loaded dataset with {len(dataset_list)} entries")
|
179 |
+
except Exception as e:
|
180 |
+
logger.error(f"Error loading dataset: {e}")
|
181 |
+
raise
|
182 |
|
183 |
# Initialize ChromaDB client
|
184 |
client = init_chromadb()
|
|
|
197 |
batch_embeddings = []
|
198 |
|
199 |
for entry in batch:
|
200 |
+
try:
|
201 |
+
instruction = entry['instruction']
|
202 |
+
output = entry['output']
|
203 |
+
|
204 |
+
# Rename variables to align with vector categories
|
205 |
+
processed_code, var_map = rename_variables(output)
|
206 |
+
|
207 |
+
# Parse the code to get parts and sequence, generating our 6D vectors
|
208 |
+
parts, sequence = parse_python_code(processed_code)
|
209 |
+
program_vectors = [part['vector'] for part in parts] # Use parser's 6D vectors for program structure
|
210 |
+
|
211 |
+
# Generate description tokens including variable roles
|
212 |
+
description_tokens = f"task:{instruction.replace(' ', '_')}"
|
213 |
+
description_tokens_list = generate_description_tokens(sequence, program_vectors, var_map)
|
214 |
+
description_tokens += " " + " ".join(description_tokens_list)
|
215 |
+
|
216 |
+
# Generate a 6D semantic vector for the instruction
|
217 |
+
semantic_vector = generate_semantic_vector(instruction, use_gpu=use_gpu)
|
218 |
+
|
219 |
+
# Store program data
|
220 |
+
program_id = str(hash(processed_code))
|
221 |
+
batch_ids.append(program_id)
|
222 |
+
batch_documents.append(processed_code)
|
223 |
+
batch_metadatas.append({"sequence": ",".join(sequence), "description_tokens": description_tokens, "program_vectors": str(program_vectors)})
|
224 |
+
batch_embeddings.append(semantic_vector)
|
225 |
+
|
226 |
+
logger.debug(f"Processed entry: {program_id}, Vector: {semantic_vector}")
|
227 |
+
except Exception as e:
|
228 |
+
logger.error(f"Error processing entry {i}: {e}")
|
229 |
+
continue # Skip failed entries but continue processing
|
230 |
|
231 |
# Batch add to ChromaDB
|
232 |
try:
|
|
|
236 |
ids=batch_ids,
|
237 |
embeddings=batch_embeddings
|
238 |
)
|
239 |
+
logger.info(f"Added batch {i//batch_size + 1} to ChromaDB with {len(batch_ids)} entries")
|
240 |
except Exception as e:
|
241 |
+
logger.error(f"Error adding batch to ChromaDB: {e}")
|
242 |
raise
|
243 |
|
244 |
# Save to Hugging Face Dataset
|
|
|
263 |
dataset = Dataset.from_dict(data)
|
264 |
|
265 |
# Push to Hugging Face Hub
|
266 |
+
try:
|
267 |
+
dataset.push_to_hub(dataset_name, token=token)
|
268 |
+
logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
|
269 |
+
except Exception as e:
|
270 |
+
logger.error(f"Error pushing dataset to Hugging Face Hub: {e}")
|
271 |
+
raise
|
272 |
|
273 |
if __name__ == "__main__":
|
274 |
process_hf_dataset(batch_size=100, use_gpu=False)
|