broadfield-dev committed on
Commit
ae9ce58
·
verified ·
1 Parent(s): f518567

Update process_hf_dataset.py

Browse files
Files changed (1) hide show
  1. process_hf_dataset.py +24 -20
process_hf_dataset.py CHANGED
@@ -184,8 +184,12 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
184
  client = init_chromadb()
185
 
186
  # Do not clear or populate with defaults here—let UI buttons handle this
187
- collection = client.get_or_create_collection(DB_NAME)
188
- logger.info(f"Using existing or new ChromaDB collection: {DB_NAME}")
 
 
 
 
189
 
190
  # Process in batches with progress bar
191
  total_entries = len(dataset_list)
@@ -246,25 +250,25 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
246
  save_chromadb_to_hf()
247
 
248
  def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
249
- """Save ChromaDB data to Hugging Face Dataset."""
250
- client = init_chromadb()
251
- collection = client.get_collection(DB_NAME)
252
-
253
- # Fetch all data from ChromaDB
254
- results = collection.get(include=["documents", "metadatas", "embeddings"])
255
- data = {
256
- "code": results["documents"],
257
- "sequence": [meta["sequence"] for meta in results["metadatas"]],
258
- "vectors": results["embeddings"], # Semantic 6D vectors
259
- "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]],
260
- "program_vectors": [eval(meta.get('program_vectors', '[]')) for meta in results["metadatas"]] # Store structural vectors
261
- }
262
-
263
- # Create a Hugging Face Dataset
264
- dataset = Dataset.from_dict(data)
265
-
266
- # Push to Hugging Face Hub
267
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  dataset.push_to_hub(dataset_name, token=token)
269
  logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
270
  except Exception as e:
 
184
  client = init_chromadb()
185
 
186
  # Do not clear or populate with defaults here—let UI buttons handle this
187
+ try:
188
+ collection = client.get_or_create_collection(DB_NAME)
189
+ logger.info(f"Using existing or new ChromaDB collection: {DB_NAME}")
190
+ except Exception as e:
191
+ logger.error(f"Error accessing ChromaDB collection: {e}")
192
+ raise
193
 
194
  # Process in batches with progress bar
195
  total_entries = len(dataset_list)
 
250
  save_chromadb_to_hf()
251
 
252
  def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
253
+ """Save ChromaDB data to Hugging Face Dataset, with error handling."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  try:
255
+ client = init_chromadb()
256
+ collection = client.get_collection(DB_NAME)
257
+
258
+ # Fetch all data from ChromaDB
259
+ results = collection.get(include=["documents", "metadatas", "embeddings"])
260
+ data = {
261
+ "code": results["documents"],
262
+ "sequence": [meta["sequence"] for meta in results["metadatas"]],
263
+ "vectors": results["embeddings"], # Semantic 6D vectors
264
+ "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]],
265
+ "program_vectors": [eval(meta.get('program_vectors', '[]')) for meta in results["metadatas"]] # Store structural vectors
266
+ }
267
+
268
+ # Create a Hugging Face Dataset
269
+ dataset = Dataset.from_dict(data)
270
+
271
+ # Push to Hugging Face Hub
272
  dataset.push_to_hub(dataset_name, token=token)
273
  logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
274
  except Exception as e: