broadfield-dev committed on
Commit
fda0a3e
·
verified ·
1 Parent(s): 04dce48

Update process_hf_dataset.py

Browse files
Files changed (1) hide show
  1. process_hf_dataset.py +12 -2
process_hf_dataset.py CHANGED
@@ -2,7 +2,7 @@
2
  from datasets import load_dataset
3
  import re
4
  from parser import parse_python_code, create_vector
5
- from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
6
  import chromadb
7
  import os
8
  from dotenv import load_dotenv
@@ -187,6 +187,10 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
187
  try:
188
  collection = client.get_or_create_collection(DB_NAME)
189
  logger.info(f"Using existing or new ChromaDB collection: {DB_NAME}")
 
 
 
 
190
  except Exception as e:
191
  logger.error(f"Error accessing ChromaDB collection: {e}")
192
  raise
@@ -242,6 +246,9 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
242
  embeddings=batch_embeddings
243
  )
244
  logger.info(f"Added batch {i//batch_size + 1} to ChromaDB with {len(batch_ids)} entries")
 
 
 
245
  except Exception as e:
246
  logger.error(f"Error adding batch to ChromaDB: {e}")
247
  raise
@@ -250,7 +257,7 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
250
  save_chromadb_to_hf()
251
 
252
  def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
253
- """Save ChromaDB data to Hugging Face Dataset, with error handling."""
254
  try:
255
  client = init_chromadb()
256
  collection = client.get_collection(DB_NAME)
@@ -267,10 +274,13 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY"))
267
 
268
  # Create a Hugging Face Dataset
269
  dataset = Dataset.from_dict(data)
 
270
 
271
  # Push to Hugging Face Hub
272
  dataset.push_to_hub(dataset_name, token=token)
273
  logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
 
 
274
  except Exception as e:
275
  logger.error(f"Error pushing dataset to Hugging Face Hub: {e}")
276
  raise
 
2
  from datasets import load_dataset
3
  import re
4
  from parser import parse_python_code, create_vector
5
+ from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, create_collection
6
  import chromadb
7
  import os
8
  from dotenv import load_dotenv
 
187
  try:
188
  collection = client.get_or_create_collection(DB_NAME)
189
  logger.info(f"Using existing or new ChromaDB collection: {DB_NAME}")
190
+ # Verify collection is valid
191
+ if collection is None or not hasattr(collection, 'add'):
192
+ raise ValueError("ChromaDB collection access failed")
193
+ logger.info(f"ChromaDB collection verified, contains {collection.count()} entries")
194
  except Exception as e:
195
  logger.error(f"Error accessing ChromaDB collection: {e}")
196
  raise
 
246
  embeddings=batch_embeddings
247
  )
248
  logger.info(f"Added batch {i//batch_size + 1} to ChromaDB with {len(batch_ids)} entries")
249
+ # Verify addition
250
+ count = collection.count()
251
+ logger.info(f"ChromaDB now contains {count} entries after adding batch")
252
  except Exception as e:
253
  logger.error(f"Error adding batch to ChromaDB: {e}")
254
  raise
 
257
  save_chromadb_to_hf()
258
 
259
  def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
260
+ """Save ChromaDB data to Hugging Face Dataset, with error handling and logging."""
261
  try:
262
  client = init_chromadb()
263
  collection = client.get_collection(DB_NAME)
 
274
 
275
  # Create a Hugging Face Dataset
276
  dataset = Dataset.from_dict(data)
277
+ logger.info(f"Created Hugging Face Dataset with {len(data['code'])} entries")
278
 
279
  # Push to Hugging Face Hub
280
  dataset.push_to_hub(dataset_name, token=token)
281
  logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
282
+ # Verify push (optional, could check dataset on Hub)
283
+ logger.info(f"Verified Hugging Face dataset push with {len(dataset)} entries")
284
  except Exception as e:
285
  logger.error(f"Error pushing dataset to Hugging Face Hub: {e}")
286
  raise