Spaces:
Running
Running
Update process_hf_dataset.py
Browse files
- process_hf_dataset.py +12 -2
process_hf_dataset.py
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
from datasets import load_dataset
|
3 |
import re
|
4 |
from parser import parse_python_code, create_vector
|
5 |
-
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
|
6 |
import chromadb
|
7 |
import os
|
8 |
from dotenv import load_dotenv
|
@@ -187,6 +187,10 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
|
|
187 |
try:
|
188 |
collection = client.get_or_create_collection(DB_NAME)
|
189 |
logger.info(f"Using existing or new ChromaDB collection: {DB_NAME}")
|
|
|
|
|
|
|
|
|
190 |
except Exception as e:
|
191 |
logger.error(f"Error accessing ChromaDB collection: {e}")
|
192 |
raise
|
@@ -242,6 +246,9 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
|
|
242 |
embeddings=batch_embeddings
|
243 |
)
|
244 |
logger.info(f"Added batch {i//batch_size + 1} to ChromaDB with {len(batch_ids)} entries")
|
|
|
|
|
|
|
245 |
except Exception as e:
|
246 |
logger.error(f"Error adding batch to ChromaDB: {e}")
|
247 |
raise
|
@@ -250,7 +257,7 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
|
|
250 |
save_chromadb_to_hf()
|
251 |
|
252 |
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
|
253 |
-
"""Save ChromaDB data to Hugging Face Dataset, with error handling."""
|
254 |
try:
|
255 |
client = init_chromadb()
|
256 |
collection = client.get_collection(DB_NAME)
|
@@ -267,10 +274,13 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY"))
|
|
267 |
|
268 |
# Create a Hugging Face Dataset
|
269 |
dataset = Dataset.from_dict(data)
|
|
|
270 |
|
271 |
# Push to Hugging Face Hub
|
272 |
dataset.push_to_hub(dataset_name, token=token)
|
273 |
logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
|
|
|
|
|
274 |
except Exception as e:
|
275 |
logger.error(f"Error pushing dataset to Hugging Face Hub: {e}")
|
276 |
raise
|
|
|
2 |
from datasets import load_dataset
|
3 |
import re
|
4 |
from parser import parse_python_code, create_vector
|
5 |
+
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, create_collection
|
6 |
import chromadb
|
7 |
import os
|
8 |
from dotenv import load_dotenv
|
|
|
187 |
try:
|
188 |
collection = client.get_or_create_collection(DB_NAME)
|
189 |
logger.info(f"Using existing or new ChromaDB collection: {DB_NAME}")
|
190 |
+
# Verify collection is valid
|
191 |
+
if collection is None or not hasattr(collection, 'add'):
|
192 |
+
raise ValueError("ChromaDB collection access failed")
|
193 |
+
logger.info(f"ChromaDB collection verified, contains {collection.count()} entries")
|
194 |
except Exception as e:
|
195 |
logger.error(f"Error accessing ChromaDB collection: {e}")
|
196 |
raise
|
|
|
246 |
embeddings=batch_embeddings
|
247 |
)
|
248 |
logger.info(f"Added batch {i//batch_size + 1} to ChromaDB with {len(batch_ids)} entries")
|
249 |
+
# Verify addition
|
250 |
+
count = collection.count()
|
251 |
+
logger.info(f"ChromaDB now contains {count} entries after adding batch")
|
252 |
except Exception as e:
|
253 |
logger.error(f"Error adding batch to ChromaDB: {e}")
|
254 |
raise
|
|
|
257 |
save_chromadb_to_hf()
|
258 |
|
259 |
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
|
260 |
+
"""Save ChromaDB data to Hugging Face Dataset, with error handling and logging."""
|
261 |
try:
|
262 |
client = init_chromadb()
|
263 |
collection = client.get_collection(DB_NAME)
|
|
|
274 |
|
275 |
# Create a Hugging Face Dataset
|
276 |
dataset = Dataset.from_dict(data)
|
277 |
+
logger.info(f"Created Hugging Face Dataset with {len(data['code'])} entries")
|
278 |
|
279 |
# Push to Hugging Face Hub
|
280 |
dataset.push_to_hub(dataset_name, token=token)
|
281 |
logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
|
282 |
+
# Verify push (optional, could check dataset on Hub)
|
283 |
+
logger.info(f"Verified Hugging Face dataset push with {len(dataset)} entries")
|
284 |
except Exception as e:
|
285 |
logger.error(f"Error pushing dataset to Hugging Face Hub: {e}")
|
286 |
raise
|