Spaces:
Running
Running
Update process_hf_dataset.py
Browse files
- process_hf_dataset.py +24 -20
process_hf_dataset.py
CHANGED
@@ -184,8 +184,12 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
|
|
184 |
client = init_chromadb()
|
185 |
|
186 |
# Do not clear or populate with defaults here—let UI buttons handle this
|
187 |
-
|
188 |
-
|
|
|
|
|
|
|
|
|
189 |
|
190 |
# Process in batches with progress bar
|
191 |
total_entries = len(dataset_list)
|
@@ -246,25 +250,25 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
|
|
246 |
save_chromadb_to_hf()
|
247 |
|
248 |
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
|
249 |
-
"""Save ChromaDB data to Hugging Face Dataset."""
|
250 |
-
client = init_chromadb()
|
251 |
-
collection = client.get_collection(DB_NAME)
|
252 |
-
|
253 |
-
# Fetch all data from ChromaDB
|
254 |
-
results = collection.get(include=["documents", "metadatas", "embeddings"])
|
255 |
-
data = {
|
256 |
-
"code": results["documents"],
|
257 |
-
"sequence": [meta["sequence"] for meta in results["metadatas"]],
|
258 |
-
"vectors": results["embeddings"], # Semantic 6D vectors
|
259 |
-
"description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]],
|
260 |
-
"program_vectors": [eval(meta.get('program_vectors', '[]')) for meta in results["metadatas"]] # Store structural vectors
|
261 |
-
}
|
262 |
-
|
263 |
-
# Create a Hugging Face Dataset
|
264 |
-
dataset = Dataset.from_dict(data)
|
265 |
-
|
266 |
-
# Push to Hugging Face Hub
|
267 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
268 |
dataset.push_to_hub(dataset_name, token=token)
|
269 |
logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
|
270 |
except Exception as e:
|
|
|
184 |
client = init_chromadb()
|
185 |
|
186 |
# Do not clear or populate with defaults here—let UI buttons handle this
|
187 |
+
try:
|
188 |
+
collection = client.get_or_create_collection(DB_NAME)
|
189 |
+
logger.info(f"Using existing or new ChromaDB collection: {DB_NAME}")
|
190 |
+
except Exception as e:
|
191 |
+
logger.error(f"Error accessing ChromaDB collection: {e}")
|
192 |
+
raise
|
193 |
|
194 |
# Process in batches with progress bar
|
195 |
total_entries = len(dataset_list)
|
|
|
250 |
save_chromadb_to_hf()
|
251 |
|
252 |
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
|
253 |
+
"""Save ChromaDB data to Hugging Face Dataset, with error handling."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
254 |
try:
|
255 |
+
client = init_chromadb()
|
256 |
+
collection = client.get_collection(DB_NAME)
|
257 |
+
|
258 |
+
# Fetch all data from ChromaDB
|
259 |
+
results = collection.get(include=["documents", "metadatas", "embeddings"])
|
260 |
+
data = {
|
261 |
+
"code": results["documents"],
|
262 |
+
"sequence": [meta["sequence"] for meta in results["metadatas"]],
|
263 |
+
"vectors": results["embeddings"], # Semantic 6D vectors
|
264 |
+
"description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]],
|
265 |
+
"program_vectors": [eval(meta.get('program_vectors', '[]')) for meta in results["metadatas"]] # Store structural vectors
|
266 |
+
}
|
267 |
+
|
268 |
+
# Create a Hugging Face Dataset
|
269 |
+
dataset = Dataset.from_dict(data)
|
270 |
+
|
271 |
+
# Push to Hugging Face Hub
|
272 |
dataset.push_to_hub(dataset_name, token=token)
|
273 |
logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
|
274 |
except Exception as e:
|