# Spaces: Running
# save_to_hf.py
import os

import chromadb
from datasets import Dataset

from database import init_chromadb, create_collection
def save_chromadb_to_hf(dataset_name="python_program_vectors", token=None):
    """Export the ChromaDB collection to a Hugging Face Hub dataset.

    Fetches the documents, metadatas and embeddings of the collection
    returned by ``create_collection(init_chromadb())``, builds a
    ``datasets.Dataset`` with the columns ``code``, ``sequence`` and
    ``vectors``, pushes it to the Hub and prints a confirmation line.

    Args:
        dataset_name: Repository name handed to ``Dataset.push_to_hub``.
        token: Hugging Face access token. When ``None``, the ``HF_TOKEN``
            environment variable is used instead; if that is unset as well,
            ``None`` is forwarded to ``push_to_hub`` so the Hub client can
            apply its own credential lookup (see the ``push_to_hub`` docs).

    Raises:
        Any exception raised by the ChromaDB or Hugging Face calls is
        propagated unchanged.
    """
    # Never embed a credential in source: resolve it from the caller or the
    # environment so the script works unmodified and no secret is committed.
    if token is None:
        token = os.environ.get("HF_TOKEN")

    client = init_chromadb()
    collection = create_collection(client)

    # Fetch all data from ChromaDB.
    results = collection.get(include=["documents", "metadatas", "embeddings"])

    # A record stored without metadata (or without a "sequence" key) is
    # exported with a null sequence instead of aborting the whole export.
    sequences = [
        (meta or {}).get("sequence") for meta in results["metadatas"]
    ]

    data = {
        "code": results["documents"],
        "sequence": sequences,
        "vectors": results["embeddings"],
    }

    # Create a Hugging Face Dataset and push it to the Hub.
    dataset = Dataset.from_dict(data)
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
# Script entry point: run the export with its default arguments only when
# this file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    save_chromadb_to_hf()