Spaces:
Running
Running
# src/utils/database_cleanup.py
import chromadb | |
import shutil | |
from pathlib import Path | |
import asyncio | |
import gc | |
import random | |
from typing import List, Dict, Tuple | |
from src.utils.logger import logger | |
from config.config import settings | |
async def cleanup_chroma() -> Tuple[List[str], bool]:
    """Empty and drop every ChromaDB collection under ``settings.CHROMA_PATH``.

    For each collection returned by ``list_collections()`` all stored IDs are
    deleted and then the collection itself is dropped. A failure on one
    collection is logged and recorded in the report, and the remaining
    collections are still processed.

    Returns:
        A ``(details, restart_needed)`` tuple. ``details`` is a list of
        human-readable status lines. ``restart_needed`` is True when cleaning
        any collection raised, or when collections are still listed after
        the pass.

    Raises:
        Exception: If anything outside the per-collection handling fails
            (creating the client, listing collections). The original error
            is chained as ``__cause__``.
    """
    details: List[str] = []
    restart_needed = False

    try:
        client = chromadb.PersistentClient(
            path=settings.CHROMA_PATH,
            settings=chromadb.Settings(
                allow_reset=True,
                is_persistent=True,
                anonymized_telemetry=False,
            ),
        )

        collections = client.list_collections()
        if not collections:
            details.append("No collections found in ChromaDB")
            return details, restart_needed

        for collection in collections:
            try:
                # Remove the documents first so the report can state how
                # many were deleted, then drop the collection itself.
                all_ids = collection.get()['ids']
                if all_ids:
                    collection.delete(ids=all_ids)
                    details.append(
                        f"Deleted {len(all_ids)} documents from collection {collection.name}")
                else:
                    details.append(
                        f"Collection {collection.name} was already empty")

                client.delete_collection(collection.name)
                details.append(f"Deleted collection {collection.name}")
            except Exception as e:
                # Best effort: keep going with the other collections, but
                # flag that the cleanup did not fully succeed.
                logger.warning(
                    f"Error cleaning collection {collection.name}: {str(e)}")
                details.append(
                    f"Error cleaning collection {collection.name}: {str(e)}")
                restart_needed = True

        # Anything still listed after the pass was not removed.
        if client.list_collections():
            restart_needed = True
            details.append("Some collections might require manual reset")

        return details, restart_needed
    except Exception as e:
        # Chain the cause so the original traceback is preserved explicitly.
        raise Exception(f"ChromaDB cleanup failed: {str(e)}") from e
async def cleanup_mongodb(mongodb) -> List[str]:
    """Delete all documents from every collection of the MongoDB store.

    The four core collections are cleared through the handles exposed on
    ``mongodb`` (``chat_history``, ``conversations``, ``documents`` and
    ``db.vector_metadata``). Every other name returned by
    ``mongodb.db.list_collection_names()`` is then cleared through
    ``mongodb.db[name]``. A failure on a single collection is logged and
    recorded in the report, and the remaining collections are still
    processed.

    Args:
        mongodb: Store object exposing ``db`` plus the ``chat_history``,
            ``conversations`` and ``documents`` collection attributes.

    Returns:
        A list of human-readable status lines, one per collection handled
        (success or failure).

    Raises:
        Exception: If listing the collections or resolving a core collection
            handle fails. The original error is chained as ``__cause__``.
    """
    details: List[str] = []

    try:
        collections = await mongodb.db.list_collection_names()

        # Report name -> collection handle. ``documents`` is reported as
        # ``knowledge_base``.
        core_collections = {
            'chat_history': mongodb.chat_history,
            'conversations': mongodb.conversations,
            'knowledge_base': mongodb.documents,
            'vector_metadata': mongodb.db.vector_metadata,
        }

        for name, collection in core_collections.items():
            try:
                result = await collection.delete_many({})
                details.append(
                    f"Cleared {name} ({result.deleted_count} documents)")
            except Exception as e:
                logger.error(f"Error clearing {name}: {str(e)}")
                details.append(f"Error clearing {name}: {str(e)}")

        # Sweep up anything that is not one of the core collections.
        for coll_name in collections:
            if coll_name in core_collections:
                continue
            try:
                result = await mongodb.db[coll_name].delete_many({})
                details.append(
                    f"Cleared additional collection {coll_name} ({result.deleted_count} documents)")
            except Exception as e:
                logger.error(
                    f"Error clearing additional collection {coll_name}: {str(e)}")
                # Record the failure in the report too, as the core loop
                # does, so it is not silently missing from the summary.
                details.append(
                    f"Error clearing additional collection {coll_name}: {str(e)}")

        return details
    except Exception as e:
        # Chain the cause so the original traceback is preserved explicitly.
        raise Exception(f"MongoDB cleanup failed: {str(e)}") from e
async def cleanup_files() -> List[str]:
    """Remove regular files from the upload and temp directories.

    Only the direct children of each directory are examined, and only
    entries that are regular files are deleted. After the sweep, a directory
    that has no entries left is removed as well.

    Returns:
        A list of human-readable status lines describing each deletion,
        each failure, and each directory that was missing or removed.
    """
    report = []

    # Label used in the report -> directory to sweep.
    targets = {
        'uploads': Path("uploads"),
        'temp_downloads': Path(settings.TEMP_DOWNLOAD_DIR),
        'temp_dir': Path('./temp'),
    }

    for label, folder in targets.items():
        if not folder.exists():
            report.append(f"No {label} directory found")
            continue

        try:
            for entry in folder.glob('*'):
                try:
                    if not entry.is_file():
                        continue
                    entry.unlink()
                    report.append(
                        f"Deleted file: {entry.name} from {label}")
                except Exception as exc:
                    report.append(
                        f"Error deleting file {entry.name} from {label}: {str(exc)}")

            # The directory is only removed once nothing is left inside it.
            has_leftovers = any(folder.iterdir())
            if not has_leftovers:
                folder.rmdir()
                report.append(f"Removed empty {label} directory")
        except Exception as exc:
            report.append(
                f"Error cleaning {label} directory: {str(exc)}")

    return report
async def perform_cleanup(mongodb, include_files: bool = True) -> Dict:
    """
    Perform comprehensive cleanup of all databases and files.

    Runs the ChromaDB, MongoDB and (optionally) file cleanup steps in that
    order. A failure in one step is logged and recorded in the summary, and
    the remaining steps still run.

    Args:
        mongodb: MongoDB store instance, passed through to cleanup_mongodb
        include_files (bool): Whether to also delete uploaded files

    Returns:
        Dict: Cleanup operation summary with the keys:
            "status": "success", "partial_success" or "error"
            "message": human-readable outcome
            "details": per-section {"status", "details"} entries
            "restart_needed": flag reported by the ChromaDB step

    Raises:
        Exception: Any unexpected error outside the per-step handlers is
            logged and re-raised unchanged.
    """
    cleanup_summary = {
        "chroma_db": {"status": "not_started", "details": []},
        "mongodb": {"status": "not_started", "details": []},
        "files": {"status": "not_started", "details": []}
    }
    # Bound up front so it exists even when cleanup_chroma() raises; it was
    # previously only assigned on the success path.
    restart_needed = False

    try:
        # Clean ChromaDB
        try:
            details, restart_needed = await cleanup_chroma()
            cleanup_summary["chroma_db"] = {
                "status": "success" if not restart_needed else "partial",
                "details": details
            }
        except Exception as e:
            logger.error(f"Error cleaning ChromaDB: {str(e)}")
            cleanup_summary["chroma_db"] = {
                "status": "error",
                "details": [str(e)]
            }

        # Clean MongoDB
        try:
            details = await cleanup_mongodb(mongodb)
            cleanup_summary["mongodb"] = {
                "status": "success",
                "details": details
            }
        except Exception as e:
            logger.error(f"Error cleaning MongoDB: {str(e)}")
            cleanup_summary["mongodb"] = {
                "status": "error",
                "details": [str(e)]
            }

        # Clean files if requested
        if include_files:
            try:
                details = await cleanup_files()
                cleanup_summary["files"] = {
                    "status": "success",
                    "details": details
                }
            except Exception as e:
                logger.error(f"Error cleaning files: {str(e)}")
                cleanup_summary["files"] = {
                    "status": "error",
                    "details": [str(e)]
                }

        # Only sections that actually ran count towards the overall status;
        # a skipped file cleanup must not mask a total failure.
        attempted = [
            section["status"]
            for section in cleanup_summary.values()
            if section["status"] != "not_started"
        ]

        # Check "all failed" first: it is a special case of "any failed"
        # and was unreachable when tested after it.
        overall_status = "success"
        if all(status == "error" for status in attempted):
            overall_status = "error"
        elif restart_needed:
            overall_status = "partial_success"
            # Stored on the summary dict, so it also appears under
            # details["message"] in the returned value (kept for
            # backward compatibility).
            cleanup_summary["message"] = "Cleanup partially completed. Server restart required to complete ChromaDB cleanup."
        elif "error" in attempted:
            overall_status = "partial_success"

        return {
            "status": overall_status,
            "message": cleanup_summary.get("message", "Cleanup operation completed"),
            "details": cleanup_summary,
            "restart_needed": restart_needed
        }
    except Exception as e:
        logger.error(f"Error in cleanup operation: {str(e)}")
        raise