import asyncio
from datetime import datetime, timedelta, timezone
from typing import Optional

from litellm._logging import verbose_proxy_logger
from litellm.caching import RedisCache
from litellm.constants import (
    SPEND_LOG_CLEANUP_BATCH_SIZE,
    SPEND_LOG_CLEANUP_JOB_NAME,
    SPEND_LOG_RUN_LOOPS,
)
from litellm.litellm_core_utils.duration_parser import duration_in_seconds
from litellm.proxy.utils import PrismaClient


class SpendLogCleanup:
    """
    Handles cleaning up old spend logs based on the maximum retention period.
    Deletes logs in batches to prevent timeouts.
    Uses PodLockManager to ensure only one pod runs cleanup in multi-pod deployments.
    """

    def __init__(
        self, general_settings=None, redis_cache: Optional[RedisCache] = None
    ):
        self.batch_size = SPEND_LOG_CLEANUP_BATCH_SIZE
        self.retention_seconds: Optional[int] = None

        # Fall back to the proxy server's global settings when none are passed in.
        # Imported inside __init__ to avoid importing proxy_server at module import time.
        from litellm.proxy.proxy_server import general_settings as default_settings

        self.general_settings = general_settings or default_settings

        from litellm.proxy.proxy_server import proxy_logging_obj

        self.pod_lock_manager = (
            proxy_logging_obj.db_spend_update_writer.pod_lock_manager
        )
        verbose_proxy_logger.info(
            f"SpendLogCleanup initialized with batch size: {self.batch_size}"
        )

    def _should_delete_spend_logs(self) -> bool:
        """
        Determines if logs should be deleted based on the max retention period in settings.
        """
        retention_setting = self.general_settings.get(
            "maximum_spend_logs_retention_period"
        )
        verbose_proxy_logger.info(f"Checking retention setting: {retention_setting}")

        if retention_setting is None:
            verbose_proxy_logger.info("No retention setting found")
            return False

        try:
            if isinstance(retention_setting, int):
                retention_setting = str(retention_setting)
            self.retention_seconds = duration_in_seconds(retention_setting)
            verbose_proxy_logger.info(
                f"Retention period set to {self.retention_seconds} seconds"
            )
            return True
        except ValueError as e:
            verbose_proxy_logger.error(
                f"Invalid maximum_spend_logs_retention_period value: {retention_setting}, error: {str(e)}"
            )
            return False
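
    # Illustrative configuration sketch (an assumption, not taken verbatim from
    # the docs): the retention period read above typically comes from the proxy
    # config's general_settings block. The "7d" duration string assumes the
    # duration parser accepts day-suffixed values.
    #
    #   general_settings:
    #     maximum_spend_logs_retention_period: "7d"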

    async def _delete_old_logs(
        self, prisma_client: PrismaClient, cutoff_date: datetime
    ) -> int:
        """
        Helper method to delete old logs in batches.
        Returns the total number of logs deleted.
        """
        total_deleted = 0
        run_count = 0
        while True:
            if run_count > SPEND_LOG_RUN_LOOPS:
                verbose_proxy_logger.info(
                    "Reached the maximum number of delete batches for this run; "
                    "remaining logs will be deleted in the next run"
                )
                break
            # Step 1: Find logs to delete
            logs_to_delete = await prisma_client.db.litellm_spendlogs.find_many(
                where={"startTime": {"lt": cutoff_date}},
                take=self.batch_size,
            )
            verbose_proxy_logger.info(f"Found {len(logs_to_delete)} logs in this batch")

            if not logs_to_delete:
                verbose_proxy_logger.info(
                    f"No more logs to delete. Total deleted: {total_deleted}"
                )
                break

            request_ids = [log.request_id for log in logs_to_delete]

            # Step 2: Delete them in one go
            await prisma_client.db.litellm_spendlogs.delete_many(
                where={"request_id": {"in": request_ids}}
            )
            total_deleted += len(logs_to_delete)
            run_count += 1

            # Add a small sleep to prevent overwhelming the database
            await asyncio.sleep(0.1)

        return total_deleted
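
    # Illustrative note on sizing: with the loop above, a single invocation
    # deletes at most (SPEND_LOG_RUN_LOOPS + 1) * SPEND_LOG_CLEANUP_BATCH_SIZE
    # rows; anything older than the cutoff that remains is picked up by the
    # next scheduled run.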

    async def cleanup_old_spend_logs(self, prisma_client: PrismaClient) -> None:
        """
        Main cleanup function. Deletes old spend logs in batches.
        If a pod lock manager is available, ensures only one pod runs cleanup;
        otherwise cleanup runs without distributed locking.
        """
        try:
            verbose_proxy_logger.info(f"Cleanup job triggered at {datetime.now()}")
            if not self._should_delete_spend_logs():
                verbose_proxy_logger.info(
                    "Skipping cleanup — invalid or missing retention setting."
                )
                return

            if self.retention_seconds is None:
                verbose_proxy_logger.error(
                    "Retention seconds is None, cannot proceed with cleanup"
                )
                return
            # If we have a pod lock manager, try to acquire the lock
            if self.pod_lock_manager and self.pod_lock_manager.redis_cache:
                lock_acquired = await self.pod_lock_manager.acquire_lock(
                    cronjob_id=SPEND_LOG_CLEANUP_JOB_NAME,
                )
                verbose_proxy_logger.info(
                    f"Lock acquisition attempt: {'successful' if lock_acquired else 'failed'} at {datetime.now()}"
                )
                if not lock_acquired:
                    verbose_proxy_logger.info("Another pod is already running cleanup")
                    return

            cutoff_date = datetime.now(timezone.utc) - timedelta(
                seconds=float(self.retention_seconds)
            )
            verbose_proxy_logger.info(
                f"Deleting logs older than {cutoff_date.isoformat()}"
            )

            # Perform the actual deletion
            total_deleted = await self._delete_old_logs(prisma_client, cutoff_date)
            verbose_proxy_logger.info(f"Deleted {total_deleted} logs")
        except Exception as e:
            verbose_proxy_logger.error(f"Error during cleanup: {str(e)}")
            return  # Return after error handling
        finally:
            # Always release the lock if we have a pod lock manager
            if self.pod_lock_manager and self.pod_lock_manager.redis_cache:
                await self.pod_lock_manager.release_lock(
                    cronjob_id=SPEND_LOG_CLEANUP_JOB_NAME
                )
                verbose_proxy_logger.info("Released cleanup lock")