# DesertWolf's picture
# Upload folder using huggingface_hub
# 447ebeb verified
import asyncio
from datetime import datetime, timedelta, timezone
from typing import Optional
from litellm._logging import verbose_proxy_logger
from litellm.caching import RedisCache
from litellm.constants import (
SPEND_LOG_CLEANUP_BATCH_SIZE,
SPEND_LOG_CLEANUP_JOB_NAME,
SPEND_LOG_RUN_LOOPS,
)
from litellm.litellm_core_utils.duration_parser import duration_in_seconds
from litellm.proxy.utils import PrismaClient
class SpendLogCleanup:
    """
    Handles cleaning up old spend logs based on maximum retention period.

    Logs are deleted in batches (``SPEND_LOG_CLEANUP_BATCH_SIZE`` rows per
    batch, at most ``SPEND_LOG_RUN_LOOPS`` batches per run) to prevent
    timeouts. When a pod lock manager backed by Redis is available, the
    cleanup is guarded by a distributed lock so only one pod runs it at a time.
    """

    def __init__(
        self,
        general_settings: Optional[dict] = None,
        redis_cache: Optional[RedisCache] = None,
    ):
        """
        Args:
            general_settings: Proxy settings mapping that is read for the
                ``maximum_spend_logs_retention_period`` key. When omitted (or
                falsy) the proxy server's module-level ``general_settings``
                is used instead.
            redis_cache: Accepted for interface compatibility; this
                constructor does not read it. The lock manager is taken from
                ``proxy_logging_obj.db_spend_update_writer``.
        """
        self.batch_size = SPEND_LOG_CLEANUP_BATCH_SIZE
        # Populated by _should_delete_spend_logs() once the setting is parsed.
        self.retention_seconds: Optional[int] = None

        # Imported at call time rather than at module import time
        # (presumably to avoid a circular import with proxy_server - confirm).
        from litellm.proxy.proxy_server import general_settings as default_settings
        from litellm.proxy.proxy_server import proxy_logging_obj

        self.general_settings = general_settings or default_settings
        self.pod_lock_manager = (
            proxy_logging_obj.db_spend_update_writer.pod_lock_manager
        )
        verbose_proxy_logger.info(
            f"SpendLogCleanup initialized with batch size: {self.batch_size}"
        )

    def _should_delete_spend_logs(self) -> bool:
        """
        Determines if logs should be deleted based on the max retention period in settings.

        Reads ``maximum_spend_logs_retention_period`` from the general
        settings and, when it parses, stores the result in
        ``self.retention_seconds``.

        Returns:
            True when a retention period is configured and was parsed by
            ``duration_in_seconds``; False when the setting is missing or
            parsing raised ``ValueError``.
        """
        retention_setting = self.general_settings.get(
            "maximum_spend_logs_retention_period"
        )
        verbose_proxy_logger.info(f"Checking retention setting: {retention_setting}")

        if retention_setting is None:
            verbose_proxy_logger.info("No retention setting found")
            return False

        try:
            # duration_in_seconds is handed a string, so integer settings are
            # converted to their string form first.
            if isinstance(retention_setting, int):
                retention_setting = str(retention_setting)
            self.retention_seconds = duration_in_seconds(retention_setting)
        except ValueError as e:
            verbose_proxy_logger.error(
                f"Invalid maximum_spend_logs_retention_period value: {retention_setting}, error: {str(e)}"
            )
            return False

        verbose_proxy_logger.info(
            f"Retention period set to {self.retention_seconds} seconds"
        )
        return True

    async def _delete_old_logs(
        self, prisma_client: PrismaClient, cutoff_date: datetime
    ) -> int:
        """
        Helper method to delete old logs in batches.

        Args:
            prisma_client: Client whose ``db.litellm_spendlogs`` table is
                queried and pruned.
            cutoff_date: Rows whose ``startTime`` is earlier than this are
                deleted.

        Returns:
            The total number of logs deleted in this run.
        """
        total_deleted = 0
        run_count = 0
        while True:
            # run_count is the number of batches already completed, so ">="
            # caps a run at exactly SPEND_LOG_RUN_LOOPS batches (">" allowed
            # one extra batch).
            if run_count >= SPEND_LOG_RUN_LOOPS:
                # NOTE(review): the figure in this message is hard-coded; the
                # effective cap is batch_size * SPEND_LOG_RUN_LOOPS.
                verbose_proxy_logger.info(
                    "Max logs deleted - 1,00,000, rest of the logs will be deleted in next run"
                )
                break

            # Step 1: Find logs to delete
            logs_to_delete = await prisma_client.db.litellm_spendlogs.find_many(
                where={"startTime": {"lt": cutoff_date}},
                take=self.batch_size,
            )
            verbose_proxy_logger.info(f"Found {len(logs_to_delete)} logs in this batch")

            if not logs_to_delete:
                verbose_proxy_logger.info(
                    f"No more logs to delete. Total deleted: {total_deleted}"
                )
                break

            request_ids = [log.request_id for log in logs_to_delete]

            # Step 2: Delete them in one go
            await prisma_client.db.litellm_spendlogs.delete_many(
                where={"request_id": {"in": request_ids}}
            )

            total_deleted += len(logs_to_delete)
            run_count += 1

            # Add a small sleep to prevent overwhelming the database
            await asyncio.sleep(0.1)

        return total_deleted

    async def cleanup_old_spend_logs(self, prisma_client: PrismaClient) -> None:
        """
        Main cleanup function. Deletes old spend logs in batches.

        If pod_lock_manager is available (and has a Redis cache), ensures only
        one pod runs cleanup. If no pod_lock_manager, runs cleanup without
        distributed locking.

        Exceptions raised while checking settings, acquiring the lock or
        deleting rows are logged and swallowed. The lock is released in
        ``finally`` only when this call actually acquired it.

        Args:
            prisma_client: Database client passed through to the batch delete.
        """
        # Tracks whether THIS call owns the lock, so the finally block never
        # releases a lock on the skip paths or when another pod holds it.
        lock_acquired = False
        try:
            verbose_proxy_logger.info(f"Cleanup job triggered at {datetime.now()}")

            if not self._should_delete_spend_logs():
                verbose_proxy_logger.info(
                    "Skipping cleanup — invalid or missing retention setting."
                )
                return

            if self.retention_seconds is None:
                verbose_proxy_logger.error(
                    "Retention seconds is None, cannot proceed with cleanup"
                )
                return

            # If we have a pod lock manager, try to acquire the lock
            if self.pod_lock_manager and self.pod_lock_manager.redis_cache:
                lock_acquired = bool(
                    await self.pod_lock_manager.acquire_lock(
                        cronjob_id=SPEND_LOG_CLEANUP_JOB_NAME,
                    )
                )
                verbose_proxy_logger.info(
                    f"Lock acquisition attempt: {'successful' if lock_acquired else 'failed'} at {datetime.now()}"
                )

                if not lock_acquired:
                    verbose_proxy_logger.info("Another pod is already running cleanup")
                    return

            cutoff_date = datetime.now(timezone.utc) - timedelta(
                seconds=float(self.retention_seconds)
            )
            verbose_proxy_logger.info(
                f"Deleting logs older than {cutoff_date.isoformat()}"
            )

            # Perform the actual deletion
            total_deleted = await self._delete_old_logs(prisma_client, cutoff_date)
            verbose_proxy_logger.info(f"Deleted {total_deleted} logs")

        except Exception as e:
            # Top-level boundary of the job: log and swallow the error.
            verbose_proxy_logger.error(f"Error during cleanup: {str(e)}")
            return  # Return after error handling
        finally:
            # Release the lock only if this call acquired it.
            if lock_acquired:
                await self.pod_lock_manager.release_lock(
                    cronjob_id=SPEND_LOG_CLEANUP_JOB_NAME
                )
                verbose_proxy_logger.info("Released cleanup lock")