import asyncio
from datetime import datetime, timedelta, timezone
from typing import Optional

from litellm._logging import verbose_proxy_logger
from litellm.caching import RedisCache
from litellm.constants import (
    SPEND_LOG_CLEANUP_BATCH_SIZE,
    SPEND_LOG_CLEANUP_JOB_NAME,
    SPEND_LOG_RUN_LOOPS,
)
from litellm.litellm_core_utils.duration_parser import duration_in_seconds
from litellm.proxy.utils import PrismaClient

class SpendLogCleanup:
    """
    Handles cleaning up old spend logs based on the maximum retention period.

    Deletes logs in batches to prevent timeouts.
    Uses PodLockManager to ensure only one pod runs cleanup in multi-pod deployments.
    """

    def __init__(
        self, general_settings=None, redis_cache: Optional[RedisCache] = None
    ):
        self.batch_size = SPEND_LOG_CLEANUP_BATCH_SIZE
        self.retention_seconds: Optional[int] = None

        # Imported lazily to avoid a circular import with proxy_server.
        from litellm.proxy.proxy_server import general_settings as default_settings

        self.general_settings = general_settings or default_settings

        from litellm.proxy.proxy_server import proxy_logging_obj

        pod_lock_manager = proxy_logging_obj.db_spend_update_writer.pod_lock_manager
        self.pod_lock_manager = pod_lock_manager
        verbose_proxy_logger.info(
            f"SpendLogCleanup initialized with batch size: {self.batch_size}"
        )
    def _should_delete_spend_logs(self) -> bool:
        """
        Determines whether logs should be deleted, based on the maximum
        retention period in settings.
        """
        retention_setting = self.general_settings.get(
            "maximum_spend_logs_retention_period"
        )
        verbose_proxy_logger.info(f"Checking retention setting: {retention_setting}")
        if retention_setting is None:
            verbose_proxy_logger.info("No retention setting found")
            return False
        try:
            # Integer values are treated as a number of seconds.
            if isinstance(retention_setting, int):
                retention_setting = str(retention_setting)
            self.retention_seconds = duration_in_seconds(retention_setting)
            verbose_proxy_logger.info(
                f"Retention period set to {self.retention_seconds} seconds"
            )
            return True
        except ValueError as e:
            verbose_proxy_logger.error(
                f"Invalid maximum_spend_logs_retention_period value: {retention_setting}, error: {str(e)}"
            )
            return False
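
    # Illustrative examples (not from the original source) of retention
    # values the check above accepts. Duration strings are parsed by
    # litellm's duration_in_seconds; integers are coerced to strings and
    # read as a number of seconds.
    #
    #   general_settings:
    #     maximum_spend_logs_retention_period: "30d"     # duration string
    #
    #   general_settings:
    #     maximum_spend_logs_retention_period: 2592000   # seconds (30 days)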

    async def _delete_old_logs(
        self, prisma_client: PrismaClient, cutoff_date: datetime
    ) -> int:
        """
        Helper method to delete old logs in batches.
        Returns the total number of logs deleted.
        """
        total_deleted = 0
        run_count = 0
        while True:
            if run_count > SPEND_LOG_RUN_LOOPS:
                verbose_proxy_logger.info(
                    "Max logs deleted for this run - 100,000; remaining logs will be deleted in the next run"
                )
                break
            # Step 1: Find logs to delete
            logs_to_delete = await prisma_client.db.litellm_spendlogs.find_many(
                where={"startTime": {"lt": cutoff_date}},
                take=self.batch_size,
            )
            verbose_proxy_logger.info(f"Found {len(logs_to_delete)} logs in this batch")
            if not logs_to_delete:
                verbose_proxy_logger.info(
                    f"No more logs to delete. Total deleted: {total_deleted}"
                )
                break
            request_ids = [log.request_id for log in logs_to_delete]
            # Step 2: Delete them in one go
            await prisma_client.db.litellm_spendlogs.delete_many(
                where={"request_id": {"in": request_ids}}
            )
            total_deleted += len(logs_to_delete)
            run_count += 1
            # Add a small sleep to prevent overwhelming the database
            await asyncio.sleep(0.1)
        return total_deleted
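
    # Worked example of the per-run ceiling (the constant values here are
    # assumptions inferred from the "100,000" log message above, not read
    # from litellm.constants): with batch_size = 1_000 and
    # SPEND_LOG_RUN_LOOPS = 100, a single run deletes at most
    # 1_000 * 100 = 100_000 rows before deferring to the next run.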

    async def cleanup_old_spend_logs(self, prisma_client: PrismaClient) -> None:
        """
        Main cleanup function. Deletes old spend logs in batches.

        If pod_lock_manager is available, ensures only one pod runs cleanup.
        If no pod_lock_manager, runs cleanup without distributed locking.
        """
        try:
            verbose_proxy_logger.info(f"Cleanup job triggered at {datetime.now()}")
            if not self._should_delete_spend_logs():
                verbose_proxy_logger.info(
                    "Skipping cleanup - invalid or missing retention setting."
                )
                return
            if self.retention_seconds is None:
                verbose_proxy_logger.error(
                    "Retention seconds is None, cannot proceed with cleanup"
                )
                return
            # If we have a pod lock manager, try to acquire the lock
            if self.pod_lock_manager and self.pod_lock_manager.redis_cache:
                lock_acquired = await self.pod_lock_manager.acquire_lock(
                    cronjob_id=SPEND_LOG_CLEANUP_JOB_NAME,
                )
                verbose_proxy_logger.info(
                    f"Lock acquisition attempt: {'successful' if lock_acquired else 'failed'} at {datetime.now()}"
                )
                if not lock_acquired:
                    verbose_proxy_logger.info("Another pod is already running cleanup")
                    return
            cutoff_date = datetime.now(timezone.utc) - timedelta(
                seconds=float(self.retention_seconds)
            )
            verbose_proxy_logger.info(
                f"Deleting logs older than {cutoff_date.isoformat()}"
            )
            # Perform the actual deletion
            total_deleted = await self._delete_old_logs(prisma_client, cutoff_date)
            verbose_proxy_logger.info(f"Deleted {total_deleted} logs")
        except Exception as e:
            verbose_proxy_logger.error(f"Error during cleanup: {str(e)}")
            return  # Return after error handling
        finally:
            # Always release the lock if we have a pod lock manager
            if self.pod_lock_manager and self.pod_lock_manager.redis_cache:
                await self.pod_lock_manager.release_lock(
                    cronjob_id=SPEND_LOG_CLEANUP_JOB_NAME
                )
                verbose_proxy_logger.info("Released cleanup lock")