File size: 6,395 Bytes
447ebeb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import asyncio
from datetime import datetime, timedelta, timezone
from typing import Optional

from litellm._logging import verbose_proxy_logger
from litellm.caching import RedisCache
from litellm.constants import (
    SPEND_LOG_CLEANUP_BATCH_SIZE,
    SPEND_LOG_CLEANUP_JOB_NAME,
    SPEND_LOG_RUN_LOOPS,
)
from litellm.litellm_core_utils.duration_parser import duration_in_seconds
from litellm.proxy.utils import PrismaClient


class SpendLogCleanup:
    """
    Handles cleaning up old spend logs based on maximum retention period.

    Logs are deleted in batches to prevent timeouts. When a pod lock manager
    with a Redis cache is available, a distributed lock is acquired so that
    only one pod runs the cleanup in multi-pod deployments.
    """

    def __init__(self, general_settings=None, redis_cache: Optional[RedisCache] = None):
        """
        Args:
            general_settings: Settings mapping read for
                ``maximum_spend_logs_retention_period``. When falsy, the proxy
                server's module-level ``general_settings`` is used instead.
            redis_cache: Not used by this class; the lock manager is taken
                from ``proxy_logging_obj``. Kept so existing callers that pass
                it keep working.
        """
        self.batch_size = SPEND_LOG_CLEANUP_BATCH_SIZE
        # Populated by _should_delete_spend_logs() once the setting is parsed.
        self.retention_seconds: Optional[int] = None

        # Imported here rather than at module level.
        # NOTE(review): presumably to avoid a circular import with
        # proxy_server - confirm before moving these to the top of the file.
        from litellm.proxy.proxy_server import general_settings as default_settings
        from litellm.proxy.proxy_server import proxy_logging_obj

        self.general_settings = general_settings or default_settings
        self.pod_lock_manager = proxy_logging_obj.db_spend_update_writer.pod_lock_manager
        verbose_proxy_logger.info(
            f"SpendLogCleanup initialized with batch size: {self.batch_size}"
        )

    def _should_delete_spend_logs(self) -> bool:
        """
        Determines if logs should be deleted based on the max retention period in settings.

        Side effect: on success, stores the parsed period in
        ``self.retention_seconds``.

        Returns:
            True when ``maximum_spend_logs_retention_period`` is set and was
            parsed successfully; False when it is missing or the parser
            raised ``ValueError``.
        """
        retention_setting = self.general_settings.get(
            "maximum_spend_logs_retention_period"
        )
        verbose_proxy_logger.info(f"Checking retention setting: {retention_setting}")

        if retention_setting is None:
            verbose_proxy_logger.info("No retention setting found")
            return False

        try:
            # The duration parser is given a string, so normalise bare ints.
            if isinstance(retention_setting, int):
                retention_setting = str(retention_setting)
            self.retention_seconds = duration_in_seconds(retention_setting)
        except ValueError as e:
            verbose_proxy_logger.error(
                f"Invalid maximum_spend_logs_retention_period value: {retention_setting}, error: {str(e)}"
            )
            return False

        verbose_proxy_logger.info(
            f"Retention period set to {self.retention_seconds} seconds"
        )
        return True

    async def _delete_old_logs(
        self, prisma_client: PrismaClient, cutoff_date: datetime
    ) -> int:
        """
        Helper method to delete old logs in batches.

        At most ``SPEND_LOG_RUN_LOOPS`` batches of ``self.batch_size`` rows are
        processed per call; anything left over is picked up by the next run.

        Args:
            prisma_client: Client whose ``db.litellm_spendlogs`` table is cleaned.
            cutoff_date: Rows with ``startTime`` earlier than this are deleted.

        Returns:
            The total number of logs selected for deletion across all batches.
        """
        total_deleted = 0
        run_count = 0
        while True:
            # ">=" so that exactly SPEND_LOG_RUN_LOOPS batches run
            # (">" allowed one extra batch).
            if run_count >= SPEND_LOG_RUN_LOOPS:
                verbose_proxy_logger.info(
                    f"Max logs deleted - {total_deleted}, rest of the logs will be deleted in next run"
                )
                break

            # Step 1: Find logs to delete
            logs_to_delete = await prisma_client.db.litellm_spendlogs.find_many(
                where={"startTime": {"lt": cutoff_date}},
                take=self.batch_size,
            )
            verbose_proxy_logger.info(f"Found {len(logs_to_delete)} logs in this batch")

            if not logs_to_delete:
                verbose_proxy_logger.info(
                    f"No more logs to delete. Total deleted: {total_deleted}"
                )
                break

            request_ids = [log.request_id for log in logs_to_delete]

            # Step 2: Delete them in one go
            await prisma_client.db.litellm_spendlogs.delete_many(
                where={"request_id": {"in": request_ids}}
            )

            total_deleted += len(logs_to_delete)
            run_count += 1

            # Add a small sleep to prevent overwhelming the database
            await asyncio.sleep(0.1)

        return total_deleted

    async def cleanup_old_spend_logs(self, prisma_client: PrismaClient) -> None:
        """
        Main cleanup function. Deletes old spend logs in batches.
        If pod_lock_manager is available, ensures only one pod runs cleanup.
        If no pod_lock_manager, runs cleanup without distributed locking.

        Errors are logged and swallowed; this method does not raise.

        Args:
            prisma_client: Client used to query and delete spend logs.
        """
        # Tracks whether *this* call took the distributed lock, so the
        # finally-block never releases a lock it does not hold (early returns,
        # failed acquisition, or an error before acquisition).
        lock_acquired = False
        try:
            verbose_proxy_logger.info(f"Cleanup job triggered at {datetime.now()}")

            if not self._should_delete_spend_logs():
                verbose_proxy_logger.info(
                    "Skipping cleanup — invalid or missing retention setting."
                )
                return

            if self.retention_seconds is None:
                verbose_proxy_logger.error(
                    "Retention seconds is None, cannot proceed with cleanup"
                )
                return

            # If we have a pod lock manager, try to acquire the lock
            if self.pod_lock_manager and self.pod_lock_manager.redis_cache:
                lock_acquired = bool(
                    await self.pod_lock_manager.acquire_lock(
                        cronjob_id=SPEND_LOG_CLEANUP_JOB_NAME,
                    )
                )
                verbose_proxy_logger.info(
                    f"Lock acquisition attempt: {'successful' if lock_acquired else 'failed'}  at {datetime.now()}"
                )

                if not lock_acquired:
                    verbose_proxy_logger.info("Another pod is already running cleanup")
                    return

            cutoff_date = datetime.now(timezone.utc) - timedelta(
                seconds=float(self.retention_seconds)
            )
            verbose_proxy_logger.info(
                f"Deleting logs older than {cutoff_date.isoformat()}"
            )

            # Perform the actual deletion
            total_deleted = await self._delete_old_logs(prisma_client, cutoff_date)
            verbose_proxy_logger.info(f"Deleted {total_deleted} logs")

        except Exception as e:
            verbose_proxy_logger.error(f"Error during cleanup: {str(e)}")
            return  # Return after error handling
        finally:
            # Release the lock only if this call actually acquired it.
            if lock_acquired:
                try:
                    await self.pod_lock_manager.release_lock(
                        cronjob_id=SPEND_LOG_CLEANUP_JOB_NAME
                    )
                    verbose_proxy_logger.info("Released cleanup lock")
                except Exception as e:
                    # Keep the job best-effort: a failed release is logged,
                    # not propagated to the scheduler.
                    verbose_proxy_logger.error(
                        f"Error releasing cleanup lock: {str(e)}"
                    )