""" Class to check for LLM API hanging requests Notes: - Do not create tasks that sleep, that can saturate the event loop - Do not store large objects (eg. messages in memory) that can increase RAM usage """ import asyncio from typing import TYPE_CHECKING, Any, Optional import litellm from litellm._logging import verbose_proxy_logger from litellm.caching.in_memory_cache import InMemoryCache from litellm.litellm_core_utils.core_helpers import get_litellm_metadata_from_kwargs from litellm.types.integrations.slack_alerting import ( HANGING_ALERT_BUFFER_TIME_SECONDS, MAX_OLDEST_HANGING_REQUESTS_TO_CHECK, HangingRequestData, ) if TYPE_CHECKING: from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting else: SlackAlerting = Any class AlertingHangingRequestCheck: """ Class to safely handle checking hanging requests alerts """ def __init__( self, slack_alerting_object: SlackAlerting, ): self.slack_alerting_object = slack_alerting_object self.hanging_request_cache = InMemoryCache( default_ttl=int( self.slack_alerting_object.alerting_threshold + HANGING_ALERT_BUFFER_TIME_SECONDS ), ) async def add_request_to_hanging_request_check( self, request_data: Optional[dict] = None, ): """ Add a request to the hanging request cache. This is the list of request_ids that gets periodicall checked for hanging requests """ if request_data is None: return request_metadata = get_litellm_metadata_from_kwargs(kwargs=request_data) model = request_data.get("model", "") api_base: Optional[str] = None if request_data.get("deployment", None) is not None and isinstance( request_data["deployment"], dict ): api_base = litellm.get_api_base( model=model, optional_params=request_data["deployment"].get("litellm_params", {}), ) hanging_request_data = HangingRequestData( request_id=request_data.get("litellm_call_id", ""), model=model, api_base=api_base, key_alias=request_metadata.get("user_api_key_alias", ""), team_alias=request_metadata.get("user_api_key_team_alias", ""), ) await self.hanging_request_cache.async_set_cache( key=hanging_request_data.request_id, value=hanging_request_data, ttl=int( self.slack_alerting_object.alerting_threshold + HANGING_ALERT_BUFFER_TIME_SECONDS ), ) return async def send_alerts_for_hanging_requests(self): """ Send alerts for hanging requests """ from litellm.proxy.proxy_server import proxy_logging_obj ######################################################### # Find all requests that have been hanging for more than the alerting threshold # Get the last 50 oldest items in the cache and check if they have completed ######################################################### # check if request_id is in internal usage cache if proxy_logging_obj.internal_usage_cache is None: return hanging_requests = await self.hanging_request_cache.async_get_oldest_n_keys( n=MAX_OLDEST_HANGING_REQUESTS_TO_CHECK, ) for request_id in hanging_requests: hanging_request_data: Optional[HangingRequestData] = ( await self.hanging_request_cache.async_get_cache( key=request_id, ) ) if hanging_request_data is None: continue request_status = ( await proxy_logging_obj.internal_usage_cache.async_get_cache( key="request_status:{}".format(hanging_request_data.request_id), litellm_parent_otel_span=None, local_only=True, ) ) # this means the request status was either success or fail # and is not hanging if request_status is not None: # clear this request from hanging request cache since the request was either success or failed self.hanging_request_cache._remove_key( key=request_id, ) continue ################ # Send the 
Alert on Slack ################ await self.send_hanging_request_alert( hanging_request_data=hanging_request_data ) return async def check_for_hanging_requests( self, ): """ Background task that checks all request ids in self.hanging_request_cache to check if they have completed Runs every alerting_threshold/2 seconds to check for hanging requests """ while True: verbose_proxy_logger.debug("Checking for hanging requests....") await self.send_alerts_for_hanging_requests() await asyncio.sleep(self.slack_alerting_object.alerting_threshold / 2) async def send_hanging_request_alert( self, hanging_request_data: HangingRequestData, ): """ Send a hanging request alert """ from litellm.integrations.SlackAlerting.slack_alerting import AlertType ################ # Send the Alert on Slack ################ request_info = f"""Request Model: `{hanging_request_data.model}` API Base: `{hanging_request_data.api_base}` Key Alias: `{hanging_request_data.key_alias}` Team Alias: `{hanging_request_data.team_alias}`""" alerting_message = f"`Requests are hanging - {self.slack_alerting_object.alerting_threshold}s+ request time`" await self.slack_alerting_object.send_alert( message=alerting_message + "\n" + request_info, level="Medium", alert_type=AlertType.llm_requests_hanging, alerting_metadata=hanging_request_data.alerting_metadata or {}, )
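# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; the proxy wires this class up internally).
# Assumes a configured `SlackAlerting` instance whose constructor accepts an
# `alerting_threshold` (in seconds), and a minimal assumed `request_data`
# payload shape -- verify both against the installed litellm version before
# relying on this.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting

    async def _demo() -> None:
        slack_alerting = SlackAlerting(alerting_threshold=300)  # assumed kwarg
        checker = AlertingHangingRequestCheck(slack_alerting_object=slack_alerting)

        # Register an in-flight request. The proxy passes its real request
        # kwargs here; this payload is a hypothetical stand-in.
        await checker.add_request_to_hanging_request_check(
            request_data={"litellm_call_id": "demo-call-id", "model": "gpt-4o"}
        )

        # Start the background poller once at startup. It loops forever,
        # so cancel the task on shutdown.
        poller = asyncio.create_task(checker.check_for_hanging_requests())
        await asyncio.sleep(1)
        poller.cancel()
        try:
            await poller
        except asyncio.CancelledError:
            pass

    asyncio.run(_demo())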