"""
Class to check for LLM API hanging requests
Notes:
- Do not create tasks that sleep, that can saturate the event loop
- Do not store large objects (eg. messages in memory) that can increase RAM usage
"""
import asyncio
from typing import TYPE_CHECKING, Any, Optional

import litellm
from litellm._logging import verbose_proxy_logger
from litellm.caching.in_memory_cache import InMemoryCache
from litellm.litellm_core_utils.core_helpers import get_litellm_metadata_from_kwargs
from litellm.types.integrations.slack_alerting import (
HANGING_ALERT_BUFFER_TIME_SECONDS,
MAX_OLDEST_HANGING_REQUESTS_TO_CHECK,
HangingRequestData,
)

if TYPE_CHECKING:
    from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting
else:
    SlackAlerting = Any


class AlertingHangingRequestCheck:
"""
Class to safely handle checking hanging requests alerts
"""
def __init__(
self,
slack_alerting_object: SlackAlerting,
):
self.slack_alerting_object = slack_alerting_object
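        # Cache entries expire on their own: the TTL is the alerting threshold
        # plus a small buffer, so requests that complete without being
        # explicitly removed still age out instead of accumulating in memory.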
self.hanging_request_cache = InMemoryCache(
default_ttl=int(
self.slack_alerting_object.alerting_threshold
+ HANGING_ALERT_BUFFER_TIME_SECONDS
),
)

    async def add_request_to_hanging_request_check(
self,
request_data: Optional[dict] = None,
):
"""
Add a request to the hanging request cache. This is the list of request_ids that gets periodicall checked for hanging requests
"""
if request_data is None:
return
request_metadata = get_litellm_metadata_from_kwargs(kwargs=request_data)
model = request_data.get("model", "")
api_base: Optional[str] = None
if request_data.get("deployment", None) is not None and isinstance(
request_data["deployment"], dict
):
api_base = litellm.get_api_base(
model=model,
optional_params=request_data["deployment"].get("litellm_params", {}),
)
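        # Only lightweight identifiers are stored per request (no messages or
        # request bodies), keeping memory overhead small per the module notes.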
hanging_request_data = HangingRequestData(
request_id=request_data.get("litellm_call_id", ""),
model=model,
api_base=api_base,
key_alias=request_metadata.get("user_api_key_alias", ""),
team_alias=request_metadata.get("user_api_key_team_alias", ""),
)
await self.hanging_request_cache.async_set_cache(
key=hanging_request_data.request_id,
value=hanging_request_data,
ttl=int(
self.slack_alerting_object.alerting_threshold
+ HANGING_ALERT_BUFFER_TIME_SECONDS
),
)
return

    async def send_alerts_for_hanging_requests(self):
"""
Send alerts for hanging requests
"""
from litellm.proxy.proxy_server import proxy_logging_obj
        #########################################################
        # Find all requests that have been hanging for longer than the
        # alerting threshold: fetch the oldest
        # MAX_OLDEST_HANGING_REQUESTS_TO_CHECK items in the cache and check
        # whether they have completed.
        #########################################################
        # the internal usage cache is required to look up request status
if proxy_logging_obj.internal_usage_cache is None:
return
hanging_requests = await self.hanging_request_cache.async_get_oldest_n_keys(
n=MAX_OLDEST_HANGING_REQUESTS_TO_CHECK,
)
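        # scanning only the oldest N keys bounds the work done per sweep,
        # regardless of how many in-flight requests are in the cache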
for request_id in hanging_requests:
hanging_request_data: Optional[HangingRequestData] = (
await self.hanging_request_cache.async_get_cache(
key=request_id,
)
)
if hanging_request_data is None:
continue
request_status = (
await proxy_logging_obj.internal_usage_cache.async_get_cache(
key="request_status:{}".format(hanging_request_data.request_id),
litellm_parent_otel_span=None,
local_only=True,
)
)
            # a non-None status means the request either succeeded or failed,
            # i.e. it is not hanging
            if request_status is not None:
                # remove this request from the hanging request cache since it completed
self.hanging_request_cache._remove_key(
key=request_id,
)
continue
################
# Send the Alert on Slack
################
await self.send_hanging_request_alert(
hanging_request_data=hanging_request_data
)
return

    async def check_for_hanging_requests(
        self,
    ):
        """
        Background task that periodically checks every request id in self.hanging_request_cache to see whether it has completed.

        Runs every alerting_threshold/2 seconds.
        """
while True:
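            # a single background task sleeps between sweeps, instead of one
            # timer task per request, to avoid saturating the event loop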
verbose_proxy_logger.debug("Checking for hanging requests....")
await self.send_alerts_for_hanging_requests()
await asyncio.sleep(self.slack_alerting_object.alerting_threshold / 2)

    async def send_hanging_request_alert(
self,
hanging_request_data: HangingRequestData,
):
"""
Send a hanging request alert
"""
from litellm.integrations.SlackAlerting.slack_alerting import AlertType
################
# Send the Alert on Slack
################
request_info = f"""Request Model: `{hanging_request_data.model}`
API Base: `{hanging_request_data.api_base}`
Key Alias: `{hanging_request_data.key_alias}`
Team Alias: `{hanging_request_data.team_alias}`"""
alerting_message = f"`Requests are hanging - {self.slack_alerting_object.alerting_threshold}s+ request time`"
await self.slack_alerting_object.send_alert(
message=alerting_message + "\n" + request_info,
level="Medium",
alert_type=AlertType.llm_requests_hanging,
alerting_metadata=hanging_request_data.alerting_metadata or {},
)
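

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; construction details outside this module
# are assumptions): the proxy wires this class to its SlackAlerting object and
# schedules the checker as a long-lived background task, roughly like:
#
#     slack_alerting = SlackAlerting(...)  # hypothetical construction
#     hanging_check = AlertingHangingRequestCheck(
#         slack_alerting_object=slack_alerting,
#     )
#     asyncio.create_task(hanging_check.check_for_hanging_requests())
#
# Each incoming request is then registered from the request path via:
#
#     await hanging_check.add_request_to_hanging_request_check(
#         request_data=request_kwargs,  # the litellm call kwargs
#     )
# ---------------------------------------------------------------------------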