"""
Class to check for LLM API hanging requests
Notes:
- Do not create tasks that sleep, that can saturate the event loop
- Do not store large objects (eg. messages in memory) that can increase RAM usage
"""

import asyncio
from typing import TYPE_CHECKING, Any, Optional

import litellm
from litellm._logging import verbose_proxy_logger
from litellm.caching.in_memory_cache import InMemoryCache
from litellm.litellm_core_utils.core_helpers import get_litellm_metadata_from_kwargs
from litellm.types.integrations.slack_alerting import (
    HANGING_ALERT_BUFFER_TIME_SECONDS,
    MAX_OLDEST_HANGING_REQUESTS_TO_CHECK,
    HangingRequestData,
)

if TYPE_CHECKING:
    from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting
else:
    SlackAlerting = Any


class AlertingHangingRequestCheck:
    """
    Class to safely handle checking hanging request alerts
    """

    def __init__(
        self,
        slack_alerting_object: SlackAlerting,
    ):
        self.slack_alerting_object = slack_alerting_object
        # entries expire once the alerting window plus a small buffer has
        # passed, so completed requests age out of the cache on their own
        self.hanging_request_cache = InMemoryCache(
            default_ttl=int(
                self.slack_alerting_object.alerting_threshold
                + HANGING_ALERT_BUFFER_TIME_SECONDS
            ),
        )
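
    # --- Hedged usage sketch (illustrative, not part of the module) ---
    # Assuming `slack_alerting` is an initialized SlackAlerting object whose
    # `alerting_threshold` is in seconds:
    #
    #   checker = AlertingHangingRequestCheck(slack_alerting_object=slack_alerting)
    #
    # Cached entries then default to expiring after
    # alerting_threshold + HANGING_ALERT_BUFFER_TIME_SECONDS seconds.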

    async def add_request_to_hanging_request_check(
        self,
        request_data: Optional[dict] = None,
    ):
        """
        Add a request to the hanging request cache. This is the list of request_ids
        that gets periodically checked for hanging requests.
        """
        if request_data is None:
            return

        request_metadata = get_litellm_metadata_from_kwargs(kwargs=request_data)
        model = request_data.get("model", "")
        api_base: Optional[str] = None
        if request_data.get("deployment", None) is not None and isinstance(
            request_data["deployment"], dict
        ):
            api_base = litellm.get_api_base(
                model=model,
                optional_params=request_data["deployment"].get("litellm_params", {}),
            )

        hanging_request_data = HangingRequestData(
            request_id=request_data.get("litellm_call_id", ""),
            model=model,
            api_base=api_base,
            key_alias=request_metadata.get("user_api_key_alias", ""),
            team_alias=request_metadata.get("user_api_key_team_alias", ""),
        )

        await self.hanging_request_cache.async_set_cache(
            key=hanging_request_data.request_id,
            value=hanging_request_data,
            ttl=int(
                self.slack_alerting_object.alerting_threshold
                + HANGING_ALERT_BUFFER_TIME_SECONDS
            ),
        )
        return
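
    # Hedged example of the `request_data` shape this method expects, inferred
    # from the accesses above (the exact nesting of "deployment" and of the
    # metadata read by get_litellm_metadata_from_kwargs is an assumption):
    #
    #   await checker.add_request_to_hanging_request_check(
    #       request_data={
    #           "litellm_call_id": "d4f9-...",
    #           "model": "gpt-4o",
    #           "metadata": {
    #               "user_api_key_alias": "prod-key",
    #               "user_api_key_team_alias": "ml-team",
    #           },
    #       },
    #   )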

    async def send_alerts_for_hanging_requests(self):
        """
        Send alerts for hanging requests
        """
        from litellm.proxy.proxy_server import proxy_logging_obj

        #########################################################
        # Find all requests that have been hanging for longer than the alerting threshold
        # Get the oldest MAX_OLDEST_HANGING_REQUESTS_TO_CHECK items in the cache
        # and check if they have completed
        #########################################################
        # check if request_id is in internal usage cache
        if proxy_logging_obj.internal_usage_cache is None:
            return

        hanging_requests = await self.hanging_request_cache.async_get_oldest_n_keys(
            n=MAX_OLDEST_HANGING_REQUESTS_TO_CHECK,
        )

        for request_id in hanging_requests:
            hanging_request_data: Optional[HangingRequestData] = (
                await self.hanging_request_cache.async_get_cache(
                    key=request_id,
                )
            )
            if hanging_request_data is None:
                continue

            request_status = (
                await proxy_logging_obj.internal_usage_cache.async_get_cache(
                    key="request_status:{}".format(hanging_request_data.request_id),
                    litellm_parent_otel_span=None,
                    local_only=True,
                )
            )
            # a non-None status means the request either succeeded or failed,
            # so it is not hanging
            if request_status is not None:
                # clear this request from the hanging request cache since it completed
                self.hanging_request_cache._remove_key(
                    key=request_id,
                )
                continue

            ################
            # Send the Alert on Slack
            ################
            await self.send_hanging_request_alert(
                hanging_request_data=hanging_request_data
            )
        return
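
    # Assumed completion convention, based on the lookup above: another part of
    # the proxy writes a status value to the internal usage cache under the key
    # "request_status:<litellm_call_id>" when a request finishes. Any non-None
    # value is treated as "completed"; only requests whose status is still
    # missing after the alerting threshold trigger a Slack alert.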

    async def check_for_hanging_requests(
        self,
    ):
        """
        Background task that checks all request ids in self.hanging_request_cache
        to see if they have completed.

        Runs every alerting_threshold/2 seconds to check for hanging requests.
        """
        while True:
            verbose_proxy_logger.debug("Checking for hanging requests....")
            await self.send_alerts_for_hanging_requests()
            await asyncio.sleep(self.slack_alerting_object.alerting_threshold / 2)
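
    # Hedged sketch: at proxy startup this loop would be scheduled as a
    # background task (the exact wiring in the proxy may differ):
    #
    #   asyncio.create_task(checker.check_for_hanging_requests())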

    async def send_hanging_request_alert(
        self,
        hanging_request_data: HangingRequestData,
    ):
        """
        Send a hanging request alert
        """
        from litellm.integrations.SlackAlerting.slack_alerting import AlertType

        ################
        # Send the Alert on Slack
        ################
        request_info = f"""Request Model: `{hanging_request_data.model}`
API Base: `{hanging_request_data.api_base}`
Key Alias: `{hanging_request_data.key_alias}`
Team Alias: `{hanging_request_data.team_alias}`"""

        alerting_message = f"`Requests are hanging - {self.slack_alerting_object.alerting_threshold}s+ request time`"
        await self.slack_alerting_object.send_alert(
            message=alerting_message + "\n" + request_info,
            level="Medium",
            alert_type=AlertType.llm_requests_hanging,
            alerting_metadata=hanging_request_data.alerting_metadata or {},
        )
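
    # Illustrative rendered alert body (values are made up), built as
    # alerting_message + "\n" + request_info above:
    #
    #   `Requests are hanging - 300s+ request time`
    #   Request Model: `gpt-4o`
    #   API Base: `https://api.openai.com/v1`
    #   Key Alias: `prod-key`
    #   Team Alias: `ml-team`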