""" | |
Class to check for LLM API hanging requests | |
Notes: | |
- Do not create tasks that sleep, that can saturate the event loop | |
- Do not store large objects (eg. messages in memory) that can increase RAM usage | |
""" | |
import asyncio
from typing import TYPE_CHECKING, Any, Optional

import litellm
from litellm._logging import verbose_proxy_logger
from litellm.caching.in_memory_cache import InMemoryCache
from litellm.litellm_core_utils.core_helpers import get_litellm_metadata_from_kwargs
from litellm.types.integrations.slack_alerting import (
    HANGING_ALERT_BUFFER_TIME_SECONDS,
    MAX_OLDEST_HANGING_REQUESTS_TO_CHECK,
    HangingRequestData,
)

if TYPE_CHECKING:
    from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting
else:
    SlackAlerting = Any


class AlertingHangingRequestCheck:
    """
    Class to safely check for hanging requests and send alerts for them.
    """

    def __init__(
        self,
        slack_alerting_object: SlackAlerting,
    ):
        self.slack_alerting_object = slack_alerting_object
        # Entries auto-expire shortly after the alerting threshold has passed,
        # so this cache cannot grow unboundedly.
        self.hanging_request_cache = InMemoryCache(
            default_ttl=int(
                self.slack_alerting_object.alerting_threshold
                + HANGING_ALERT_BUFFER_TIME_SECONDS
            ),
        )

    async def add_request_to_hanging_request_check(
        self,
        request_data: Optional[dict] = None,
    ):
        """
        Add a request to the hanging request cache. This is the list of request_ids
        that gets periodically checked for hanging requests.
        """
        if request_data is None:
            return

        request_metadata = get_litellm_metadata_from_kwargs(kwargs=request_data)
        model = request_data.get("model", "")

        # Derive the api_base from the deployment's litellm_params, when a deployment dict is present
        api_base: Optional[str] = None
        if request_data.get("deployment", None) is not None and isinstance(
            request_data["deployment"], dict
        ):
            api_base = litellm.get_api_base(
                model=model,
                optional_params=request_data["deployment"].get("litellm_params", {}),
            )

        hanging_request_data = HangingRequestData(
            request_id=request_data.get("litellm_call_id", ""),
            model=model,
            api_base=api_base,
            key_alias=request_metadata.get("user_api_key_alias", ""),
            team_alias=request_metadata.get("user_api_key_team_alias", ""),
        )

        await self.hanging_request_cache.async_set_cache(
            key=hanging_request_data.request_id,
            value=hanging_request_data,
            ttl=int(
                self.slack_alerting_object.alerting_threshold
                + HANGING_ALERT_BUFFER_TIME_SECONDS
            ),
        )
        return
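
    # Shape of a cache entry written above (illustrative values, not real data):
    #   "litellm_call_id-123" -> HangingRequestData(
    #       model="gpt-4o", api_base="https://api.openai.com",
    #       key_alias="prod-key", team_alias="ml-team",
    #   )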

    async def send_alerts_for_hanging_requests(self):
        """
        Send alerts for hanging requests.
        """
        from litellm.proxy.proxy_server import proxy_logging_obj

        #########################################################
        # Find all requests that have been hanging for longer than the alerting threshold.
        # Check the oldest MAX_OLDEST_HANGING_REQUESTS_TO_CHECK entries in the cache
        # and see whether they have completed.
        #########################################################
        # Completion status is looked up in the internal usage cache; without it
        # we cannot tell whether a request finished, so return early.
        if proxy_logging_obj.internal_usage_cache is None:
            return

        hanging_requests = await self.hanging_request_cache.async_get_oldest_n_keys(
            n=MAX_OLDEST_HANGING_REQUESTS_TO_CHECK,
        )

        for request_id in hanging_requests:
            hanging_request_data: Optional[HangingRequestData] = (
                await self.hanging_request_cache.async_get_cache(
                    key=request_id,
                )
            )
            if hanging_request_data is None:
                continue

            request_status = (
                await proxy_logging_obj.internal_usage_cache.async_get_cache(
                    key="request_status:{}".format(hanging_request_data.request_id),
                    litellm_parent_otel_span=None,
                    local_only=True,
                )
            )
            # A non-None status means the request already succeeded or failed,
            # so it is not hanging.
            if request_status is not None:
                # Clear this request from the hanging request cache, since it completed.
                self.hanging_request_cache._remove_key(
                    key=request_id,
                )
                continue

            ################
            # Send the Alert on Slack
            ################
            await self.send_hanging_request_alert(
                hanging_request_data=hanging_request_data
            )
        return
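
    # Decision summary for each tracked request_id:
    #   - completion status found in the internal usage cache -> request finished; evict it from the hanging cache
    #   - no completion status found                           -> presumed hanging; send a Slack alert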

    async def check_for_hanging_requests(
        self,
    ):
        """
        Background task that periodically checks the request_ids in
        self.hanging_request_cache to see whether they have completed.

        Runs every alerting_threshold/2 seconds.
        """
        while True:
            verbose_proxy_logger.debug("Checking for hanging requests....")
            await self.send_alerts_for_hanging_requests()
            await asyncio.sleep(self.slack_alerting_object.alerting_threshold / 2)
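
    # Note: a single polling coroutine covers every tracked request; per-request
    # sleeping tasks are deliberately avoided, per the notes at the top of this module.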

    async def send_hanging_request_alert(
        self,
        hanging_request_data: HangingRequestData,
    ):
        """
        Send a hanging request alert
        """
        from litellm.integrations.SlackAlerting.slack_alerting import AlertType

        ################
        # Send the Alert on Slack
        ################
        request_info = f"""Request Model: `{hanging_request_data.model}`
API Base: `{hanging_request_data.api_base}`
Key Alias: `{hanging_request_data.key_alias}`
Team Alias: `{hanging_request_data.team_alias}`"""
        alerting_message = f"`Requests are hanging - {self.slack_alerting_object.alerting_threshold}s+ request time`"
        await self.slack_alerting_object.send_alert(
            message=alerting_message + "\n" + request_info,
            level="Medium",
            alert_type=AlertType.llm_requests_hanging,
            alerting_metadata=hanging_request_data.alerting_metadata or {},
        )
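
    # Example of the rendered alert text (illustrative values only):
    #
    #   `Requests are hanging - <alerting_threshold>s+ request time`
    #   Request Model: `gpt-4o`
    #   API Base: `https://api.openai.com`
    #   Key Alias: `prod-key`
    #   Team Alias: `ml-team`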