from typing import List, Optional, Union

import litellm
from litellm.main import stream_chunk_builder
from litellm.responses.litellm_completion_transformation.transformation import (
    LiteLLMCompletionResponsesConfig,
)
from litellm.responses.streaming_iterator import ResponsesAPIStreamingIterator
from litellm.types.llms.openai import (
    OutputTextDeltaEvent,
    ResponseCompletedEvent,
    ResponseInputParam,
    ResponsesAPIOptionalRequestParams,
    ResponsesAPIStreamEvents,
    ResponsesAPIStreamingResponse,
)
from litellm.types.utils import Delta as ChatCompletionDelta
from litellm.types.utils import (
    ModelResponse,
    ModelResponseStream,
    StreamingChoices,
    TextCompletionResponse,
)

class LiteLLMCompletionStreamingIterator(ResponsesAPIStreamingIterator):
    """
    Sync/async iterator that converts streaming chat completion chunks
    into Responses API streaming events.
    """

    def __init__(
        self,
        litellm_custom_stream_wrapper: litellm.CustomStreamWrapper,
        request_input: Union[str, ResponseInputParam],
        responses_api_request: ResponsesAPIOptionalRequestParams,
    ):
        self.litellm_custom_stream_wrapper: litellm.CustomStreamWrapper = (
            litellm_custom_stream_wrapper
        )
        self.request_input: Union[str, ResponseInputParam] = request_input
        self.responses_api_request: ResponsesAPIOptionalRequestParams = (
            responses_api_request
        )
        # Raw chat completion chunks are buffered so the full response can be
        # rebuilt once the stream is exhausted.
        self.collected_chat_completion_chunks: List[ModelResponseStream] = []
        self.finished: bool = False

    async def __anext__(
        self,
    ) -> Union[ResponsesAPIStreamingResponse, ResponseCompletedEvent]:
        try:
            while True:
                if self.finished is True:
                    raise StopAsyncIteration
                # Get the next chunk from the stream
                try:
                    chunk = await self.litellm_custom_stream_wrapper.__anext__()
                    self.collected_chat_completion_chunks.append(chunk)
                    response_api_chunk = (
                        self._transform_chat_completion_chunk_to_response_api_chunk(
                            chunk
                        )
                    )
                    if response_api_chunk:
                        return response_api_chunk
                except StopAsyncIteration:
                    # Underlying stream is exhausted: emit the final
                    # response.completed event built from the buffered chunks.
                    self.finished = True
                    response_completed_event = self._emit_response_completed_event()
                    if response_completed_event:
                        return response_completed_event
                    else:
                        raise StopAsyncIteration
        except Exception as e:
            # Handle HTTP errors
            self.finished = True
            raise e

    def __iter__(self):
        return self

    def __next__(
        self,
    ) -> Union[ResponsesAPIStreamingResponse, ResponseCompletedEvent]:
        try:
            while True:
                if self.finished is True:
                    raise StopIteration
                # Get the next chunk from the stream
                try:
                    chunk = self.litellm_custom_stream_wrapper.__next__()
                    self.collected_chat_completion_chunks.append(chunk)
                    response_api_chunk = (
                        self._transform_chat_completion_chunk_to_response_api_chunk(
                            chunk
                        )
                    )
                    if response_api_chunk:
                        return response_api_chunk
                except StopIteration:
                    # Underlying stream is exhausted: emit the final
                    # response.completed event built from the buffered chunks.
                    self.finished = True
                    response_completed_event = self._emit_response_completed_event()
                    if response_completed_event:
                        return response_completed_event
                    else:
                        raise StopIteration
        except Exception as e:
            # Handle HTTP errors
            self.finished = True
            raise e

    def _transform_chat_completion_chunk_to_response_api_chunk(
        self, chunk: ModelResponseStream
    ) -> Optional[ResponsesAPIStreamingResponse]:
        """
        Transform a chat completion chunk into a Responses API chunk.

        This currently only emits the OutputTextDeltaEvent, which is what most
        tools consuming the Responses API stream rely on.
        """
        return OutputTextDeltaEvent(
            type=ResponsesAPIStreamEvents.OUTPUT_TEXT_DELTA,
            item_id=chunk.id,
            output_index=0,
            content_index=0,
            delta=self._get_delta_string_from_streaming_choices(chunk.choices),
        )
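
    # Illustrative mapping only (the chunk shape below is an assumption about
    # a typical provider response, not taken from this module):
    #   ModelResponseStream(id="chunk-1",
    #       choices=[StreamingChoices(delta=ChatCompletionDelta(content="Hi"))])
    #   -> OutputTextDeltaEvent(type=ResponsesAPIStreamEvents.OUTPUT_TEXT_DELTA,
    #          item_id="chunk-1", output_index=0, content_index=0, delta="Hi")
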
    def _get_delta_string_from_streaming_choices(
        self, choices: List[StreamingChoices]
    ) -> str:
        """
        Get the delta string from the streaming choices.

        For now this collects only the first choice's delta string; it is
        unclear how users expect litellm to translate multiple choices per
        chunk into Responses API output.
        """
        choice = choices[0]
        chat_completion_delta: ChatCompletionDelta = choice.delta
        return chat_completion_delta.content or ""

    def _emit_response_completed_event(self) -> Optional[ResponseCompletedEvent]:
        # Rebuild the complete chat completion response from the buffered
        # chunks, then convert it into a Responses API response object.
        litellm_model_response: Optional[
            Union[ModelResponse, TextCompletionResponse]
        ] = stream_chunk_builder(chunks=self.collected_chat_completion_chunks)
        if litellm_model_response and isinstance(
            litellm_model_response, ModelResponse
        ):
            return ResponseCompletedEvent(
                type=ResponsesAPIStreamEvents.RESPONSE_COMPLETED,
                response=LiteLLMCompletionResponsesConfig.transform_chat_completion_response_to_responses_api_response(
                    request_input=self.request_input,
                    chat_completion_response=litellm_model_response,
                    responses_api_request=self.responses_api_request,
                ),
            )
        else:
            return None
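

# --- Usage sketch (illustrative only, not part of the module) ---
# A minimal sketch of how this iterator might be driven, assuming a
# CustomStreamWrapper obtained from a streaming chat completion call.
# The model name, input text, and empty request params below are
# placeholders for illustration:
#
#     import asyncio
#     import litellm
#
#     async def main():
#         stream = await litellm.acompletion(
#             model="gpt-4o-mini",
#             messages=[{"role": "user", "content": "Hello"}],
#             stream=True,
#         )
#         iterator = LiteLLMCompletionStreamingIterator(
#             litellm_custom_stream_wrapper=stream,
#             request_input="Hello",
#             responses_api_request={},
#         )
#         async for event in iterator:
#             if event.type == ResponsesAPIStreamEvents.OUTPUT_TEXT_DELTA:
#                 print(event.delta, end="")
#
#     asyncio.run(main())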