Spaces:

Backup-bdg
/

OpenHands

Build error

File size: 35,371 Bytes

51ff9e5

from typing import Generator

from litellm import ModelResponse

from openhands.core.config.agent_config import AgentConfig
from openhands.core.logger import openhands_logger as logger
from openhands.core.message import ImageContent, Message, TextContent
from openhands.core.schema import ActionType
from openhands.events.action import (
    Action,
    AgentDelegateAction,
    AgentFinishAction,
    AgentThinkAction,
    BrowseInteractiveAction,
    BrowseURLAction,
    CmdRunAction,
    FileEditAction,
    FileReadAction,
    IPythonRunCellAction,
    MessageAction,
)
from openhands.events.action.mcp import MCPAction
from openhands.events.action.message import SystemMessageAction
from openhands.events.event import Event, RecallType
from openhands.events.observation import (
    AgentCondensationObservation,
    AgentDelegateObservation,
    AgentThinkObservation,
    BrowserOutputObservation,
    CmdOutputObservation,
    FileEditObservation,
    FileReadObservation,
    IPythonRunCellObservation,
    UserRejectObservation,
)
from openhands.events.observation.agent import (
    MicroagentKnowledge,
    RecallObservation,
)
from openhands.events.observation.error import ErrorObservation
from openhands.events.observation.mcp import MCPObservation
from openhands.events.observation.observation import Observation
from openhands.events.serialization.event import truncate_content
from openhands.utils.prompt import (
    ConversationInstructions,
    PromptManager,
    RepositoryInfo,
    RuntimeInfo,
)


class ConversationMemory:
    """Processes event history into a coherent conversation for the agent."""

    def __init__(self, config: AgentConfig, prompt_manager: PromptManager):
        self.agent_config = config
        self.prompt_manager = prompt_manager

    def process_events(
        self,
        condensed_history: list[Event],
        initial_user_action: MessageAction,
        max_message_chars: int | None = None,
        vision_is_active: bool = False,
    ) -> list[Message]:
        """Process state history into a list of messages for the LLM.

        Ensures that tool call actions are processed correctly in function calling mode.

        Args:
            condensed_history: The condensed history of events to convert
            max_message_chars: The maximum number of characters in the content of an event included
                in the prompt to the LLM. Larger observations are truncated.
            vision_is_active: Whether vision is active in the LLM. If True, image URLs will be included.
            initial_user_action: The initial user message action, if available. Used to ensure the conversation starts correctly.
        """

        events = condensed_history

        # Ensure the event list starts with SystemMessageAction, then MessageAction(source='user')
        self._ensure_system_message(events)
        self._ensure_initial_user_message(events, initial_user_action)

        # log visual browsing status
        logger.debug(f'Visual browsing: {self.agent_config.enable_som_visual_browsing}')

        # Initialize empty messages list
        messages = []

        # Process regular events
        pending_tool_call_action_messages: dict[str, Message] = {}
        tool_call_id_to_message: dict[str, Message] = {}

        for i, event in enumerate(events):
            # create a regular message from an event
            if isinstance(event, Action):
                messages_to_add = self._process_action(
                    action=event,
                    pending_tool_call_action_messages=pending_tool_call_action_messages,
                    vision_is_active=vision_is_active,
                )
            elif isinstance(event, Observation):
                messages_to_add = self._process_observation(
                    obs=event,
                    tool_call_id_to_message=tool_call_id_to_message,
                    max_message_chars=max_message_chars,
                    vision_is_active=vision_is_active,
                    enable_som_visual_browsing=self.agent_config.enable_som_visual_browsing,
                    current_index=i,
                    events=events,
                )
            else:
                raise ValueError(f'Unknown event type: {type(event)}')

            # Check pending tool call action messages and see if they are complete
            _response_ids_to_remove = []
            for (
                response_id,
                pending_message,
            ) in pending_tool_call_action_messages.items():
                assert pending_message.tool_calls is not None, (
                    'Tool calls should NOT be None when function calling is enabled & the message is considered pending tool call. '
                    f'Pending message: {pending_message}'
                )
                if all(
                    tool_call.id in tool_call_id_to_message
                    for tool_call in pending_message.tool_calls
                ):
                    # If complete:
                    # -- 1. Add the message that **initiated** the tool calls
                    messages_to_add.append(pending_message)
                    # -- 2. Add the tool calls **results***
                    for tool_call in pending_message.tool_calls:
                        messages_to_add.append(tool_call_id_to_message[tool_call.id])
                        tool_call_id_to_message.pop(tool_call.id)
                    _response_ids_to_remove.append(response_id)
            # Cleanup the processed pending tool messages
            for response_id in _response_ids_to_remove:
                pending_tool_call_action_messages.pop(response_id)

            messages += messages_to_add

        # Apply final filtering so that the messages in context don't have unmatched tool calls
        # and tool responses, for example
        messages = list(ConversationMemory._filter_unmatched_tool_calls(messages))

        # Apply final formatting
        messages = self._apply_user_message_formatting(messages)

        return messages

    def _apply_user_message_formatting(self, messages: list[Message]) -> list[Message]:
        """Applies formatting rules, such as adding newlines between consecutive user messages."""
        formatted_messages = []
        prev_role = None
        for msg in messages:
            # Add double newline between consecutive user messages
            if msg.role == 'user' and prev_role == 'user' and len(msg.content) > 0:
                # Find the first TextContent in the message to add newlines
                for content_item in msg.content:
                    if isinstance(content_item, TextContent):
                        # Prepend two newlines to ensure visual separation
                        content_item.text = '\n\n' + content_item.text
                        break
            formatted_messages.append(msg)
            prev_role = msg.role  # Update prev_role after processing each message
        return formatted_messages

    def _process_action(
        self,
        action: Action,
        pending_tool_call_action_messages: dict[str, Message],
        vision_is_active: bool = False,
    ) -> list[Message]:
        """Converts an action into a message format that can be sent to the LLM.

        This method handles different types of actions and formats them appropriately:
        1. For tool-based actions (AgentDelegate, CmdRun, IPythonRunCell, FileEdit) and agent-sourced AgentFinish:
            - In function calling mode: Stores the LLM's response in pending_tool_call_action_messages
            - In non-function calling mode: Creates a message with the action string
        2. For MessageActions: Creates a message with the text content and optional image content

        Args:
            action: The action to convert. Can be one of:
                - CmdRunAction: For executing bash commands
                - IPythonRunCellAction: For running IPython code
                - FileEditAction: For editing files
                - FileReadAction: For reading files using openhands-aci commands
                - BrowseInteractiveAction: For browsing the web
                - AgentFinishAction: For ending the interaction
                - MessageAction: For sending messages
                - MCPAction: For interacting with the MCP server
            pending_tool_call_action_messages: Dictionary mapping response IDs to their corresponding messages.
                Used in function calling mode to track tool calls that are waiting for their results.

            vision_is_active: Whether vision is active in the LLM. If True, image URLs will be included

        Returns:
            list[Message]: A list containing the formatted message(s) for the action.
                May be empty if the action is handled as a tool call in function calling mode.

        Note:
            In function calling mode, tool-based actions are stored in pending_tool_call_action_messages
            rather than being returned immediately. They will be processed later when all corresponding
            tool call results are available.
        """
        # create a regular message from an event
        if isinstance(
            action,
            (
                AgentDelegateAction,
                AgentThinkAction,
                IPythonRunCellAction,
                FileEditAction,
                FileReadAction,
                BrowseInteractiveAction,
                BrowseURLAction,
                MCPAction,
            ),
        ) or (isinstance(action, CmdRunAction) and action.source == 'agent'):
            tool_metadata = action.tool_call_metadata
            assert tool_metadata is not None, (
                'Tool call metadata should NOT be None when function calling is enabled. Action: '
                + str(action)
            )

            llm_response: ModelResponse = tool_metadata.model_response
            assistant_msg = getattr(llm_response.choices[0], 'message')

            # Add the LLM message (assistant) that initiated the tool calls
            # (overwrites any previous message with the same response_id)
            logger.debug(
                f'Tool calls type: {type(assistant_msg.tool_calls)}, value: {assistant_msg.tool_calls}'
            )
            pending_tool_call_action_messages[llm_response.id] = Message(
                role=getattr(assistant_msg, 'role', 'assistant'),
                # tool call content SHOULD BE a string
                content=[TextContent(text=assistant_msg.content)]
                if assistant_msg.content and assistant_msg.content.strip()
                else [],
                tool_calls=assistant_msg.tool_calls,
            )
            return []
        elif isinstance(action, AgentFinishAction):
            role = 'user' if action.source == 'user' else 'assistant'

            # when agent finishes, it has tool_metadata
            # which has already been executed, and it doesn't have a response
            # when the user finishes (/exit), we don't have tool_metadata
            tool_metadata = action.tool_call_metadata
            if tool_metadata is not None:
                # take the response message from the tool call
                assistant_msg = getattr(
                    tool_metadata.model_response.choices[0], 'message'
                )
                content = assistant_msg.content or ''

                # save content if any, to thought
                if action.thought:
                    if action.thought != content:
                        action.thought += '\n' + content
                else:
                    action.thought = content

                # remove the tool call metadata
                action.tool_call_metadata = None
            if role not in ('user', 'system', 'assistant', 'tool'):
                raise ValueError(f'Invalid role: {role}')
            return [
                Message(
                    role=role,  # type: ignore[arg-type]
                    content=[TextContent(text=action.thought)],
                )
            ]
        elif isinstance(action, MessageAction):
            role = 'user' if action.source == 'user' else 'assistant'
            content = [TextContent(text=action.content or '')]
            if vision_is_active and action.image_urls:
                content.append(ImageContent(image_urls=action.image_urls))
            if role not in ('user', 'system', 'assistant', 'tool'):
                raise ValueError(f'Invalid role: {role}')
            return [
                Message(
                    role=role,  # type: ignore[arg-type]
                    content=content,
                )
            ]
        elif isinstance(action, CmdRunAction) and action.source == 'user':
            content = [
                TextContent(text=f'User executed the command:\n{action.command}')
            ]
            return [
                Message(
                    role='user',  # Always user for CmdRunAction
                    content=content,
                )
            ]
        elif isinstance(action, SystemMessageAction):
            # Convert SystemMessageAction to a system message
            return [
                Message(
                    role='system',
                    content=[TextContent(text=action.content)],
                    # Include tools if function calling is enabled
                    tool_calls=None,
                )
            ]
        return []

    def _process_observation(
        self,
        obs: Observation,
        tool_call_id_to_message: dict[str, Message],
        max_message_chars: int | None = None,
        vision_is_active: bool = False,
        enable_som_visual_browsing: bool = False,
        current_index: int = 0,
        events: list[Event] | None = None,
    ) -> list[Message]:
        """Converts an observation into a message format that can be sent to the LLM.

        This method handles different types of observations and formats them appropriately:
        - CmdOutputObservation: Formats command execution results with exit codes
        - IPythonRunCellObservation: Formats IPython cell execution results, replacing base64 images
        - FileEditObservation: Formats file editing results
        - FileReadObservation: Formats file reading results from openhands-aci
        - AgentDelegateObservation: Formats results from delegated agent tasks
        - ErrorObservation: Formats error messages from failed actions
        - UserRejectObservation: Formats user rejection messages

        In function calling mode, observations with tool_call_metadata are stored in
        tool_call_id_to_message for later processing instead of being returned immediately.

        Args:
            obs: The observation to convert
            tool_call_id_to_message: Dictionary mapping tool call IDs to their corresponding messages (used in function calling mode)
            max_message_chars: The maximum number of characters in the content of an observation included in the prompt to the LLM
            vision_is_active: Whether vision is active in the LLM. If True, image URLs will be included
            enable_som_visual_browsing: Whether to enable visual browsing for the SOM model
            current_index: The index of the current event in the events list (for deduplication)
            events: The list of all events (for deduplication)

        Returns:
            list[Message]: A list containing the formatted message(s) for the observation.
                May be empty if the observation is handled as a tool response in function calling mode.

        Raises:
            ValueError: If the observation type is unknown
        """
        message: Message

        if isinstance(obs, CmdOutputObservation):
            # if it doesn't have tool call metadata, it was triggered by a user action
            if obs.tool_call_metadata is None:
                text = truncate_content(
                    f'\nObserved result of command executed by user:\n{obs.to_agent_observation()}',
                    max_message_chars,
                )
            else:
                text = truncate_content(obs.to_agent_observation(), max_message_chars)
            message = Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, MCPObservation):
            # logger.warning(f'MCPObservation: {obs}')
            text = truncate_content(obs.content, max_message_chars)
            message = Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, IPythonRunCellObservation):
            text = obs.content
            # Clean up any remaining base64 images in text content
            splitted = text.split('\n')
            for i, line in enumerate(splitted):
                if '![image](data:image/png;base64,' in line:
                    splitted[i] = (
                        '![image](data:image/png;base64, ...) already displayed to user'
                    )
            text = '\n'.join(splitted)
            text = truncate_content(text, max_message_chars)

            # Create message content with text
            content = [TextContent(text=text)]

            # Add image URLs if available and vision is active
            if vision_is_active and obs.image_urls:
                content.append(ImageContent(image_urls=obs.image_urls))

            message = Message(role='user', content=content)
        elif isinstance(obs, FileEditObservation):
            text = truncate_content(str(obs), max_message_chars)
            message = Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, FileReadObservation):
            message = Message(
                role='user', content=[TextContent(text=obs.content)]
            )  # Content is already truncated by openhands-aci
        elif isinstance(obs, BrowserOutputObservation):
            text = obs.get_agent_obs_text()
            if (
                obs.trigger_by_action == ActionType.BROWSE_INTERACTIVE
                and enable_som_visual_browsing
                and vision_is_active
            ):
                text += 'Image: Current webpage screenshot (Note that only visible portion of webpage is present in the screenshot. You may need to scroll to view the remaining portion of the web-page.)\n'
                message = Message(
                    role='user',
                    content=[
                        TextContent(text=text),
                        ImageContent(
                            image_urls=[
                                # show set of marks if it exists
                                # otherwise, show raw screenshot when using vision-supported model
                                obs.set_of_marks
                                if obs.set_of_marks is not None
                                and len(obs.set_of_marks) > 0
                                else obs.screenshot
                            ]
                        ),
                    ],
                )
                logger.debug(
                    f'Vision enabled for browsing, showing {"set of marks" if obs.set_of_marks and len(obs.set_of_marks) > 0 else "screenshot"}'
                )
            else:
                message = Message(
                    role='user',
                    content=[TextContent(text=text)],
                )
                logger.debug('Vision disabled for browsing, showing text')
        elif isinstance(obs, AgentDelegateObservation):
            text = truncate_content(
                obs.outputs.get('content', obs.content),
                max_message_chars,
            )
            message = Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, AgentThinkObservation):
            text = truncate_content(obs.content, max_message_chars)
            message = Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, ErrorObservation):
            text = truncate_content(obs.content, max_message_chars)
            text += '\n[Error occurred in processing last action]'
            message = Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, UserRejectObservation):
            text = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
            text += '\n[Last action has been rejected by the user]'
            message = Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, AgentCondensationObservation):
            text = truncate_content(obs.content, max_message_chars)
            message = Message(role='user', content=[TextContent(text=text)])
        elif (
            isinstance(obs, RecallObservation)
            and self.agent_config.enable_prompt_extensions
        ):
            if obs.recall_type == RecallType.WORKSPACE_CONTEXT:
                # everything is optional, check if they are present
                if obs.repo_name or obs.repo_directory:
                    repo_info = RepositoryInfo(
                        repo_name=obs.repo_name or '',
                        repo_directory=obs.repo_directory or '',
                    )
                else:
                    repo_info = None

                date = obs.date

                if obs.runtime_hosts or obs.additional_agent_instructions:
                    runtime_info = RuntimeInfo(
                        available_hosts=obs.runtime_hosts,
                        additional_agent_instructions=obs.additional_agent_instructions,
                        date=date,
                        custom_secrets_descriptions=obs.custom_secrets_descriptions,
                    )
                else:
                    runtime_info = RuntimeInfo(
                        date=date,
                        custom_secrets_descriptions=obs.custom_secrets_descriptions,
                    )

                conversation_instructions = None

                if obs.conversation_instructions:
                    conversation_instructions = ConversationInstructions(
                        content=obs.conversation_instructions
                    )

                repo_instructions = (
                    obs.repo_instructions if obs.repo_instructions else ''
                )

                # Have some meaningful content before calling the template
                has_repo_info = repo_info is not None and (
                    repo_info.repo_name or repo_info.repo_directory
                )
                has_runtime_info = runtime_info is not None and (
                    runtime_info.date or runtime_info.custom_secrets_descriptions
                )
                has_repo_instructions = bool(repo_instructions.strip())
                has_conversation_instructions = conversation_instructions is not None

                # Filter and process microagent knowledge
                filtered_agents = []
                if obs.microagent_knowledge:
                    # Exclude disabled microagents
                    filtered_agents = [
                        agent
                        for agent in obs.microagent_knowledge
                        if agent.name not in self.agent_config.disabled_microagents
                    ]

                has_microagent_knowledge = bool(filtered_agents)

                # Generate appropriate content based on what is present
                message_content = []

                # Build the workspace context information
                if (
                    has_repo_info
                    or has_runtime_info
                    or has_repo_instructions
                    or has_conversation_instructions
                ):
                    formatted_workspace_text = (
                        self.prompt_manager.build_workspace_context(
                            repository_info=repo_info,
                            runtime_info=runtime_info,
                            conversation_instructions=conversation_instructions,
                            repo_instructions=repo_instructions,
                        )
                    )
                    message_content.append(TextContent(text=formatted_workspace_text))

                # Add microagent knowledge if present
                if has_microagent_knowledge:
                    formatted_microagent_text = (
                        self.prompt_manager.build_microagent_info(
                            triggered_agents=filtered_agents,
                        )
                    )
                    message_content.append(TextContent(text=formatted_microagent_text))

                # Return the combined message if we have any content
                if message_content:
                    message = Message(role='user', content=message_content)
                else:
                    return []
            elif obs.recall_type == RecallType.KNOWLEDGE:
                # Use prompt manager to build the microagent info
                # First, filter out agents that appear in earlier RecallObservations
                filtered_agents = self._filter_agents_in_microagent_obs(
                    obs, current_index, events or []
                )

                # Create and return a message if there is microagent knowledge to include
                if filtered_agents:
                    # Exclude disabled microagents
                    filtered_agents = [
                        agent
                        for agent in filtered_agents
                        if agent.name not in self.agent_config.disabled_microagents
                    ]

                    # Only proceed if we still have agents after filtering out disabled ones
                    if filtered_agents:
                        formatted_text = self.prompt_manager.build_microagent_info(
                            triggered_agents=filtered_agents,
                        )

                        return [
                            Message(
                                role='user', content=[TextContent(text=formatted_text)]
                            )
                        ]

                # Return empty list if no microagents to include or all were disabled
                return []
        elif (
            isinstance(obs, RecallObservation)
            and not self.agent_config.enable_prompt_extensions
        ):
            # If prompt extensions are disabled, we don't add any additional info
            # TODO: test this
            return []
        else:
            # If an observation message is not returned, it will cause an error
            # when the LLM tries to return the next message
            raise ValueError(f'Unknown observation type: {type(obs)}')

        # Update the message as tool response properly
        if (tool_call_metadata := getattr(obs, 'tool_call_metadata', None)) is not None:
            tool_call_id_to_message[tool_call_metadata.tool_call_id] = Message(
                role='tool',
                content=message.content,
                tool_call_id=tool_call_metadata.tool_call_id,
                name=tool_call_metadata.function_name,
            )
            # No need to return the observation message
            # because it will be added by get_action_message when all the corresponding
            # tool calls in the SAME request are processed
            return []

        return [message]

    def apply_prompt_caching(self, messages: list[Message]) -> None:
        """Applies caching breakpoints to the messages.

        For new Anthropic API, we only need to mark the last user or tool message as cacheable.
        """
        if len(messages) > 0 and messages[0].role == 'system':
            messages[0].content[-1].cache_prompt = True
        # NOTE: this is only needed for anthropic
        for message in reversed(messages):
            if message.role in ('user', 'tool'):
                message.content[
                    -1
                ].cache_prompt = True  # Last item inside the message content
                break

    def _filter_agents_in_microagent_obs(
        self, obs: RecallObservation, current_index: int, events: list[Event]
    ) -> list[MicroagentKnowledge]:
        """Filter out agents that appear in earlier RecallObservations.

        Args:
            obs: The current RecallObservation to filter
            current_index: The index of the current event in the events list
            events: The list of all events

        Returns:
            list[MicroagentKnowledge]: The filtered list of microagent knowledge
        """
        if obs.recall_type != RecallType.KNOWLEDGE:
            return obs.microagent_knowledge

        # For each agent in the current microagent observation, check if it appears in any earlier microagent observation
        filtered_agents = []
        for agent in obs.microagent_knowledge:
            # Keep this agent if it doesn't appear in any earlier observation
            # that is, if this is the first microagent observation with this microagent
            if not self._has_agent_in_earlier_events(agent.name, current_index, events):
                filtered_agents.append(agent)

        return filtered_agents

    def _has_agent_in_earlier_events(
        self, agent_name: str, current_index: int, events: list[Event]
    ) -> bool:
        """Check if an agent appears in any earlier RecallObservation in the event list.

        Args:
            agent_name: The name of the agent to look for
            current_index: The index of the current event in the events list
            events: The list of all events

        Returns:
            bool: True if the agent appears in an earlier RecallObservation, False otherwise
        """
        for event in events[:current_index]:
            # Note that this check includes the WORKSPACE_CONTEXT
            if isinstance(event, RecallObservation):
                if any(
                    agent.name == agent_name for agent in event.microagent_knowledge
                ):
                    return True
        return False

    @staticmethod
    def _filter_unmatched_tool_calls(
        messages: list[Message],
    ) -> Generator[Message, None, None]:
        """Filter out tool calls that don't have matching tool responses and vice versa.

        This ensures that every tool_call_id in a tool message has a corresponding tool_calls[].id
        in an assistant message, and vice versa. The original list is unmodified, when tool_calls is
        updated the message is copied.

        This does not remove items with id set to None.
        """
        tool_call_ids = {
            tool_call.id
            for message in messages
            if message.tool_calls
            for tool_call in message.tool_calls
            if message.role == 'assistant' and tool_call.id
        }
        tool_response_ids = {
            message.tool_call_id
            for message in messages
            if message.role == 'tool' and message.tool_call_id
        }

        for message in messages:
            # Remove tool messages with no matching assistant tool call
            if message.role == 'tool' and message.tool_call_id:
                if message.tool_call_id in tool_call_ids:
                    yield message

            # Remove assistant tool calls with no matching tool response
            elif message.role == 'assistant' and message.tool_calls:
                all_tool_calls_match = all(
                    tool_call.id in tool_response_ids
                    for tool_call in message.tool_calls
                )
                if all_tool_calls_match:
                    yield message
                else:
                    matched_tool_calls = [
                        tool_call
                        for tool_call in message.tool_calls
                        if tool_call.id in tool_response_ids
                    ]

                    if matched_tool_calls:
                        # Keep an updated message if there are tools calls left
                        yield message.model_copy(
                            update={'tool_calls': matched_tool_calls}
                        )
            else:
                # Any other case is kept
                yield message

    def _ensure_system_message(self, events: list[Event]) -> None:
        """Checks if a SystemMessageAction exists and adds one if not (for legacy compatibility)."""
        # Check if there's a SystemMessageAction in the events
        has_system_message = any(
            isinstance(event, SystemMessageAction) for event in events
        )

        # Legacy behavior: If no SystemMessageAction is found, add one
        if not has_system_message:
            logger.debug(
                '[ConversationMemory] No SystemMessageAction found in events. '
                'Adding one for backward compatibility. '
            )
            system_prompt = self.prompt_manager.get_system_message()
            if system_prompt:
                system_message = SystemMessageAction(content=system_prompt)
                # Insert the system message directly at the beginning of the events list
                events.insert(0, system_message)
                logger.info(
                    '[ConversationMemory] Added SystemMessageAction for backward compatibility'
                )

    def _ensure_initial_user_message(
        self, events: list[Event], initial_user_action: MessageAction
    ) -> None:
        """Checks if the second event is a user MessageAction and inserts the provided one if needed."""
        if (
            not events
        ):  # Should have system message from previous step, but safety check
            logger.error('Cannot ensure initial user message: event list is empty.')
            # Or raise? Let's log for now, _ensure_system_message should handle this.
            return

        # We expect events[0] to be SystemMessageAction after _ensure_system_message
        if len(events) == 1:
            # Only system message exists
            logger.info(
                'Initial user message action was missing. Inserting the initial user message.'
            )
            events.insert(1, initial_user_action)
        elif not isinstance(events[1], MessageAction) or events[1].source != 'user':
            # The second event exists but is not the correct initial user message action.
            # We will insert the correct one provided.
            logger.info(
                'Second event was not the initial user message action. Inserting correct one at index 1.'
            )

            # Insert the user message event at index 1. This will be the second message as LLM APIs expect
            # but something was wrong with the history, so log all we can.
            events.insert(1, initial_user_action)

        # Else: events[1] is already a user MessageAction.
        # Check if it matches the one provided (if any discrepancy, log warning but proceed).
        elif events[1] != initial_user_action:
            logger.debug(
                'The user MessageAction at index 1 does not match the provided initial_user_action. '
                'Proceeding with the one found in condensed history.'
            )