import os
import sys
from collections import deque
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from litellm import ChatCompletionToolParam
from openhands.events.action import Action
from openhands.llm.llm import ModelResponse
import openhands.agenthub.codeact_agent.function_calling as codeact_function_calling
from openhands.agenthub.codeact_agent.tools.bash import create_cmd_run_tool
from openhands.agenthub.codeact_agent.tools.browser import BrowserTool
from openhands.agenthub.codeact_agent.tools.finish import FinishTool
from openhands.agenthub.codeact_agent.tools.ipython import IPythonTool
from openhands.agenthub.codeact_agent.tools.llm_based_edit import LLMBasedFileEditTool
from openhands.agenthub.codeact_agent.tools.str_replace_editor import (
create_str_replace_editor_tool,
)
from openhands.agenthub.codeact_agent.tools.think import ThinkTool
from openhands.controller.agent import Agent
from openhands.controller.state.state import State
from openhands.core.config import AgentConfig
from openhands.core.logger import openhands_logger as logger
from openhands.core.message import Message
from openhands.events.action import AgentFinishAction, MessageAction
from openhands.events.event import Event
from openhands.llm.llm import LLM
from openhands.llm.llm_utils import check_tools
from openhands.memory.condenser import Condenser
from openhands.memory.condenser.condenser import Condensation, View
from openhands.memory.conversation_memory import ConversationMemory
from openhands.runtime.plugins import (
AgentSkillsRequirement,
JupyterRequirement,
PluginRequirement,
)
from openhands.utils.prompt import PromptManager


class CodeActAgent(Agent):
VERSION = '2.2'
"""
The Code Act Agent is a minimalist agent.
The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
### Overview
This agent implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents' **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
The conceptual idea is illustrated below. At each turn, the agent can:
1. **Converse**: Communicate with humans in natural language to ask for clarification, confirmation, etc.
2. **CodeAct**: Choose to perform the task by executing code
- Execute any valid Linux `bash` command
    - Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through a `bash` command; see the plugin system below for more details.

"""
sandbox_plugins: list[PluginRequirement] = [
        # NOTE: AgentSkillsRequirement needs to go before JupyterRequirement:
        # AgentSkillsRequirement provides many Python functions, and it must be
        # initialized before Jupyter so that Jupyter can use those functions.
AgentSkillsRequirement(),
JupyterRequirement(),
]
def __init__(
self,
llm: LLM,
config: AgentConfig,
) -> None:
"""Initializes a new instance of the CodeActAgent class.
Parameters:
        - llm (LLM): The LLM to be used by this agent
- config (AgentConfig): The configuration for this agent
"""
super().__init__(llm, config)
self.pending_actions: deque['Action'] = deque()
self.reset()
self.tools = self._get_tools()
# Create a ConversationMemory instance
self.conversation_memory = ConversationMemory(self.config, self.prompt_manager)
self.condenser = Condenser.from_config(self.config.condenser)
logger.debug(f'Using condenser: {type(self.condenser)}')
@property
def prompt_manager(self) -> PromptManager:
if self._prompt_manager is None:
self._prompt_manager = PromptManager(
prompt_dir=os.path.join(os.path.dirname(__file__), 'prompts'),
)
return self._prompt_manager
def _get_tools(self) -> list['ChatCompletionToolParam']:
        # For these models, we use short tool descriptions (< 1024 tokens)
        # to avoid hitting the OpenAI token limit for tool descriptions.
SHORT_TOOL_DESCRIPTION_LLM_SUBSTRS = ['gpt-', 'o3', 'o1', 'o4']
use_short_tool_desc = False
if self.llm is not None:
use_short_tool_desc = any(
model_substr in self.llm.config.model
for model_substr in SHORT_TOOL_DESCRIPTION_LLM_SUBSTRS
)
tools = []
if self.config.enable_cmd:
tools.append(create_cmd_run_tool(use_short_description=use_short_tool_desc))
if self.config.enable_think:
tools.append(ThinkTool)
if self.config.enable_finish:
tools.append(FinishTool)
if self.config.enable_browsing:
if sys.platform == 'win32':
logger.warning('Windows runtime does not support browsing yet')
else:
tools.append(BrowserTool)
if self.config.enable_jupyter:
tools.append(IPythonTool)
if self.config.enable_llm_editor:
tools.append(LLMBasedFileEditTool)
elif self.config.enable_editor:
tools.append(
create_str_replace_editor_tool(
use_short_description=use_short_tool_desc
)
)
return tools
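    # For illustration: with every `enable_*` flag on and a non-Windows host,
    # the list is roughly [cmd_run, think, finish, browser, ipython,
    # str_replace_editor]; setting `enable_llm_editor` swaps the string-replace
    # editor for the LLM-based file editor.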
def reset(self) -> None:
"""Resets the CodeAct Agent."""
super().reset()
self.pending_actions.clear()
def step(self, state: State) -> 'Action':
"""Performs one step using the CodeAct Agent.
This includes gathering info on previous steps and prompting the model to make a command to execute.
Parameters:
- state (State): used to get updated info
Returns:
- CmdRunAction(command) - bash command to run
- IPythonRunCellAction(code) - IPython code to run
- AgentDelegateAction(agent, inputs) - delegate action for (sub)task
- MessageAction(content) - Message action to run (e.g. ask for clarification)
- AgentFinishAction() - end the interaction
"""
# Continue with pending actions if any
if self.pending_actions:
return self.pending_actions.popleft()
        # If the user has asked to exit, finish the interaction
latest_user_message = state.get_last_user_message()
if latest_user_message and latest_user_message.content.strip() == '/exit':
return AgentFinishAction()
        # Condense the events from the state. If we get a View, we pass its
        # events to the conversation memory for processing; if we get a
        # Condensation event, we return its action instead of a regular action.
        # The controller will immediately ask the agent to step again with the
        # new view.
condensed_history: list[Event] = []
match self.condenser.condensed_history(state):
case View(events=events):
condensed_history = events
case Condensation(action=condensation_action):
return condensation_action
logger.debug(
f'Processing {len(condensed_history)} events from a total of {len(state.history)} events'
)
initial_user_message = self._get_initial_user_message(state.history)
messages = self._get_messages(condensed_history, initial_user_message)
params: dict = {
'messages': self.llm.format_messages_for_llm(messages),
}
params['tools'] = check_tools(self.tools, self.llm.config)
params['extra_body'] = {'metadata': state.to_llm_metadata(agent_name=self.name)}
response = self.llm.completion(**params)
logger.debug(f'Response from LLM: {response}')
actions = self.response_to_actions(response)
logger.debug(f'Actions after response_to_actions: {actions}')
for action in actions:
self.pending_actions.append(action)
return self.pending_actions.popleft()
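    # Sketch of the multi-action case (hypothetical values): if the model
    # returns two tool calls, response_to_actions() yields, e.g.,
    #
    #   [IPythonRunCellAction(code='print(1)'), AgentFinishAction()]
    #
    # step() returns the first immediately; the next step() call drains
    # self.pending_actions before the LLM is queried again.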
def _get_initial_user_message(self, history: list[Event]) -> MessageAction:
"""Finds the initial user message action from the full history."""
initial_user_message: MessageAction | None = None
for event in history:
if isinstance(event, MessageAction) and event.source == 'user':
initial_user_message = event
break
        if initial_user_message is None:
            # This should not happen in a valid conversation; raise so the
            # problem is surfaced rather than silently constructing a dummy action.
            logger.error(
                f'CRITICAL: Could not find the initial user MessageAction in the full history of {len(history)} events.'
            )
            raise ValueError(
                'Initial user message not found in history. Please report this issue.'
            )
return initial_user_message
def _get_messages(
self, events: list[Event], initial_user_message: MessageAction
) -> list[Message]:
"""Constructs the message history for the LLM conversation.
This method builds a structured conversation history by processing events from the state
and formatting them into messages that the LLM can understand. It handles both regular
message flow and function-calling scenarios.
The method performs the following steps:
1. Checks for SystemMessageAction in events, adds one if missing (legacy support)
2. Processes events (Actions and Observations) into messages, including SystemMessageAction
3. Handles tool calls and their responses in function-calling mode
4. Manages message role alternation (user/assistant/tool)
5. Applies caching for specific LLM providers (e.g., Anthropic)
6. Adds environment reminders for non-function-calling mode
        Args:
            events: The list of events to convert to messages
            initial_user_message: The initial user MessageAction for the conversation
Returns:
list[Message]: A list of formatted messages ready for LLM consumption, including:
- System message with prompt (from SystemMessageAction)
- Action messages (from both user and assistant)
- Observation messages (including tool responses)
- Environment reminders (in non-function-calling mode)
Note:
- In function-calling mode, tool calls and their responses are carefully tracked
to maintain proper conversation flow
- Messages from the same role are combined to prevent consecutive same-role messages
- For Anthropic models, specific messages are cached according to their documentation
"""
if not self.prompt_manager:
raise Exception('Prompt Manager not instantiated.')
# Use ConversationMemory to process events (including SystemMessageAction)
messages = self.conversation_memory.process_events(
condensed_history=events,
initial_user_action=initial_user_message,
max_message_chars=self.llm.config.max_message_chars,
vision_is_active=self.llm.vision_is_active(),
)
if self.llm.is_caching_prompt_active():
self.conversation_memory.apply_prompt_caching(messages)
return messages
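    # Sketch of a typical result (roles only, assuming function calling):
    #
    #   [system, user(initial task), assistant(tool call), tool(observation),
    #    assistant, ...]
    #
    # with cache markers applied on top for providers such as Anthropic when
    # prompt caching is active.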
def response_to_actions(self, response: 'ModelResponse') -> list['Action']:
return codeact_function_calling.response_to_actions(
response,
mcp_tool_names=list(self.mcp_tools.keys()),
)
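    # Sketch (hypothetical values): a response whose single tool call is
    # execute_bash(command='ls') maps to [CmdRunAction(command='ls')]; tool
    # names listed in mcp_tool_names are routed to MCP actions instead of the
    # built-in ones.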