OpenHands / tests /runtime /test_replay.py
Backup-bdg's picture
Upload 964 files
51ff9e5 verified
raw
history blame
5.5 kB
"""Replay tests"""
import asyncio
from pathlib import Path
from conftest import _close_test_runtime, _load_runtime
from openhands.controller.state.state import State
from openhands.core.config.config_utils import OH_DEFAULT_AGENT
from openhands.core.config.openhands_config import OpenHandsConfig
from openhands.core.main import run_controller
from openhands.core.schema.agent import AgentState
from openhands.events.action.empty import NullAction
from openhands.events.action.message import MessageAction
from openhands.events.event import EventSource
from openhands.events.observation.commands import CmdOutputObservation
def _get_config(trajectory_name: str, agent: str = OH_DEFAULT_AGENT):
    """Build an ``OpenHandsConfig`` that replays a stored trajectory.

    Args:
        trajectory_name: Base name (without ``.json``) of a file in the
            ``trajs`` directory that sits next to this test module.
        agent: Name passed as the config's ``default_agent``.

    Returns:
        A config whose ``replay_trajectory_path`` is the resolved absolute
        path of the trajectory file, with no workspace configured.
    """
    trajs_dir = Path(__file__).parent / 'trajs'
    trajectory_file = (trajs_dir / f'{trajectory_name}.json').resolve()
    return OpenHandsConfig(
        default_agent=agent,
        run_as_openhands=False,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
        replay_trajectory_path=str(trajectory_file),
    )
def test_simple_replay(temp_dir, runtime_cls, run_as_openhands):
    """Replay the ``basic`` trajectory and expect the agent to finish.

    A simple replay test that involves simple terminal operations and edits
    (creating a simple 2048 game), using the default agent.
    """
    runtime, config = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    # Close the runtime even when the replay raises or an assertion fails, so
    # a failing test does not leave the runtime open.
    try:
        config.replay_trajectory_path = str(
            (Path(__file__).parent / 'trajs' / 'basic.json').resolve()
        )
        config.security.confirmation_mode = False

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
            )
        )

        # The result is annotated as possibly None; fail with a readable
        # assertion rather than an AttributeError on the next line.
        assert state is not None, 'run_controller returned no state'
        assert state.agent_state == AgentState.FINISHED
    finally:
        _close_test_runtime(runtime)
def test_simple_gui_replay(temp_dir, runtime_cls, run_as_openhands):
    """Replay the ``basic_gui_mode`` trajectory exported from GUI mode.

    A simple replay test that involves simple terminal operations and edits
    (writing a Vue.js App), using the default agent.

    Note:
        1. This trajectory is exported from GUI mode, meaning it has extra
           environmental actions that don't appear in headless mode's
           trajectories.
        2. In GUI mode, agents typically don't finish; rather, they wait for
           the next task from the user, so this exported trajectory ends with
           awaiting_user_input.
    """
    # NOTE(review): the config returned by _load_runtime is intentionally not
    # used here; the replay runs with the fresh config built by _get_config.
    # Confirm that the runtime and this config are meant to be decoupled.
    runtime, _ = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    # Close the runtime even when the replay raises or an assertion fails.
    try:
        config = _get_config('basic_gui_mode')
        config.security.confirmation_mode = False

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
                # exit on message, otherwise this would be stuck on waiting
                # for user input
                exit_on_message=True,
            )
        )

        # The result is annotated as possibly None; fail with a readable
        # assertion rather than an AttributeError on the next line.
        assert state is not None, 'run_controller returned no state'
        assert state.agent_state == AgentState.FINISHED
    finally:
        _close_test_runtime(runtime)
def test_replay_wrong_initial_state(temp_dir, runtime_cls, run_as_openhands):
    """Replay a trajectory against an inconsistent initial state.

    Replay requires a consistent initial state to start with, otherwise it
    might produce garbage. The trajectory used in this test assumes the
    existence of a file named 'game_2048.py', which doesn't exist when we
    replay the trajectory (a so-called inconsistent initial state). This test
    demonstrates what that looks like: the following events are still replayed
    even though they are meaningless, and at least one command observation in
    the history reports a non-zero exit code.
    """
    runtime, config = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    # Close the runtime even when the replay raises or an assertion fails.
    try:
        config.replay_trajectory_path = str(
            (Path(__file__).parent / 'trajs' / 'wrong_initial_state.json').resolve()
        )
        config.security.confirmation_mode = False

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
            )
        )

        # The result is annotated as possibly None; fail with a readable
        # assertion rather than an AttributeError on the next line.
        assert state is not None, 'run_controller returned no state'
        assert state.agent_state == AgentState.FINISHED

        # The replay is expected to contain at least one failed command.
        has_error_in_action = any(
            isinstance(event, CmdOutputObservation) and event.exit_code != 0
            for event in state.history
        )
        assert has_error_in_action
    finally:
        _close_test_runtime(runtime)
def test_replay_basic_interactions(temp_dir, runtime_cls, run_as_openhands):
    """Replay a trajectory that contains user messages in the middle.

    This tests two things:
    1) The controller should be able to replay all actions without human
       interference (no asking for user input).
    2) The user messages in the trajectory should appear in the history.
    """
    # NOTE(review): the config returned by _load_runtime is intentionally not
    # used here; the replay runs with the fresh config built by _get_config.
    # Confirm that the runtime and this config are meant to be decoupled.
    runtime, _ = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    # Close the runtime even when the replay raises or an assertion fails.
    try:
        config = _get_config('basic_interactions')
        config.security.confirmation_mode = False

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
            )
        )

        # The result is annotated as possibly None; fail with a readable
        # assertion rather than an AttributeError on the next line.
        assert state is not None, 'run_controller returned no state'
        assert state.agent_state == AgentState.FINISHED

        # all user messages appear in the history, so that after a replay
        # (assuming the trajectory doesn't end with `finish` action), LLM
        # knows about all the context and can continue
        expected_user_messages = [
            "what's 1+1?",
            "No, I mean by Goldbach's conjecture!",
            'Finish please',
        ]
        # Compare the whole sequence at once: a missing, extra, or reordered
        # user message then fails as a readable list diff instead of an
        # IndexError from indexing past the expected list.
        replayed_user_messages = [
            event.message
            for event in state.history
            if isinstance(event, MessageAction) and event._source == EventSource.USER
        ]
        assert replayed_user_messages == expected_user_messages
    finally:
        _close_test_runtime(runtime)