File size: 5,504 Bytes
51ff9e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""Replay tests"""

import asyncio
from pathlib import Path

from conftest import _close_test_runtime, _load_runtime

from openhands.controller.state.state import State
from openhands.core.config.config_utils import OH_DEFAULT_AGENT
from openhands.core.config.openhands_config import OpenHandsConfig
from openhands.core.main import run_controller
from openhands.core.schema.agent import AgentState
from openhands.events.action.empty import NullAction
from openhands.events.action.message import MessageAction
from openhands.events.event import EventSource
from openhands.events.observation.commands import CmdOutputObservation


def _get_config(trajectory_name: str, agent: str = OH_DEFAULT_AGENT):
    """
    Build an OpenHandsConfig that replays a trajectory from the ``trajs`` folder.

    Args:
        trajectory_name: File stem of the trajectory; ``<trajectory_name>.json``
            is looked up in the ``trajs`` directory next to this test module.
        agent: Value used for ``default_agent`` in the returned config.

    Returns:
        An OpenHandsConfig whose ``replay_trajectory_path`` is the resolved,
        absolute path of the trajectory file, with no workspace configured.
    """
    trajs_dir = Path(__file__).parent / 'trajs'
    trajectory_file = (trajs_dir / f'{trajectory_name}.json').resolve()
    return OpenHandsConfig(
        default_agent=agent,
        run_as_openhands=False,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
        replay_trajectory_path=str(trajectory_file),
    )


def test_simple_replay(temp_dir, runtime_cls, run_as_openhands):
    """
    A simple replay test that involves simple terminal operations and edits
    (creating a simple 2048 game), using the default agent

    The runtime is closed in a ``finally`` clause so that it is released even
    when the replay raises or an assertion fails.
    """
    runtime, config = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
        config.replay_trajectory_path = str(
            (Path(__file__).parent / 'trajs' / 'basic.json').resolve()
        )
        config.security.confirmation_mode = False

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
            )
        )

        # Fail with a readable message instead of an AttributeError on None.
        assert state is not None, 'run_controller returned no state'
        assert state.agent_state == AgentState.FINISHED
    finally:
        _close_test_runtime(runtime)


def test_simple_gui_replay(temp_dir, runtime_cls, run_as_openhands):
    """
    A simple replay test that involves simple terminal operations and edits
    (writing a Vue.js App), using the default agent

    Note:
    1. This trajectory is exported from GUI mode, meaning it has extra
    environmental actions that don't appear in headless mode's trajectories
    2. In GUI mode, agents typically don't finish; rather, they wait for the next
    task from the user, so this exported trajectory ends with awaiting_user_input
    3. The runtime is closed in a ``finally`` clause so that it is released even
    when the replay raises or an assertion fails.
    """
    # Only the runtime is used from _load_runtime; the controller runs with a
    # fresh config built by _get_config below.
    runtime, _ = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
        config = _get_config('basic_gui_mode')
        config.security.confirmation_mode = False

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
                # exit on message, otherwise this would be stuck on waiting for user input
                exit_on_message=True,
            )
        )

        # Fail with a readable message instead of an AttributeError on None.
        assert state is not None, 'run_controller returned no state'
        assert state.agent_state == AgentState.FINISHED
    finally:
        _close_test_runtime(runtime)


def test_replay_wrong_initial_state(temp_dir, runtime_cls, run_as_openhands):
    """
    Replay requires a consistent initial state to start with, otherwise it might
    be producing garbage. The trajectory used in this test assumes existence of
    a file named 'game_2048.py', which doesn't exist when we replay the trajectory
    (so called inconsistent initial states). This test demonstrates how this would
    look like: the following events would still be replayed even though they are
    meaningless.

    The runtime is closed in a ``finally`` clause so that it is released even
    when the replay raises or an assertion fails.
    """
    runtime, config = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
        config.replay_trajectory_path = str(
            (Path(__file__).parent / 'trajs' / 'wrong_initial_state.json').resolve()
        )
        config.security.confirmation_mode = False

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
            )
        )

        # Fail with a readable message instead of an AttributeError on None.
        assert state is not None, 'run_controller returned no state'
        assert state.agent_state == AgentState.FINISHED

        # At least one replayed command must have exited with a non-zero code.
        has_error_in_action = any(
            isinstance(event, CmdOutputObservation) and event.exit_code != 0
            for event in state.history
        )
        assert has_error_in_action
    finally:
        _close_test_runtime(runtime)


def test_replay_basic_interactions(temp_dir, runtime_cls, run_as_openhands):
    """
    Replay a trajectory that involves interactions, i.e. with user messages
    in the middle. This tests two things:
    1) The controller should be able to replay all actions without human
    interference (no asking for user input).
    2) The user messages in the trajectory should appear in the history.

    The runtime is closed in a ``finally`` clause so that it is released even
    when the replay raises or an assertion fails.
    """
    # Only the runtime is used from _load_runtime; the controller runs with a
    # fresh config built by _get_config below.
    runtime, _ = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
        config = _get_config('basic_interactions')
        config.security.confirmation_mode = False

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=NullAction(),
                runtime=runtime,
            )
        )

        # Fail with a readable message instead of an AttributeError on None.
        assert state is not None, 'run_controller returned no state'
        assert state.agent_state == AgentState.FINISHED

        # all user messages appear in the history, so that after a replay (assuming
        # the trajectory doesn't end with `finish` action), LLM knows about all the
        # context and can continue
        expected_user_messages = [
            "what's 1+1?",
            "No, I mean by Goldbach's conjecture!",
            'Finish please',
        ]
        # Comparing whole lists checks content, order and count at once, and
        # reports extra or missing messages as an assertion diff rather than
        # an IndexError.
        actual_user_messages = [
            event.message
            for event in state.history
            if isinstance(event, MessageAction) and event._source == EventSource.USER
        ]
        assert actual_user_messages == expected_user_messages
    finally:
        _close_test_runtime(runtime)