##################################################################################################
# Adapted from https://github.com/TheAgentCompany/TheAgentCompany/blob/main/evaluation/run_eval.py
##################################################################################################
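# Example invocation (illustrative only; adjust the script name and substitute the
# placeholder LLM config names with whatever you have defined for OpenHands):
#
#   python run_eval.py \
#       --task-image-name ghcr.io/theagentcompany/example-image:1.0.0 \
#       --outputs-path ./outputs \
#       --server-hostname localhost \
#       --agent-llm-config <agent-llm-config-name> \
#       --env-llm-config <env-llm-config-name>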
import asyncio
import base64
import json
import os
import shutil
import tempfile

import yaml

from browsing import pre_login
from evaluation.utils.shared import get_default_sandbox_config_for_eval
from openhands.controller.state.state import State
from openhands.core.config import (
    LLMConfig,
    OpenHandsConfig,
    get_agent_config_arg,
    get_llm_config_arg,
    get_parser,
)
from openhands.core.config.agent_config import AgentConfig
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import BrowserOutputObservation, CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync
def get_config(
    base_container_image: str,
    task_short_name: str,
    mount_path_on_host: str,
    llm_config: LLMConfig,
    agent_config: AgentConfig | None,
) -> OpenHandsConfig:
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = base_container_image
    sandbox_config.enable_auto_lint = True
    # If the web services are running on the host machine, this must be set to True
    sandbox_config.use_host_network = True
    config = OpenHandsConfig(
        run_as_openhands=False,
        max_budget_per_task=4,
        max_iterations=100,
        save_trajectory_path=os.path.join(
            mount_path_on_host, f'traj_{task_short_name}.json'
        ),
        sandbox=sandbox_config,
        # We mount the trajectories path so that trajectories generated by the OpenHands
        # controller are accessible to the evaluator inside the runtime container.
        workspace_mount_path=mount_path_on_host,
        workspace_mount_path_in_sandbox='/outputs',
    )
    config.set_llm_config(llm_config)
    if agent_config:
        config.set_agent_config(agent_config)
    else:
        logger.info('Agent config not provided, using default settings')
        agent_config = AgentConfig(
            enable_prompt_extensions=False,
        )
        config.set_agent_config(agent_config)
    return config
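# Note: mount_path_on_host is where OpenHands writes traj_<task>.json on the host;
# because the same directory is mounted as the workspace, it is visible inside the
# sandbox at /outputs (used by the evaluator further below).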
def load_dependencies(runtime: Runtime) -> list[str]:
    """
    Every task has a dependencies.yml file, which lists all the services that the
    task depends on. This function loads the file and returns all dependent service names.
    """
    command = 'cat /utils/dependencies.yml'
    action = CmdRunAction(command=command)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs: CmdOutputObservation = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0
    dependencies = yaml.safe_load(obs.content)
    if dependencies is None:
        dependencies = []
    return dependencies
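# For illustration only: a task that depends solely on GitLab would ship a
# /utils/dependencies.yml containing a one-entry YAML list, e.g.
#   - gitlab
# The exact service names are defined by each task image.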
def init_task_env(runtime: Runtime, hostname: str, env_llm_config: LLMConfig):
    command = (
        f'SERVER_HOSTNAME={hostname} '
        f'LITELLM_API_KEY={env_llm_config.api_key.get_secret_value() if env_llm_config.api_key else None} '
        f'LITELLM_BASE_URL={env_llm_config.base_url} '
        f'LITELLM_MODEL={env_llm_config.model} '
        'bash /utils/init.sh'
    )
    action = CmdRunAction(command=command)
    action.set_hard_timeout(900)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0
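# /utils/init.sh ships with the task image and consumes the SERVER_HOSTNAME and
# LITELLM_* environment variables set above; what it actually sets up is
# task-specific.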
def codeact_user_response(state: State) -> str:
    msg = (
        'Please continue working on the task on whatever approach you think is suitable.\n'
        'If you think you have solved the task, please finish the interaction.\n'
        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
    )
    if state.history:
        # Count how many scripted "user" replies the agent has already received.
        user_msgs = [
            event
            for event in state.history
            if isinstance(event, MessageAction) and event.source == 'user'
        ]
        if len(user_msgs) >= 2:
            # The agent has already asked twice; on the third attempt, let it know
            # that it is allowed to give up.
            return (
                msg
                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
            )
    return msg
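# This function is wired into run_controller as fake_user_response_fn (see
# run_solver below), so it is the reply the agent receives whenever it tries to
# ask the user a question.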
def run_solver(
    runtime: Runtime,
    task_name: str,
    config: OpenHandsConfig,
    dependencies: list[str],
    save_final_state: bool,
    state_dir: str,
    save_screenshots: bool,
    screenshots_dir: str,
) -> State:
    instruction = 'Complete the task in /instruction/task.md'

    if 'gitlab' in dependencies:
        instruction += "\n\nGitlab username is 'root' and password is 'theagentcompany'"

    state: State | None = asyncio.run(
        run_controller(
            config=config,
            sid=task_name,
            initial_user_action=MessageAction(content=instruction),
            runtime=runtime,
            fake_user_response_fn=codeact_user_response,
        )
    )
    # run_controller is annotated to possibly return None; fail fast here because
    # the code below (and this function's return type) requires a final State.
    assert state is not None, 'run_controller did not return a final state'
    logger.info(state)

    if save_screenshots:
        screenshots_dir = os.path.join(screenshots_dir, task_name)
        os.makedirs(screenshots_dir, exist_ok=True)
        for image_id, obs in enumerate(state.history):
            if isinstance(obs, BrowserOutputObservation):
                image_data = base64.b64decode(
                    obs.screenshot.replace('data:image/png;base64,', '')
                )
                with open(
                    os.path.join(screenshots_dir, f'{image_id}.png'), 'wb'
                ) as file:
                    file.write(image_data)
                if obs.set_of_marks:
                    som_image_data = base64.b64decode(
                        obs.set_of_marks.replace('data:image/png;base64,', '')
                    )
                    with open(
                        os.path.join(screenshots_dir, f'{image_id}_som.png'), 'wb'
                    ) as file:
                        file.write(som_image_data)

    if save_final_state:
        os.makedirs(state_dir, exist_ok=True)
        with open(os.path.join(state_dir, f'state_{task_name}.json'), 'w') as file:
            json.dump(str(state), file)
    return state
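# The trajectory itself is written by OpenHands to config.save_trajectory_path,
# i.e. traj_<task>.json inside the mounted host directory; the evaluator reads it
# from the sandbox side as /outputs/traj_<task>.json.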
def run_evaluator(
    runtime: Runtime, env_llm_config: LLMConfig, trajectory_path: str, result_path: str
):
    command = (
        f'LITELLM_API_KEY={env_llm_config.api_key.get_secret_value() if env_llm_config.api_key else None} '
        f'LITELLM_BASE_URL={env_llm_config.base_url} '
        f'LITELLM_MODEL={env_llm_config.model} '
        f"DECRYPTION_KEY='theagentcompany is all you need' "  # hardcoded key
        f'python_default /utils/eval.py --trajectory_path {trajectory_path} --result_path {result_path}'
    )
    action = CmdRunAction(command=command)
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0
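# eval.py writes its result to result_path under /outputs inside the container,
# which maps back to the mounted temp directory on the host; the main block below
# moves it to --outputs-path afterwards.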
if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--task-image-name',
        type=str,
        default='ghcr.io/theagentcompany/example-image:1.0.0',
        help='Task image name',
    )
    parser.add_argument(
        '--outputs-path',
        type=str,
        default='./outputs',
        help='Folder path to save trajectories and evaluation results',
    )
    parser.add_argument(
        '--server-hostname',
        type=str,
        default='localhost',
        help='Server hostname, e.g. localhost to access the host machine from the container, '
        'assuming the task docker container is run with `--network host` flag',
    )
    parser.add_argument(
        '--agent-llm-config',
        type=str,
        default=None,
        help='LLM config for agent',
    )
    parser.add_argument(
        '--env-llm-config',
        type=str,
        default=None,
        help='LLM config for evaluation environment (NPC & llm-based evaluator)',
    )
    args, _ = parser.parse_known_args()

    agent_config: AgentConfig | None = None
    if args.agent_config:
        agent_config = get_agent_config_arg(args.agent_config)

    agent_llm_config: LLMConfig | None = None
    if args.agent_llm_config:
        agent_llm_config = get_llm_config_arg(args.agent_llm_config)
    if agent_llm_config is None:
        raise ValueError(
            f'Could not find LLM config for agent: --agent-llm-config {args.agent_llm_config}'
        )
    if agent_llm_config.api_key is None:
        raise ValueError('LLM API key is not set for agent')

    env_llm_config: LLMConfig | None = None
    if args.env_llm_config:
        env_llm_config = get_llm_config_arg(args.env_llm_config)
    if env_llm_config is None:
        raise ValueError(
            f'Could not find LLM config for evaluation environment: --env-llm-config {args.env_llm_config}'
        )
    if env_llm_config.api_key is None:
        raise ValueError('LLM API key is not set for evaluation environment')
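    # Note: --agent-llm-config and --env-llm-config are resolved via
    # get_llm_config_arg, so they are expected to name LLM config groups known to
    # OpenHands (assumption: e.g. an [llm.<name>] section in config.toml), not raw
    # model identifiers.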
    task_short_name = args.task_image_name.split('/')[-1].split(':')[0]
    logger.info(
        f'Task image name is {args.task_image_name}, short name is {task_short_name}'
    )

    # Mount a temporary directory to pass the trajectory from host to container,
    # and to pass the evaluation result from container to host:
    # 1) the trajectory is dumped by the OpenHands library (on the host machine), but it is
    #    needed by the evaluator (in the container), so the mounted directory passes it in;
    # 2) the evaluation result is written by the evaluator (in the container), but it must be
    #    persisted on the host machine, so the mounted directory passes it out.
    if os.getenv('TMPDIR') and os.path.exists(os.getenv('TMPDIR')):
        temp_dir = os.path.abspath(os.getenv('TMPDIR'))
    else:
        temp_dir = tempfile.mkdtemp()

    config: OpenHandsConfig = get_config(
        args.task_image_name, task_short_name, temp_dir, agent_llm_config, agent_config
    )
    runtime: Runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)

    init_task_env(runtime, args.server_hostname, env_llm_config)

    dependencies = load_dependencies(runtime)
    logger.info(f'Service dependencies: {dependencies}')

    try:
        pre_login(
            runtime,
            dependencies,
            save_screenshots=True,
            screenshots_dir=os.path.join(
                os.path.abspath(args.outputs_path), 'screenshots'
            ),
        )
    except Exception as e:
        logger.error(f'Failed to pre-login: {e}')

        # before giving up, let's try to init and login again
        init_task_env(runtime, args.server_hostname, env_llm_config)
        pre_login(
            runtime,
            dependencies,
            save_screenshots=True,
            screenshots_dir=os.path.join(
                os.path.abspath(args.outputs_path), 'screenshots'
            ),
        )
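    # pre_login (from the local browsing module) presumably signs in to the
    # dependent services via the browser before the agent starts, saving
    # screenshots under <outputs-path>/screenshots; if the retry above also fails,
    # the exception propagates and the run aborts.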
    state = run_solver(
        runtime,
        task_short_name,
        config,
        dependencies,
        save_final_state=True,
        state_dir=os.path.abspath(args.outputs_path),
        save_screenshots=True,
        screenshots_dir=os.path.join(os.path.abspath(args.outputs_path), 'screenshots'),
    )

    # These are absolute paths inside the runtime container
    trajectory_path = f'/outputs/traj_{task_short_name}.json'
    result_path = f'/outputs/eval_{task_short_name}.json'

    run_evaluator(runtime, env_llm_config, trajectory_path, result_path)

    # Finally, move the trajectory file and evaluation result from the mount path
    # on the host (temp dir) to the outputs path.
    shutil.move(
        os.path.join(temp_dir, f'traj_{task_short_name}.json'),
        os.path.join(
            os.path.abspath(args.outputs_path), f'traj_{task_short_name}.json'
        ),
    )
    shutil.move(
        os.path.join(temp_dir, f'eval_{task_short_name}.json'),
        os.path.join(
            os.path.abspath(args.outputs_path), f'eval_{task_short_name}.json'
        ),
    )