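"""Run SWE-bench / SWT-bench inference with OpenHands agents.

This script loads a SWE-bench-style dataset, builds a sandboxed Docker runtime
per instance, lets the configured agent work on the issue (or, in 'swt' modes,
write failing reproduction tests), and writes the resulting git patch together
with the agent history and metrics to an output.jsonl file. An optional
iterative mode re-runs instances that the AgentFinishedCritic marks as failed.
"""
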
import asyncio
import copy
import json
import os
import tempfile
from typing import Any, Literal

import pandas as pd
import toml
from datasets import load_dataset

import openhands.agenthub
from evaluation.benchmarks.swe_bench.binary_patch_utils import (
    remove_binary_diffs,
    remove_binary_files_from_git,
)
from evaluation.benchmarks.swe_bench.resource.mapping import (
    get_instance_resource_factor,
)
from evaluation.benchmarks.swe_bench.resource.swt_bench_constants import (
    MAP_REPO_TO_INSTALL,
    MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE,
    MAP_VERSION_TO_INSTALL,
)
from evaluation.utils.shared import (
    EvalException,
    EvalMetadata,
    EvalOutput,
    assert_and_raise,
    codeact_user_response,
    get_default_sandbox_config_for_eval,
    get_metrics,
    is_fatal_evaluation_error,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
    update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
    AgentConfig,
    OpenHandsConfig,
    get_llm_config_arg,
    get_parser,
)
from openhands.core.config.condenser_config import NoOpCondenserConfig
from openhands.core.config.utils import get_condenser_config_arg
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.critic import AgentFinishedCritic
from openhands.events.action import CmdRunAction, FileReadAction, MessageAction
from openhands.events.observation import (
    CmdOutputObservation,
    ErrorObservation,
    FileReadObservation,
)
from openhands.events.serialization.event import event_from_dict, event_to_dict
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync
from openhands.utils.shutdown_listener import sleep_if_should_continue

USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
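
# Evaluation mode: 'swe' asks the agent to fix the issue, while 'swt' and
# 'swt-ci' ask it to write failing reproduction tests ('swt-ci' additionally
# names the test command and runs the repo's install steps during runtime setup).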
BenchMode = Literal['swe', 'swt', 'swt-ci']

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
}


def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
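    """Return the workspace directory name, e.g. 'django__django__3.0' for repo 'django/django', version '3.0'."""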
    return f'{instance.repo}__{instance.version}'.replace('/', '__')


def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageAction:
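    """Build the initial user MessageAction for an instance.

    The instruction depends on metadata.details['mode']: 'swe' asks the agent to
    resolve the issue, while 'swt'/'swt-ci' ask it to write failing reproduction
    tests. Image URLs are attached for multimodal instances.
    """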
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
    mode = metadata.details['mode']

    if mode.startswith('swt'):
        test_instructions = (
            f'The following command can be used to run the tests: `{list(MAP_REPO_TO_TEST_FRAMEWORK_VERBOSE[instance.repo].values())[0]}`. Make sure they fail in the expected way.\n'
            if mode.endswith('ci')
            else ''
        )
        instruction = f"""\
<uploaded_files>
/workspace/{workspace_dir_name}
</uploaded_files>

I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:

<issue_description>
{instance.problem_statement}
</issue_description>

Can you help me implement the necessary changes to the repository to test whether the issue in <issue_description> was resolved?
I will take care of all changes to any of the non-test files. This means you DON'T have to modify the actual logic and ONLY have to update test logic and tests!
Your task is to make the minimal changes to test files in the /workspace directory to reproduce the issue in the <issue_description>, i.e., such that the generated tests fail in the current state (where the issue is unresolved) and pass once the issue is resolved.

Follow these steps to reproduce the issue:
1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.
2. Create a script `reproduction.py` to reproduce the error and execute it with `python reproduction.py` using the BashTool, to confirm the error.
3. Edit the source code of the repo to integrate your reproduction script into the test framework.
4. Run the test framework and make sure your tests fail! Only submit FAILING tests! Never submit passing tests.
{test_instructions}Your thinking should be thorough and so it's fine if it's very long.
"""
    else:
        instruction = f"""
<uploaded_files>
/workspace/{workspace_dir_name}
</uploaded_files>

I've uploaded a python code repository in the directory {workspace_dir_name}. Consider the following issue description:

<issue_description>
{instance.problem_statement}
</issue_description>

Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?
I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!
Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.
Your task is to make the minimal changes to non-test files in the /workspace/{workspace_dir_name} directory to ensure the <issue_description> is satisfied.

Follow these phases to resolve the issue:

Phase 1. READING: read the problem and reword it in clearer terms
1.1 If there are code or config snippets, express in words any best practices or conventions in them.
1.2 Highlight error messages, method names, variables, file names, stack traces, and technical details.
1.3 Explain the problem in clear terms.
1.4 Enumerate the steps to reproduce the problem.
1.5 Highlight any best practices to take into account when testing and fixing the issue.

Phase 2. RUNNING: install and run the tests on the repository
2.1 Follow the readme.
2.2 Install the environment and anything needed.
2.3 Iterate and figure out how to run the tests.

Phase 3. EXPLORATION: find the files that are related to the problem and possible solutions
3.1 Use `grep` to search for relevant methods, classes, keywords and error messages.
3.2 Identify all files related to the problem statement.
3.3 Propose the methods and files to fix the issue and explain why.
3.4 From the possible file locations, select the most likely location to fix the issue.

Phase 4. TEST CREATION: before implementing any fix, create a script to reproduce and verify the issue.
4.1 Look at existing test files in the repository to understand the test format/structure.
4.2 Create a minimal reproduction script that reproduces the located issue.
4.3 Run the reproduction script to confirm you are reproducing the issue.
4.4 Adjust the reproduction script as necessary.

Phase 5. FIX ANALYSIS: state clearly the problem and how to fix it
5.1 State clearly what the problem is.
5.2 State clearly where the problem is located.
5.3 State clearly how the test reproduces the issue.
5.4 State clearly the best practices to take into account in the fix.
5.5 State clearly how to fix the problem.

Phase 6. FIX IMPLEMENTATION: Edit the source code to implement your chosen solution.
6.1 Make minimal, focused changes to fix the issue.

Phase 7. VERIFICATION: Test your implementation thoroughly.
7.1 Run your reproduction script to verify the fix works.
7.2 Add edge cases to your test script to ensure comprehensive coverage.
7.3 Run existing tests related to the modified code to ensure you haven't broken anything.

Phase 8. FINAL REVIEW: Carefully re-read the problem description and compare your changes with the base commit {instance['base_commit']}.
8.1 Ensure you've fully addressed all requirements.
8.2 Run any tests in the repository related to:
    8.2.1 The issue you are fixing
    8.2.2 The files you modified
    8.2.3 The functions you changed
8.3 If any tests fail, revise your implementation until all tests pass.

Be thorough in your exploration, testing, and reasoning. It's fine if your thinking process is lengthy - quality and completeness are more important than brevity.
"""

    if RUN_WITH_BROWSING:
        instruction += (
            '<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
        )

    if 'image_assets' in instance:
        assets = json.loads(instance['image_assets'])
        assert 'problem_statement' in assets, (
            'problem_statement is required in image_assets'
        )
        image_urls = assets['problem_statement']
        return MessageAction(content=instruction, image_urls=image_urls)
    return MessageAction(content=instruction)


# TODO: migrate all swe-bench docker to ghcr.io/openhands
DEFAULT_DOCKER_IMAGE_PREFIX = os.environ.get(
    'EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/'
)
logger.info(f'Default docker image prefix: {DEFAULT_DOCKER_IMAGE_PREFIX}')


def get_instance_docker_image(
    instance_id: str,
    swebench_official_image: bool = False,
) -> str:
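    """Return the Docker image name for an instance.

    Uses the official SWE-Bench image (docker.io/swebench/...) when
    swebench_official_image is True, otherwise the OpenHands-built image under
    DEFAULT_DOCKER_IMAGE_PREFIX with '__' replaced by '_s_'.
    """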
    if swebench_official_image:
        # Official SWE-Bench image
        # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
        docker_image_prefix = 'docker.io/swebench/'
        repo, name = instance_id.split('__')
        image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
        logger.debug(f'Using official SWE-Bench image: {image_name}')
        return image_name
    else:
        # OpenHands version of the image
        docker_image_prefix = DEFAULT_DOCKER_IMAGE_PREFIX
        image_name = 'sweb.eval.x86_64.' + instance_id
        image_name = image_name.replace(
            '__', '_s_'
        )  # to comply with docker image naming convention
        return (docker_image_prefix.rstrip('/') + '/' + image_name).lower()


def get_config(
    instance: pd.Series,
    metadata: EvalMetadata,
) -> OpenHandsConfig:
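    """Build the OpenHandsConfig (sandbox, LLM, and agent settings) for one instance."""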
    # We use a different instance image for each instance of swe-bench eval
    use_swebench_official_image = 'swe-gym' not in metadata.dataset.lower()
    base_container_image = get_instance_docker_image(
        instance['instance_id'],
        swebench_official_image=use_swebench_official_image,
    )
    logger.info(
        f'Using instance container image: {base_container_image}. '
        f'Please make sure this image exists. '
        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
    )

    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = base_container_image
    sandbox_config.enable_auto_lint = True
    sandbox_config.use_host_network = False
    # Add platform to the sandbox config to solve issue 4401
    sandbox_config.platform = 'linux/amd64'
    sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
        dataset_name=metadata.dataset,
        instance_id=instance['instance_id'],
    )

    config = OpenHandsConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        max_iterations=metadata.max_iterations,
        runtime=os.environ.get('RUNTIME', 'docker'),
        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
    config.set_llm_config(
        update_llm_config_for_completions_logging(
            metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
        )
    )
    agent_config = AgentConfig(
        enable_jupyter=False,
        enable_browsing=RUN_WITH_BROWSING,
        enable_llm_editor=False,
        enable_mcp=False,
        condenser=metadata.condenser_config,
        enable_prompt_extensions=False,
    )
    config.set_agent_config(agent_config)
    return config


def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required
    metadata: EvalMetadata,
):
    """Initialize the runtime for the agent.

    This function is called before the runtime is used to run the agent.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Initialization Fn')
    logger.info('-' * 30)
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
    obs: CmdOutputObservation

    # Set instance id and git configuration
    action = CmdRunAction(
        command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc && git config --global core.pager "" && git config --global diff.binary false"""
    )
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to export SWE_INSTANCE_ID and configure git: {str(obs)}',
    )

    action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')

    # inject the init script
    script_dir = os.path.dirname(__file__)

    # inject the instance info
    action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to create /swe_util/eval_data/instances: {str(obs)}',
    )

    swe_instance_json_name = 'swe-bench-instance.json'
    with tempfile.TemporaryDirectory() as temp_dir:
        # Construct the full path for the desired file name within the temporary directory
        temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
        # Write to the file with the desired name within the temporary directory
        with open(temp_file_path, 'w') as f:
            if not isinstance(instance, dict):
                json.dump([instance.to_dict()], f)
            else:
                json.dump([instance], f)

        # Copy the file to the desired location
        runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')

    # inject the instance swe entry
    runtime.copy_to(
        str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
        '/swe_util/',
    )

    action = CmdRunAction(command='cat ~/.bashrc')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')

    action = CmdRunAction(command='source ~/.bashrc')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    if isinstance(obs, ErrorObservation):
        logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
    assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')

    action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to source /swe_util/instance_swe_entry.sh: {str(obs)}',
    )

    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
    )

    action = CmdRunAction(command='git reset --hard')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to git reset --hard: {str(obs)}')

    action = CmdRunAction(
        command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
    )
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')

    if metadata.details['mode'] == 'swt-ci':
        # set up repo
        setup_commands = []
        if instance['repo'] in MAP_REPO_TO_INSTALL:
            setup_commands.append(MAP_REPO_TO_INSTALL[instance['repo']])
        # Run pre-install set up if provided
        install = MAP_VERSION_TO_INSTALL.get(instance['repo'], {}).get(
            instance['version'], []
        )
        if 'pre_install' in install:
            for pre_install in install['pre_install']:
                setup_commands.append(pre_install)
        if 'install' in install:
            setup_commands.append(install['install'])
        for command in setup_commands:
            action = CmdRunAction(command=command)
            action.set_hard_timeout(600)
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    if 'multimodal' not in metadata.dataset.lower():
        # Only for non-multimodal datasets, we need to activate the testbed environment for Python
        # SWE-Bench multimodal datasets are not using the testbed environment
        action = CmdRunAction(command='which python')
        action.set_hard_timeout(600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert_and_raise(
            obs.exit_code == 0 and 'testbed' in obs.content,
            f'Expected to find python interpreter from testbed, but got: {str(obs)}',
        )

    logger.info('-' * 30)
    logger.info('END Runtime Initialization Fn')
    logger.info('-' * 30)


def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
) -> dict[str, Any]:
    """Complete the runtime for the agent.

    This function is called after the agent has finished running.
    If you need to do something in the sandbox to get the correctness metric after
    the agent has run, modify this function.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Completion Fn')
    logger.info('-' * 30)
    obs: CmdOutputObservation
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)

    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    if obs.exit_code == -1:
        # The previous command is still running
        # We need to kill previous command
        logger.info('The previous command is still running, trying to kill it...')
        action = CmdRunAction(command='C-c')
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

        # Then run the command again
        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
        action.set_hard_timeout(600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    if obs.exit_code == -1:
        # The previous command is still running
        # We need to kill previous command
        logger.info('The previous command is still running, trying to ctrl+z it...')
        action = CmdRunAction(command='C-z')
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

        # Then run the command again
        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
        action.set_hard_timeout(600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
    )

    action = CmdRunAction(command='git config --global core.pager ""')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to git config --global core.pager "": {str(obs)}',
    )

    # First check for any git repositories in subdirectories
    action = CmdRunAction(command='find . -type d -name .git -not -path "./.git"')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to find git repositories: {str(obs)}',
    )

    git_dirs = [p for p in obs.content.strip().split('\n') if p]
    if git_dirs:
        # Remove all .git directories in subdirectories
        for git_dir in git_dirs:
            action = CmdRunAction(command=f'rm -rf "{git_dir}"')
            action.set_hard_timeout(600)
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
            assert_and_raise(
                isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
                f'Failed to remove git directory {git_dir}: {str(obs)}',
            )

    # add all files
    action = CmdRunAction(command='git add -A')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to git add -A: {str(obs)}',
    )

    # Remove binary files from git staging
    action = CmdRunAction(command=remove_binary_files_from_git())
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to remove binary files: {str(obs)}',
    )

    n_retries = 0
    git_patch = None
    while n_retries < 5:
        action = CmdRunAction(
            command=f'git diff --no-color --cached {instance["base_commit"]} > patch.diff'
        )
        action.set_hard_timeout(max(300 + 100 * n_retries, 600))
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        n_retries += 1
        if isinstance(obs, CmdOutputObservation):
            if obs.exit_code == 0:
                # Read the patch file
                action = FileReadAction(path='patch.diff')
                action.set_hard_timeout(max(300 + 100 * n_retries, 600))
                logger.info(action, extra={'msg_type': 'ACTION'})
                obs = runtime.run_action(action)
                logger.info(obs, extra={'msg_type': 'OBSERVATION'})
                if isinstance(obs, FileReadObservation):
                    git_patch = obs.content
                    break
                elif isinstance(obs, ErrorObservation):
                    # Fall back to cat "patch.diff" to get the patch
                    assert 'File could not be decoded as utf-8' in obs.content
                    action = CmdRunAction(command='cat patch.diff')
                    action.set_hard_timeout(max(300 + 100 * n_retries, 600))
                    logger.info(action, extra={'msg_type': 'ACTION'})
                    obs = runtime.run_action(action)
                    assert isinstance(obs, CmdOutputObservation) and obs.exit_code == 0
                    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
                    git_patch = obs.content
                    break
                else:
                    assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
            else:
                logger.info('Failed to get git diff, retrying...')
                sleep_if_should_continue(10)
        elif isinstance(obs, ErrorObservation):
            logger.error(f'Error occurred: {obs.content}. Retrying...')
            sleep_if_should_continue(10)
        else:
            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')

    assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')

    # Remove binary diffs from the patch
    git_patch = remove_binary_diffs(git_patch)

    logger.info('-' * 30)
    logger.info('END Runtime Completion Fn')
    logger.info('-' * 30)
    return {'git_patch': git_patch}


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
    runtime_failure_count: int = 0,
) -> EvalOutput:
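    """Run the agent on a single instance and return its EvalOutput.

    Creates the sandbox runtime, runs the controller loop on the instruction from
    get_instruction, extracts the resulting git patch via complete_runtime, and
    packages the agent history, metrics, and any error.
    """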
    config = get_config(instance, metadata)

    # Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    # Increase resource_factor with increasing attempt_id
    if runtime_failure_count > 0:
        config.sandbox.remote_runtime_resource_factor = min(
            config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
            8,
        )
        logger.warning(
            f'This is attempt {runtime_failure_count + 1} for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
        )
        metadata = copy.deepcopy(metadata)
        metadata.details['runtime_failure_count'] = runtime_failure_count
        metadata.details['remote_runtime_resource_factor'] = (
            config.sandbox.remote_runtime_resource_factor
        )

    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)
    try:
        initialize_runtime(runtime, instance, metadata)
        message_action = get_instruction(instance, metadata)

        # Here's how you can run the agent (similar to the `main` function) and get the final task state
        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=message_action,
                runtime=runtime,
                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                    metadata.agent_class
                ],
            )
        )

        # if fatal error, throw EvalError to trigger re-run
        if is_fatal_evaluation_error(state.last_error):
            raise EvalException('Fatal error detected: ' + state.last_error)

        # ======= THIS IS SWE-Bench specific =======
        # Get git patch
        return_val = complete_runtime(runtime, instance)
        git_patch = return_val['git_patch']
        logger.info(
            f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
        )
    finally:
        runtime.close()
    # ==========================================

    # ======= Attempt to evaluate the agent's edits =======
    # we use eval_infer.sh to evaluate the agent's edits, not here
    # because the agent may alter the environment / testcases
    test_result = {
        'git_patch': git_patch,
    }

    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
    if state is None:
        raise ValueError('State should not be None.')

    # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
    histories = [event_to_dict(event) for event in state.history]
    metrics = get_metrics(state)

    # Save the output
    instruction = message_action.content
    if message_action.image_urls:
        instruction += (
            '\n\n<image_urls>' + '\n'.join(message_action.image_urls) + '</image_urls>'
        )
    output = EvalOutput(
        instance_id=instance.instance_id,
        instruction=instruction,
        instance=instance.to_dict(),  # SWE Bench specific
        test_result=test_result,
        metadata=metadata,
        history=histories,
        metrics=metrics,
        error=state.last_error if state and state.last_error else None,
    )
    return output


def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
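    """Filter instances by 'selected_ids'/'selected_repos' from an optional config.toml, or drop ids listed in the SKIP_IDS environment variable."""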
    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.toml')
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            data = toml.load(file)
            if 'selected_ids' in data:
                selected_ids = data['selected_ids']
                logger.info(
                    f'Filtering {len(selected_ids)} tasks from "selected_ids"...'
                )
                subset = dataset[dataset[filter_column].isin(selected_ids)]
                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
                return subset
            if 'selected_repos' in data:
                # repos for the swe-bench instances:
                # ['astropy/astropy', 'django/django', 'matplotlib/matplotlib', 'mwaskom/seaborn', 'pallets/flask', 'psf/requests', 'pydata/xarray', 'pylint-dev/pylint', 'pytest-dev/pytest', 'scikit-learn/scikit-learn', 'sphinx-doc/sphinx', 'sympy/sympy']
                selected_repos = data['selected_repos']
                if isinstance(selected_repos, str):
                    selected_repos = [selected_repos]
                assert isinstance(selected_repos, list)
                logger.info(
                    f'Filtering {selected_repos} tasks from "selected_repos"...'
                )
                subset = dataset[dataset['repo'].isin(selected_repos)]
                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
                return subset
    skip_ids = [sid for sid in os.environ.get('SKIP_IDS', '').split(',') if sid]
    if len(skip_ids) > 0:
        logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
        return dataset[~dataset[filter_column].isin(skip_ids)]
    return dataset


if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--dataset',
        type=str,
        default='princeton-nlp/SWE-bench',
        help='dataset to evaluate on, either full-test or lite-test',
    )
    parser.add_argument(
        '--split',
        type=str,
        default='test',
        help='split to evaluate on',
    )
    parser.add_argument(
        '--mode',
        type=str,
        default='swe',
        choices=['swe', 'swt', 'swt-ci'],
        help="mode to run the evaluation, either 'swe', 'swt', or 'swt-ci'",
    )
    args, _ = parser.parse_known_args()

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenHands's repo
    dataset = load_dataset(args.dataset, split=args.split)
    swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
    logger.info(
        f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
    )

    if 'SWE-Gym' in args.dataset:
        with open(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                'split',
                'swegym_verified_instances.json',
            ),
            'r',
        ) as f:
            swegym_verified_instances = json.load(f)
        swe_bench_tests = swe_bench_tests[
            swe_bench_tests['instance_id'].isin(swegym_verified_instances)
        ]
        logger.info(
            f'{len(swe_bench_tests)} tasks left after filtering for SWE-Gym verified instances'
        )

    llm_config = None
    if args.llm_config:
        llm_config = get_llm_config_arg(args.llm_config)
        llm_config.log_completions = True
        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
        llm_config.modify_params = False
    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    # Get condenser config from environment variable
    condenser_name = os.environ.get('EVAL_CONDENSER')
    if condenser_name:
        condenser_config = get_condenser_config_arg(condenser_name)
        if condenser_config is None:
            raise ValueError(
                f'Could not find Condenser config: EVAL_CONDENSER={condenser_name}'
            )
    else:
        # If no specific condenser config is provided via env var, default to NoOpCondenser
        condenser_config = NoOpCondenserConfig()
        logger.debug(
            'No Condenser config provided via EVAL_CONDENSER, using NoOpCondenser.'
        )

    details = {'mode': args.mode}
    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)

    dataset_description = (
        args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__')
    )
    metadata = make_metadata(
        llm_config,
        dataset_description,
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
        details=details,
        condenser_config=condenser_config,
    )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    print(f'### OUTPUT FILE: {output_file} ###')

    # Run evaluation in iterative mode:
    # If a rollout fails to output AgentFinishAction, we will try again until it succeeds
    # or ITERATIVE_EVAL_MODE_MAX_ATTEMPTS attempts (default 3) have been made.
    ITERATIVE_EVAL_MODE = (
        os.environ.get('ITERATIVE_EVAL_MODE', 'false').lower() == 'true'
    )
    ITERATIVE_EVAL_MODE_MAX_ATTEMPTS = int(
        os.environ.get('ITERATIVE_EVAL_MODE_MAX_ATTEMPTS', '3')
    )

    if not ITERATIVE_EVAL_MODE:
        # load the dataset
        instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)
        if len(instances) > 0 and not isinstance(
            instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str
        ):
            for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']:
                instances[col] = instances[col].apply(lambda x: str(x))

        run_evaluation(
            instances,
            metadata,
            output_file,
            args.eval_num_workers,
            process_instance,
            timeout_seconds=8 * 60 * 60,  # 8 hours PER instance should be more than enough
            max_retries=5,
        )
    else:
        critic = AgentFinishedCritic()

        def get_cur_output_file_path(attempt: int) -> str:
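            """Return the per-attempt output file path, e.g. output.critic_attempt_1.jsonl."""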
            return (
                f'{output_file.removesuffix(".jsonl")}.critic_attempt_{attempt}.jsonl'
            )

        eval_ids = None
        for attempt in range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1):
            cur_output_file = get_cur_output_file_path(attempt)
            logger.info(
                f'Running evaluation with critic {critic.__class__.__name__} for attempt {attempt} of {ITERATIVE_EVAL_MODE_MAX_ATTEMPTS}.'
            )

            # If the run was configured to be deterministic (temperature == 0), bump the
            # temperature to 0.1 on retry attempts so we hopefully get slightly different results
            if attempt > 1 and metadata.llm_config.temperature == 0:
                logger.info(
                    f'Detected temperature is 0 for (>1) attempt {attempt}. Setting temperature to 0.1...'
                )
                metadata.llm_config.temperature = 0.1

            # Load instances - on the first attempt, we evaluate all instances.
            # On subsequent attempts, we only evaluate the instances that failed the previous attempt, as determined by the critic.
            instances = prepare_dataset(
                swe_bench_tests, cur_output_file, args.eval_n_limit, eval_ids=eval_ids
            )
            if len(instances) > 0 and not isinstance(
                instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str
            ):
                for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']:
                    instances[col] = instances[col].apply(lambda x: str(x))

            # Run evaluation - but save the results to cur_output_file
            logger.info(
                f'Evaluating {len(instances)} instances for attempt {attempt}...'
            )
            run_evaluation(
                instances,
                metadata,
                cur_output_file,
                args.eval_num_workers,
                process_instance,
                timeout_seconds=8 * 60 * 60,  # 8 hours PER instance should be more than enough
                max_retries=5,
            )

            # When eval is done, we update eval_ids to the instances that failed the current attempt
            instances_failed = []
            logger.info(
                f'Use critic {critic.__class__.__name__} to check {len(instances)} instances for attempt {attempt}...'
            )
            with open(cur_output_file, 'r') as f:
                for line in f:
                    instance = json.loads(line)
                    try:
                        history = [
                            event_from_dict(event) for event in instance['history']
                        ]
                        critic_result = critic.evaluate(
                            history, instance['test_result'].get('git_patch', '')
                        )
                        if not critic_result.success:
                            instances_failed.append(instance['instance_id'])
                    except Exception as e:
                        logger.error(
                            f'Error loading history for instance {instance["instance_id"]}: {e}'
                        )
                        instances_failed.append(instance['instance_id'])
            logger.info(
                f'{len(instances_failed)} instances failed the current attempt {attempt}: {instances_failed}'
            )
            eval_ids = instances_failed

            # If no instances failed, we break
            if len(instances_failed) == 0:
                break

        # Then we should aggregate the results from all attempts into the original output file
        # and remove the intermediate files
        logger.info(
            'Aggregating results from all attempts into the original output file...'
        )
        fout = open(output_file, 'w')
        added_instance_ids = set()
        for attempt in reversed(range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1)):
            cur_output_file = get_cur_output_file_path(attempt)
            if not os.path.exists(cur_output_file):
                logger.warning(
                    f'Intermediate output file {cur_output_file} does not exist. Skipping...'
                )
                continue
            with open(cur_output_file, 'r') as f:
                for line in f:
                    instance = json.loads(line)
                    # Also make sure git_patch is not empty - otherwise we fall back to previous attempt (empty patch is worse than anything else)
                    if (
                        instance['instance_id'] not in added_instance_ids
                        and instance['test_result'].get('git_patch', '').strip()
                    ):
                        fout.write(line)
                        added_instance_ids.add(instance['instance_id'])
            logger.info(
                f'Aggregated instances from {cur_output_file}. Total instances added so far: {len(added_instance_ids)}'
            )
        fout.close()
        logger.info(
            f'Done! Total {len(added_instance_ids)} instances added to {output_file}'
        )