import asyncio
import json
import os
import tempfile
from typing import Any

import pandas as pd
import toml
from datasets import load_dataset

import openhands.agenthub
from evaluation.benchmarks.swe_bench.resource.mapping import (
    get_instance_resource_factor,
)
from evaluation.utils.shared import (
    EvalException,
    EvalMetadata,
    EvalOutput,
    assert_and_raise,
    codeact_user_response,
    get_default_sandbox_config_for_eval,
    get_metrics,
    is_fatal_evaluation_error,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
    update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
    AgentConfig,
    OpenHandsConfig,
    get_llm_config_arg,
    get_parser,
)
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation, ErrorObservation
from openhands.events.serialization.event import event_to_dict
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync
from openhands.utils.shutdown_listener import sleep_if_should_continue

USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
INDEX_BASE_DIR = os.environ.get('INDEX_BASE_DIR', '')
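# NOTE (assumption, inferred from how initialize_runtime uses it below): when
# INDEX_BASE_DIR is set, it is expected to contain pre-built LocAgent indexes laid
# out roughly as
#   $INDEX_BASE_DIR/graph_index_v2.3/<instance_id>.pkl
#   $INDEX_BASE_DIR/BM25_index/<instance_id>/
# which are copied into the instance workspace before the agent starts.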

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
    'LocAgent': codeact_user_response,
}


def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
    return f'{instance.repo}__{instance.version}'.replace('/', '__')
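# Illustration (hypothetical values): an instance with repo='astropy/astropy' and
# version='5.1' yields the workspace directory name 'astropy__astropy__5.1', i.e. the
# repository is expected to live at /workspace/astropy__astropy__5.1 inside the sandbox.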


def get_instruction(instance: pd.Series, metadata: EvalMetadata):
    _get_swebench_workspace_dir_name(instance)
    instruction = f"""
Consider the following issue description:

<issue_description>
{instance.problem_statement}
</issue_description>

Your objective is to localize the specific files, classes or functions, and lines of code that need modification or contain key information to resolve the issue.

Follow these steps to localize the issue:

## Step 1: Categorize and Extract Key Problem Information
- Classify the problem statement into the following categories:
  Problem description, error trace, code to reproduce the bug, and additional context.
- Identify modules in the "{instance.instance_id.split('_')[0]}" package mentioned in each category.
- Use extracted keywords and line numbers to search for relevant code references for additional context.

## Step 2: Locate Referenced Modules
- Accurately determine specific modules
- Explore the repo to familiarize yourself with its structure.
- Analyze the described execution flow to identify specific modules or components being referenced.
- Pay special attention to distinguishing between modules with similar names using context and the described execution flow.
- Output Format for collected relevant modules:
  - Use the format: 'file_path:QualifiedName'
  - E.g., for a function `calculate_sum` in the `MathUtils` class located in `src/helpers/math_helpers.py`, represent it as: 'src/helpers/math_helpers.py:MathUtils.calculate_sum'.

## Step 3: Analyze and Reproduce the Problem
- Clarify the Purpose of the Issue
  - If expanding capabilities: Identify where and how to incorporate new behavior, fields, or modules.
  - If addressing unexpected behavior: Focus on localizing modules containing potential bugs.
- Reconstruct the execution flow
  - Identify main entry points triggering the issue.
  - Trace function calls, class interactions, and sequences of events.
  - Identify potential breakpoints causing the issue.
  Important: Keep the reconstructed flow focused on the problem, avoiding irrelevant details.

## Step 4: Locate Areas for Modification
- Locate specific files, functions, or lines of code requiring changes or containing critical information for resolving the issue.
- Consider upstream and downstream dependencies that may affect or be affected by the issue.
- If applicable, identify where to introduce new fields, functions, or variables.
- Think thoroughly: list multiple potential solutions and consider edge cases that could impact the resolution.

## Output Format for Final Results:
Your final output should list the locations requiring modification, wrapped with triple backticks ```
Each location should include the file path, class name (if applicable), function name, or line numbers, ordered by importance.
Your answer should ideally include about 5 files.

### Examples:
```
full_path1/file1.py
line: 10
class: MyClass1
function: my_function1

full_path2/file2.py
line: 76
function: MyClass2.my_function2

full_path3/file3.py
line: 24
line: 156
function: my_function3
```

Return just the location(s)

Note: Your thinking should be thorough and so it's fine if it's very long.
"""

    instruction += (
        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
        "Don't include any lambda functions!\n"
        'You should NOT modify any files!\n'
    )

    if RUN_WITH_BROWSING:
        instruction += """
<IMPORTANT!>
You SHOULD NEVER attempt to browse the web.
</IMPORTANT!>
"""
    return instruction


# TODO: migrate all swe-bench docker to ghcr.io/openhands
DEFAULT_DOCKER_IMAGE_PREFIX = os.environ.get(
    'EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/'
)
logger.info(f'Default docker image prefix: {DEFAULT_DOCKER_IMAGE_PREFIX}')


def get_instance_docker_image(instance_id: str, official_image: bool = False) -> str:
    if official_image:
        # Official SWE-Bench image
        # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
        docker_image_prefix = 'docker.io/swebench/'
        repo, name = instance_id.split('__')
        image_name = f'sweb.eval.x86_64.{repo}_1776_{name}:latest'
        logger.warning(f'Using official SWE-Bench image: {image_name}')
    else:
        # OpenHands version of the image
        docker_image_prefix = DEFAULT_DOCKER_IMAGE_PREFIX
        image_name = 'sweb.eval.x86_64.' + instance_id
        image_name = image_name.replace(
            '__', '_s_'
        )  # to comply with docker image naming convention
    return (docker_image_prefix.rstrip('/') + '/' + image_name).lower()
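# Illustration (hypothetical instance id 'django__django-11333'): the official path
# yields 'docker.io/swebench/sweb.eval.x86_64.django_1776_django-11333:latest', while
# the OpenHands path yields 'docker.io/xingyaoww/sweb.eval.x86_64.django_s_django-11333'
# (both lower-cased by the return statement above).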


def get_config(
    instance: pd.Series,
    metadata: EvalMetadata,
) -> OpenHandsConfig:
    # We use a different instance image for each instance of swe-bench eval
    use_official_image = bool(
        'verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower()
    )
    base_container_image = get_instance_docker_image(
        instance['instance_id'], use_official_image
    )
    logger.info(
        f'Using instance container image: {base_container_image}. '
        f'Please make sure this image exists. '
        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
    )
    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = base_container_image
    sandbox_config.enable_auto_lint = True
    sandbox_config.use_host_network = False
    # Add platform to the sandbox config to solve issue 4401
    sandbox_config.platform = 'linux/amd64'
    sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
        dataset_name=metadata.dataset,
        instance_id=instance['instance_id'],
    )
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
    sandbox_config.runtime_startup_env_vars = {
        'REPO_PATH': f'/workspace/{workspace_dir_name}/',
    }

    config = OpenHandsConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        max_iterations=metadata.max_iterations,
        runtime=os.environ.get('RUNTIME', 'docker'),
        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
    config.set_llm_config(
        update_llm_config_for_completions_logging(
            metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
        )
    )
    agent_config = AgentConfig(
        enable_jupyter=False,
        enable_browsing=RUN_WITH_BROWSING,
        enable_llm_editor=False,
        condenser=metadata.condenser_config,
        enable_prompt_extensions=False,
    )
    config.set_agent_config(agent_config)
    return config


def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required
):
    """Initialize the runtime for the agent.

    This function is called before the runtime is used to run the agent.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Initialization Fn')
    logger.info('-' * 30)
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)
    obs: CmdOutputObservation

    # Set instance id
    action = CmdRunAction(
        command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc"""
    )
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0, f'Failed to export SWE_INSTANCE_ID: {str(obs)}'
    )

    action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')

    # inject the init script
    script_dir = os.path.dirname(__file__)

    # inject the instance info
    action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to create /swe_util/eval_data/instances: {str(obs)}',
    )

    swe_instance_json_name = 'swe-bench-instance.json'
    with tempfile.TemporaryDirectory() as temp_dir:
        # Construct the full path for the desired file name within the temporary directory
        temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
        # Write to the file with the desired name within the temporary directory
        with open(temp_file_path, 'w') as f:
            if not isinstance(instance, dict):
                json.dump([instance.to_dict()], f)
            else:
                json.dump([instance], f)

        # Copy the file to the desired location
        runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')

        # inject the instance swe entry
        runtime.copy_to(
            str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
            '/swe_util/',
        )

    action = CmdRunAction(command='cat ~/.bashrc')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')

    action = CmdRunAction(command='source ~/.bashrc')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    if isinstance(obs, ErrorObservation):
        logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
    assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')

    action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to source /swe_util/instance_swe_entry.sh: {str(obs)}',
    )

    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
    )

    action = CmdRunAction(command='git reset --hard')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to git reset --hard: {str(obs)}')

    action = CmdRunAction(
        command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
    )
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')

    # Copy the processed indexes if available
    # (-p also creates the parent _index_data directory if it does not exist yet)
    action = CmdRunAction(command='mkdir -p _index_data/graph_index_v2.3')
    obs = runtime.run_action(action)

    # Check if an existing graph index file is available
    graph_index_file_path = os.path.join(
        INDEX_BASE_DIR, 'graph_index_v2.3', f'{instance["instance_id"]}.pkl'
    )
    if INDEX_BASE_DIR and os.path.exists(graph_index_file_path):
        logger.info(
            f'Copying graph index from {graph_index_file_path} to /workspace/{workspace_dir_name}/_index_data/graph_index_v2.3'
        )
        runtime.copy_to(
            graph_index_file_path,
            f'/workspace/{workspace_dir_name}/_index_data/graph_index_v2.3',
        )
        action = CmdRunAction(
            command=f'mv _index_data/graph_index_v2.3/{instance["instance_id"]}.pkl _index_data/graph_index_v2.3/code_graph.pkl'
        )
        obs = runtime.run_action(action)

        bm25_index_dir = os.path.join(
            INDEX_BASE_DIR, 'BM25_index', instance['instance_id']
        )
        runtime.copy_to(
            bm25_index_dir,
            f'/workspace/{workspace_dir_name}/_index_data',
            recursive=True,
        )
        action = CmdRunAction(
            command=f'mv _index_data/{instance["instance_id"]} _index_data/bm25_index'
        )
        action.set_hard_timeout(600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert_and_raise(obs.exit_code == 0, f'Failed to mv file: {str(obs)}')

    action = CmdRunAction(command='which python')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0 and 'testbed' in obs.content,
        f'Expected to find python interpreter from testbed, but got: {str(obs)}',
    )

    logger.info('-' * 30)
    logger.info('END Runtime Initialization Fn')
    logger.info('-' * 30)


def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
) -> dict[str, Any]:
    """Complete the runtime for the agent.

    This function is called after the agent has finished running.
    If you need to do something in the sandbox to get the correctness metric after
    the agent has run, modify this function.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Completion Fn')
    logger.info('-' * 30)
    obs: CmdOutputObservation
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)

    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    if obs.exit_code == -1:
        # The previous command is still running
        # We need to kill previous command
        logger.info('The previous command is still running, trying to kill it...')
        action = CmdRunAction(command='C-c')
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

        # Then run the command again
        action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
        action.set_hard_timeout(600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
    )

    action = CmdRunAction(command='git config --global core.pager ""')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to git config --global core.pager "": {str(obs)}',
    )

    # First check for any git repositories in subdirectories
    action = CmdRunAction(command='find . -type d -name .git -not -path "./.git"')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to find git repositories: {str(obs)}',
    )

    git_dirs = [p for p in obs.content.strip().split('\n') if p]
    if git_dirs:
        # Remove all .git directories in subdirectories
        for git_dir in git_dirs:
            action = CmdRunAction(command=f'rm -rf "{git_dir}"')
            action.set_hard_timeout(600)
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
            assert_and_raise(
                isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
                f'Failed to remove git directory {git_dir}: {str(obs)}',
            )

    # add all files
    action = CmdRunAction(command='git add -A')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to git add -A: {str(obs)}',
    )

    n_retries = 0
    git_patch = None
    while n_retries < 5:
        action = CmdRunAction(
            command=f'git diff --no-color --cached {instance["base_commit"]}'
        )
        action.set_hard_timeout(max(300 + 100 * n_retries, 600))
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        n_retries += 1
        if isinstance(obs, CmdOutputObservation):
            if obs.exit_code == 0:
                git_patch = obs.content.strip()
                break
            else:
                logger.info('Failed to get git diff, retrying...')
                sleep_if_should_continue(10)
        elif isinstance(obs, ErrorObservation):
            logger.error(f'Error occurred: {obs.content}. Retrying...')
            sleep_if_should_continue(10)
        else:
            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')

    assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')

    logger.info('-' * 30)
    logger.info('END Runtime Completion Fn')
    logger.info('-' * 30)
    return {'git_patch': git_patch}


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
    runtime_failure_count: int = 0,
) -> EvalOutput:
    config = get_config(instance, metadata)

    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    # Increase resource_factor with increasing attempt_id
    if runtime_failure_count > 0:
        config.sandbox.remote_runtime_resource_factor = min(
            config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
            8,
        )
        logger.warning(
            f'This is attempt {runtime_failure_count + 1} for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
        )

    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)

    try:
        initialize_runtime(runtime, instance)

        instruction = get_instruction(instance, metadata)

        # Here's how you can run the agent (similar to the `main` function) and get the final task state
        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=MessageAction(content=instruction),
                runtime=runtime,
                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                    metadata.agent_class
                ],
            )
        )

        # if fatal error, throw EvalError to trigger re-run
        if is_fatal_evaluation_error(state.last_error):
            raise EvalException('Fatal error detected: ' + state.last_error)

        # ======= THIS IS SWE-Bench specific =======
        # Get git patch
        return_val = complete_runtime(runtime, instance)
        git_patch = return_val['git_patch']
        logger.info(
            f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
        )
    finally:
        runtime.close()
    # ==========================================

    # ======= Attempt to evaluate the agent's edits =======
    # we use eval_infer.sh to evaluate the agent's edits, not here
    # because the agent may alter the environment / testcases
    test_result = {
        'git_patch': git_patch,
    }

    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
    if state is None:
        raise ValueError('State should not be None.')

    # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
    histories = [event_to_dict(event) for event in state.history]
    metrics = get_metrics(state)

    # Save the output
    output = EvalOutput(
        instance_id=instance.instance_id,
        instruction=instruction,
        instance=instance.to_dict(),  # SWE Bench specific
        test_result=test_result,
        metadata=metadata,
        history=histories,
        metrics=metrics,
        error=state.last_error if state and state.last_error else None,
    )
    return output


def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.toml')
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            data = toml.load(file)
            if 'selected_ids' in data:
                selected_ids = data['selected_ids']
                logger.info(
                    f'Filtering {len(selected_ids)} tasks from "selected_ids"...'
                )
                subset = dataset[dataset[filter_column].isin(selected_ids)]
                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
                return subset
    # Drop empty entries so an unset SKIP_IDS does not trigger a spurious filter pass
    skip_ids = [sid for sid in os.environ.get('SKIP_IDS', '').split(',') if sid]
    if len(skip_ids) > 0:
        logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
        return dataset[~dataset[filter_column].isin(skip_ids)]
    return dataset


# A list of instances that are known to be tricky to infer
# (will cause runtime failure even with resource factor = 8)
SWEGYM_EXCLUDE_IDS = [
    'dask__dask-10422',
    'pandas-dev__pandas-50548',
    'pandas-dev__pandas-53672',
    'pandas-dev__pandas-54174',
    'pandas-dev__pandas-55518',
    'pandas-dev__pandas-58383',
    'pydata__xarray-6721',
    'pytest-dev__pytest-10081',
    'pytest-dev__pytest-7236',
]
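# Example invocation (illustrative only; the eval flags come from the shared OpenHands
# evaluation parser and the LLM config section name is your own, so adjust both to
# your setup and to the actual path of this script):
#
#   INDEX_BASE_DIR=/path/to/locagent_indexes \
#   poetry run python <path/to/this/run_infer.py> \
#       --dataset princeton-nlp/SWE-bench_Lite --split test \
#       --agent-cls LocAgent --llm-config llm.eval \
#       --max-iterations 30 --eval-num-workers 4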


if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--dataset',
        type=str,
        default='princeton-nlp/SWE-bench',
        help='data set to evaluate on, either full-test or lite-test',
    )
    parser.add_argument(
        '--split',
        type=str,
        default='test',
        help='split to evaluate on',
    )
    args, _ = parser.parse_known_args()

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenHands's repo
    dataset = load_dataset(args.dataset, split=args.split)
    swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
    logger.info(
        f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
    )
    if 'SWE-Gym' in args.dataset:
        swe_bench_tests = swe_bench_tests[
            ~swe_bench_tests['instance_id'].isin(SWEGYM_EXCLUDE_IDS)
        ]
        logger.info(
            f'{len(swe_bench_tests)} tasks left after excluding SWE-Gym excluded tasks'
        )

    llm_config = None
    if args.llm_config:
        llm_config = get_llm_config_arg(args.llm_config)
        llm_config.log_completions = True
        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
        llm_config.modify_params = False

    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    details = {}
    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)

    dataset_description = (
        args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__')
    )
    metadata = make_metadata(
        llm_config,
        dataset_description,
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
        details=details,
    )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    print(f'### OUTPUT FILE: {output_file} ###')

    instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)

    if len(instances) > 0 and not isinstance(
        instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str
    ):
        for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']:
            instances[col] = instances[col].apply(lambda x: str(x))

    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        timeout_seconds=8 * 60 * 60,  # 8 hours PER instance should be more than enough
        max_retries=5,
    )