import asyncio
import json
import os
from collections import Counter
from typing import Any

import pandas as pd
from commit0.harness.constants import SPLIT
from datasets import load_dataset

import openhands.agenthub
from evaluation.utils.shared import (
    EvalException,
    EvalMetadata,
    EvalOutput,
    assert_and_raise,
    codeact_user_response,
    get_default_sandbox_config_for_eval,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
    update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
    AgentConfig,
    OpenHandsConfig,
    get_llm_config_arg,
    get_parser,
)
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation, ErrorObservation
from openhands.events.serialization.event import event_to_dict
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync
from openhands.utils.shutdown_listener import sleep_if_should_continue

USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
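
# Fake-user-response functions auto-reply on behalf of the user when the agent
# asks for input, so evaluation runs can proceed without a human in the loop.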
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
    'CodeActCommit0Agent': codeact_user_response,
}


def _get_commit0_workspace_dir_name(instance: pd.Series) -> str:
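    """Derive the workspace directory name from the repo field, e.g. 'owner/name' -> 'name'."""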
    return instance['repo'].split('/')[1]


def get_instruction(instance: pd.Series, metadata: EvalMetadata):
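    """Build the task instruction shown to the agent for a single Commit0 instance."""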
    workspace_dir_name = _get_commit0_workspace_dir_name(instance)

    # Prepare instruction
    test_cmd = instance['test']['test_cmd']
    test_dir = instance['test']['test_dir']

    # Instruction based on Anthropic's official trajectory
    # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs
    instruction = (
        '<uploaded_files>\n'
        f'/workspace/{workspace_dir_name}\n'
        '</uploaded_files>\n'
        f"I've uploaded a python code repository in the directory {workspace_dir_name}. Here is your task:\n\n"
        ' You need to complete the implementations for all functions (i.e., those with pass\n'
        ' statements) and pass the unit tests.\n\n'
        ' Do not change the names of existing functions or classes, as they may be referenced\n'
        ' from other code like unit tests, etc.\n\n'
        ' When you generate code, you must maintain the original formatting of the function\n'
        ' stubs (such as whitespaces), otherwise we will not be able to search/replace blocks\n'
        ' for code modifications, and therefore you will receive a score of 0 for your generated\n'
        ' code.'
        '\n\n'
        'Here is the command to run the unit tests:\n'
        '<test_command>\n'
        f'{test_cmd} {test_dir}\n'
        '</test_command>\n\n'
        'Make a local git commit for all code changes at each agent step. If there is no change in the current step, do not make a commit.'
    )

    if RUN_WITH_BROWSING:
        instruction += (
            '<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
        )
    return instruction


# TODO: migrate all swe-bench docker to ghcr.io/openhands
DOCKER_IMAGE_PREFIX = os.environ.get(
    'EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/wentingzhao/'
)
logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')


def get_instance_docker_image(repo_name: str) -> str:
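    """Build the per-repo container image name, e.g. docker.io/wentingzhao/<repo>:v0."""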
    return (DOCKER_IMAGE_PREFIX.rstrip('/') + '/' + repo_name).lower() + ':v0'


def get_config(
    instance: pd.Series,
    metadata: EvalMetadata,
) -> OpenHandsConfig:
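    """Assemble the OpenHands app, sandbox, LLM, and agent configuration for one instance."""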
    repo_name = instance['repo'].split('/')[1]
    base_container_image = get_instance_docker_image(repo_name)
    logger.info(
        f'Using instance container image: {base_container_image}. '
        f'Please make sure this image exists. '
        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
    )

    sandbox_config = get_default_sandbox_config_for_eval()
    sandbox_config.base_container_image = base_container_image

    config = OpenHandsConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        max_iterations=metadata.max_iterations,
        runtime=os.environ.get('RUNTIME', 'docker'),
        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
    config.set_llm_config(
        update_llm_config_for_completions_logging(
            metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
        )
    )
    agent_config = AgentConfig(
        enable_jupyter=False,
        enable_browsing=RUN_WITH_BROWSING,
        enable_llm_editor=False,
    )
    config.set_agent_config(agent_config)
    return config


def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required
):
    """Initialize the runtime for the agent.

    This function is called before the runtime is used to run the agent.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Initialization Fn')
    logger.info('-' * 30)
    workspace_dir_name = _get_commit0_workspace_dir_name(instance)
    obs: CmdOutputObservation

    action = CmdRunAction(
        command=f'git clone -b commit0_combined https://github.com/{instance["repo"]}.git'
    )
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to git clone -b commit0_combined https://github.com/{instance["repo"]}.git: {str(obs)}',
    )

    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
    )

    action = CmdRunAction(command='git checkout -b openhands')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0, f'Failed to git checkout new branch openhands: {str(obs)}'
    )

    # Install commit0
    action = CmdRunAction(command='/root/.cargo/bin/uv pip install commit0')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to install commit0: {str(obs)}',
    )

    logger.info('-' * 30)
    logger.info('END Runtime Initialization Fn')
    logger.info('-' * 30)


def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
) -> dict[str, Any]:
    """Complete the runtime for the agent.

    This function is called after the agent has finished running.
    If you need to do something in the sandbox to get the correctness metric after
    the agent has run, modify this function.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Completion Fn')
    logger.info('-' * 30)
    obs: CmdOutputObservation
    workspace_dir_name = _get_commit0_workspace_dir_name(instance)

    action = CmdRunAction(command='git add .')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to git add .: {str(obs)}',
    )

    action = CmdRunAction(command='git commit -m "openhands edits"')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation)
        and (obs.exit_code == 0 or obs.exit_code == 1),
        f'Failed to git commit -m "openhands edits": {str(obs)}',
    )

    # Generate diff patch compared to base commit, excluding spec.pdf.bz2 files
    n_retries = 0
    git_patch = None
    while n_retries < 5:
        action = CmdRunAction(
            command=f"git diff {instance['base_commit']} HEAD -- . ':(exclude)spec.pdf.bz2'"
        )
        action.set_hard_timeout(600 + 100 * n_retries)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        n_retries += 1
        if isinstance(obs, CmdOutputObservation):
            if obs.exit_code == 0:
                git_patch = obs.content.strip()
                break
            else:
                logger.info('Failed to get git diff, retrying...')
                sleep_if_should_continue(10)
        elif isinstance(obs, ErrorObservation):
            logger.error(f'Error occurred: {obs.content}. Retrying...')
            sleep_if_should_continue(10)
        else:
            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')

    assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')
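
    # Run the unit tests. The flags below assume test_cmd is a pytest invocation with the
    # pytest-json-report plugin installed, so a machine-readable report.json is produced.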
    test_dir = instance['test']['test_dir']
    action = CmdRunAction(
        command=f'{instance["test"]["test_cmd"]} --json-report --json-report-file=report.json --continue-on-collection-errors {test_dir} > test_output.txt 2>&1'
    )
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation),
        f'Failed to run test command: {str(obs)}',
    )

    # Save pytest exit code (read right after the test command, before any other command clobbers $?)
    action = CmdRunAction(command='echo $?')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
        f'Failed to save pytest exit code: {str(obs)}',
    )
    pytest_exit_code = obs.content.strip()
    # logger.info(f'Pytest exit code: {pytest_exit_code}')

    # Read test output
    action = CmdRunAction(command='cat test_output.txt')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation),
        f'Failed to read test output: {str(obs)}',
    )
    test_output = obs.content.strip()
    # logger.info(f'Test output: {test_output}')

    # Get test IDs from instance
    repo_name = instance['repo'].split('/')[1]
    repo_name = repo_name.replace('.', '-')
    action = CmdRunAction(command=f'commit0 get-tests {repo_name}')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    test_ids = obs.content.strip().split('\n')

    # Read the test report
    action = CmdRunAction(command='cat report.json')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        isinstance(obs, CmdOutputObservation),
        f'Failed to read test report: {str(obs)}',
    )
    json_report = obs.content.strip()
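
    # In a pytest-json-report file, each entry in report['tests'] has a 'nodeid' plus per-phase
    # records; the 'call' record (when present) carries the test's 'outcome' and 'duration'.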
    try:
        report = json.loads(json_report)
        tests = {x['nodeid']: x['call'] for x in report['tests'] if 'call' in x}

        # Calculate test statistics
        status = []
        runtimes = []
        no_runs = 0
        for test_id in test_ids:
            if test_id in tests and tests[test_id] is not None:
                status.append(tests[test_id]['outcome'])
                runtimes.append(tests[test_id]['duration'])
                no_runs += 1
            else:
                status.append('failed')
                runtimes.append(0)

        status_counts = Counter(status)
        total_runtime = sum(runtimes) if no_runs > 0 else 0
        num_passed = status_counts.get('passed', 0) + status_counts.get('xfail', 0)
        passed_ratio = num_passed / len(status) if status else 0

        eval_result = {
            'name': workspace_dir_name,
            'sum': total_runtime,
            'passed': passed_ratio,
            'num_passed': num_passed,
            'num_tests': len(test_ids),
        }
    except json.JSONDecodeError:
        logger.error('Failed to parse test report JSON')
        eval_result = {
            'name': workspace_dir_name,
            'sum': 0,
            'passed': 0,
            'num_passed': 0,
            'num_tests': len(test_ids),
        }

    # Copy the workspace archive out of the sandbox and persist it next to this script
    temp_zip = runtime.copy_from(f'/workspace/{workspace_dir_name}')
    commit0_dir = os.path.dirname(__file__)
    persistent_zip = os.path.join(commit0_dir, f'{workspace_dir_name}.zip')
    with open(temp_zip, 'rb') as src, open(persistent_zip, 'wb') as dst:
        dst.write(src.read())
    zip_file = persistent_zip

    return {
        'eval_result': eval_result,
        'git_patch': git_patch,
        'test_output': test_output,
        'pytest_exit_code': pytest_exit_code,
        'zip_file': zip_file,
    }


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
) -> EvalOutput:
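    """Run the agent on a single Commit0 instance and collect its evaluation output."""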
    config = get_config(instance, metadata)

    # Set up the logger properly, so you can use multiprocessing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)

    try:
        initialize_runtime(runtime, instance)

        instruction = get_instruction(instance, metadata)

        # Here's how you can run the agent (similar to the `main` function) and get the final task state
        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=MessageAction(content=instruction),
                runtime=runtime,
                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                    metadata.agent_class
                ],
            )
        )

        # if fatal error, throw EvalError to trigger re-run
        if (
            state.last_error
            and 'fatal error during agent execution' in state.last_error
            and 'stuck in a loop' not in state.last_error
        ):
            raise EvalException('Fatal error detected: ' + state.last_error)

        # ======= THIS IS Commit0 specific =======
        # Get git patch
        return_val = complete_runtime(runtime, instance)
        eval_result = return_val['eval_result']
        git_patch = return_val['git_patch']
        test_output = return_val['test_output']
        pytest_exit_code = return_val['pytest_exit_code']
        zip_file = return_val['zip_file']

        repo_name = instance['repo'].split('/')[1]
        zip_dest = os.path.join(
            metadata.eval_output_dir, 'repos', repo_name, f'{repo_name}.zip'
        )
        patch_file = os.path.join(
            metadata.eval_output_dir, 'repos', repo_name, f'{repo_name}_patch.diff'
        )
        test_output_file = os.path.join(
            metadata.eval_output_dir, 'repos', repo_name, f'{repo_name}_test_output.txt'
        )
        pytest_exit_code_file = os.path.join(
            metadata.eval_output_dir,
            'repos',
            repo_name,
            f'{repo_name}_pytest_exit_code.txt',
        )
        os.makedirs(os.path.dirname(zip_dest), exist_ok=True)
        os.rename(zip_file, zip_dest)

        write_targets = [
            (patch_file, git_patch),
            (test_output_file, test_output),
            (pytest_exit_code_file, pytest_exit_code),
        ]
        for write_target in write_targets:
            with open(write_target[0], 'w') as f:
                f.write(write_target[1])

        logger.info(
            f'Got evaluation result for repo {instance.instance_id}:\n--------\n{eval_result}\n--------'
        )
    finally:
        runtime.close()
    # ==========================================

    # ======= Attempt to evaluate the agent's edits =======
    # We use eval_infer.sh to evaluate the agent's edits, not here,
    # because the agent may alter the environment / testcases.
    test_result = {
        'eval_result': eval_result,
    }

    # If you are working on a simpler benchmark that only evaluates the final model output (e.g., in a MessageAction),
    # you can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
    if state is None:
        raise ValueError('State should not be None.')

    # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
    histories = [event_to_dict(event) for event in state.history]
    metrics = state.metrics.get() if state.metrics else None

    # Save the output
    output = EvalOutput(
        instance_id=instance.instance_id,
        instruction=instruction,
        instance=instance.to_dict(),
        test_result=test_result,
        metadata=metadata,
        history=histories,
        metrics=metrics,
        error=state.last_error if state and state.last_error else None,
    )
    return output


def commit0_setup(dataset: pd.DataFrame, repo_split: str) -> pd.DataFrame:
    """Set up the Commit0 dataset based on the split type.

    Args:
        dataset: Full Commit0 dataset
        repo_split: Split type ('all', 'lite', or a specific repo name)

    Returns:
        Filtered dataset based on the split type
    """
    filtered_dataset = pd.concat(
        [
            dataset[dataset['repo'].str.split('/').str[1] == repo]
            for repo in SPLIT.get(repo_split, [])
        ]
    )

    # Drop setup column if it exists
    if 'setup' in filtered_dataset.columns:
        filtered_dataset = filtered_dataset.drop('setup', axis=1)

    # Use the repository name (without the owner) as the instance_id
    filtered_dataset['instance_id'] = filtered_dataset['repo'].str.split('/').str[1]

    return filtered_dataset


if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--dataset',
        type=str,
        default='wentingzhao/commit0_combined',
        help='dataset to evaluate on; only the test split exists for this HF dataset',
    )
    parser.add_argument(
        '--split',
        type=str,
        default='test',
        help='this is the HF dataset split',
    )
    parser.add_argument(
        '--repo-split',
        type=str,
        default='lite',
        help='all, lite, or a specific repo name',
    )
    args, _ = parser.parse_known_args()

    # NOTE: It is preferable to load datasets from Hugging Face datasets and perform post-processing
    # so we don't need to manage file uploading to OpenHands's repo
    dataset = load_dataset(args.dataset, split=args.split)
    commit0_datasets = commit0_setup(dataset.to_pandas(), args.repo_split)

    logger.info(f'Loaded dataset {args.dataset} with repo split {args.repo_split}')

    llm_config = None
    if args.llm_config:
        llm_config = get_llm_config_arg(args.llm_config)
        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
        llm_config.modify_params = False
        llm_config.log_completions = True

    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    details = {}
    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)

    dataset_description = (
        args.dataset.replace('/', '__') + '-' + args.repo_split.replace('/', '__')
    )
    metadata = make_metadata(
        llm_config,
        dataset_description,
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
        details=details,
    )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    instances = prepare_dataset(commit0_datasets, output_file, args.eval_n_limit)

    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        timeout_seconds=120 * 60,  # 2 hours per instance should be more than enough
    )
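
# Example invocation (a sketch; flag spellings are inferred from the `args` attributes used
# above, and the LLM config name is hypothetical -- adjust to your local setup):
#
#   python run_infer.py \
#       --agent-cls CodeActAgent \
#       --llm-config eval_gpt4 \
#       --repo-split lite \
#       --max-iterations 100 \
#       --eval-num-workers 2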