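"""Evaluation harness for TestGenEval.

Loads model-generated test suites from a predictions .jsonl file, runs each suite
inside the corresponding per-instance Docker image, and reports coverage, mutation
testing, and lexical similarity metrics against the gold test suite.
"""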
import os
import tempfile
import time
from functools import partial

import pandas as pd
from report_utils import (
    check_coverage,
    check_mutation,
    count_methods,
    get_lines_of_code,
)

from evaluation.benchmarks.testgeneval.compute_readability import compute_readability
from evaluation.benchmarks.testgeneval.constants import (
    COVERAGE_PREFIX,
    MUTATION_BUFFER,
    MUTATION_TEMPLATE,
    MUTATION_TIMEOUT,
    TESTS_SUFFIX,
)
from evaluation.benchmarks.testgeneval.metrics import (
    bleu,
    edit_sim,
    exact_match,
    rouge_l,
)
from evaluation.benchmarks.testgeneval.pygments_utils import tokenize_code
from evaluation.benchmarks.testgeneval.run_infer import get_instance_docker_image
from evaluation.benchmarks.testgeneval.test_filter import filter_tests
from evaluation.benchmarks.testgeneval.test_spec import (
    TestGenEvalInstance,
    TestSpec,
    make_test_spec,
)
from evaluation.benchmarks.testgeneval.utils import load_testgeneval_dataset
from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
)
from openhands.core.config import OpenHandsConfig, SandboxConfig, get_parser
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime
from openhands.events.action import CmdRunAction
from openhands.events.observation import CmdOutputObservation
from openhands.utils.async_utils import call_async_from_sync

DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/kdjain/')
logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')


def get_config(instance: pd.Series) -> OpenHandsConfig:
    base_container_image = get_instance_docker_image(instance['instance_id_swebench'])
    assert base_container_image, (
        f'Invalid container image for instance {instance["instance_id_swebench"]}.'
    )
    logger.info(f'Using instance container image: {base_container_image}.')
    return OpenHandsConfig(
        run_as_openhands=False,
        runtime=os.environ.get('RUNTIME', 'eventstream'),
        sandbox=SandboxConfig(
            base_container_image=base_container_image,
            use_host_network=False,
            timeout=1800,
            api_key=os.environ.get('ALLHANDS_API_KEY'),
            remote_runtime_api_url=os.environ.get(
                'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
            ),
        ),
        workspace_base=None,
        workspace_mount_path=None,
    )
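
# get_config reads RUNTIME, ALLHANDS_API_KEY, and SANDBOX_REMOTE_RUNTIME_API_URL
# from the environment; the defaults above target a local eventstream runtime at
# http://localhost:8000.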


def compute_lexical_metrics(pred_suite, gold_suite):
    pred_loc = get_lines_of_code(pred_suite)
    gold_loc = get_lines_of_code(gold_suite)
    pred_methods = count_methods(pred_suite)
    gold_methods = count_methods(gold_suite)
    readability_pred = compute_readability(pred_suite)
    readability_gold = compute_readability(gold_suite)
    preds = tokenize_code(pred_suite)
    golds = tokenize_code(gold_suite)
    return {
        'pred_loc': pred_loc,
        'gold_loc': gold_loc,
        'pred_readability': readability_pred,
        'gold_readability': readability_gold,
        'pred_methods': pred_methods,
        'gold_methods': gold_methods,
        'bleu': bleu(preds, golds),
        'xmatch': exact_match(preds, golds),
        'edit_sim': edit_sim(preds, golds),
        'rouge_f': rouge_l(golds, preds)['f'],
        'rouge_p': rouge_l(golds, preds)['p'],
        'rouge_r': rouge_l(golds, preds)['r'],
    }
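
# Illustrative usage (the suite strings are hypothetical placeholders):
#   metrics = compute_lexical_metrics(pred_suite_str, gold_suite_str)
#   metrics['bleu'], metrics['rouge_f'], metrics['pred_loc'], ...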


def run_command(runtime, command, timeout=600):
    action = CmdRunAction(command=command)
    action.set_hard_timeout(timeout)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0
    return obs


def run_tests(runtime, instance, test_script, log_file='/tmp/test_output.log'):
    action = CmdRunAction(command=f'bash {test_script} > {log_file} 2>&1 & echo $!')
    action.set_hard_timeout(60)
    obs = runtime.run_action(action)
    assert isinstance(obs, CmdOutputObservation), 'Failed to start test script.'
    pid = obs.content.split()[-1].strip()
    logger.info(f'[{instance.instance_id}] Test process started with PID: {pid}')

    start_time = time.time()
    timeout = 1800
    while True:
        elapsed_time = time.time() - start_time
        if elapsed_time > timeout:
            logger.info(f'[{instance.instance_id}] Test process timed out.')
            instance['test_result']['report']['test_timeout'] = True
            break
        check_action = CmdRunAction(command=f'ps -p {pid} > /dev/null; echo $?')
        check_obs = runtime.run_action(check_action)
        if (
            isinstance(check_obs, CmdOutputObservation)
            and len(check_obs.content.split()) > 0
            and check_obs.content.split()[-1].strip() == '1'
        ):
            logger.info(f'[{instance.instance_id}] Test process completed.')
            break
        time.sleep(30)

    test_action = CmdRunAction(command=f'cat {log_file}')
    test_action.set_hard_timeout(300)
    test_obs = runtime.run_action(test_action)
    assert isinstance(test_obs, CmdOutputObservation), 'Failed to retrieve test output.'
    return test_obs.exit_code, test_obs.content, elapsed_time
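
# Note: run_tests returns (exit code of the `cat` readback, full log contents,
# elapsed seconds); callers below unpack it as `_, test_output, test_time`.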


def run_mutation_testing(
    runtime, instance, mutation_script, log_file='/tmp/mutation_output.log'
):
    action = CmdRunAction(command=f'bash {mutation_script} > {log_file} 2>&1 & echo $!')
    action.set_hard_timeout(60)
    obs = runtime.run_action(action)
    assert isinstance(obs, CmdOutputObservation), 'Failed to start mutation script.'
    pid = obs.content.split()[-1].strip()
    logger.info(f'[{instance.instance_id}] Mutation process started with PID: {pid}')

    start_time = time.time()
    timeout = 4000
    while True:
        elapsed_time = time.time() - start_time
        if elapsed_time > timeout:
            logger.info(f'[{instance.instance_id}] Mutation process timed out.')
            instance['test_result']['report']['mutation_timeout'] = True
            break
        check_action = CmdRunAction(command=f'ps -p {pid} > /dev/null; echo $?')
        check_obs = runtime.run_action(check_action)
        if (
            isinstance(check_obs, CmdOutputObservation)
            and len(check_obs.content.split()) > 0
            and check_obs.content.split()[-1].strip() == '1'
        ):
            logger.info(f'[{instance.instance_id}] Mutation process completed.')
            break
        time.sleep(30)

    assert isinstance(obs, CmdOutputObservation), 'Failed to run mutation script.'
    mutation_action = CmdRunAction(command=f'cat {log_file}')
    mutation_action.set_hard_timeout(300)
    mutation_obs = runtime.run_action(mutation_action)
    assert isinstance(mutation_obs, CmdOutputObservation), (
        'Failed to retrieve mutation output.'
    )
    return mutation_obs.exit_code, mutation_obs.content
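
# Like run_tests above, run_mutation_testing polls every 30 seconds, but with a
# 4000-second cap; the per-mutant timeout is configured separately via the
# mutation.toml written in process_instance.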


def grade_test_output(
    test_suite: str, instance: pd.Series, test_output: str, test_spec: TestSpec, runtime
):
    """
    Two-pass test grading with short-circuiting:
    1. Run all tests to identify passing/failing tests
    2. If no failing tests, evaluate coverage immediately
    3. Otherwise, run only passing tests for coverage analysis
    """
    unit_test_output, coverage_output = '', ''
    if TESTS_SUFFIX in test_output:
        unit_test_output = test_output.split(TESTS_SUFFIX)[0]

    if not unit_test_output:
        return (
            False,
            0,
            '',
            '',
            {
                'total_tests': 0,
                'passing_tests': 0,
                'failing_tests': 0,
                'any_pass': False,
                'all_pass': False,
                'passing_test_names': [],
                'failing_test_names': [],
            },
        )

    logger.info('Calling filter unit tests')
    filtered_content, passing_tests, failing_tests = filter_tests(
        test_suite, unit_test_output, test_spec.repo
    )

    total_tests = len(passing_tests) + len(failing_tests)
    test_stats = {
        'total_tests': total_tests,
        'passing_tests': len(passing_tests),
        'failing_tests': len(failing_tests),
        'any_pass': len(passing_tests) > 0,
        'all_pass': len(failing_tests) == 0 and total_tests > 0,
        'passing_test_names': passing_tests,
        'failing_test_names': failing_tests,
    }

    if not passing_tests:
        return False, 0, unit_test_output, coverage_output, test_stats

    # If all tests pass, evaluate coverage immediately
    if not failing_tests:
        coverage = 0
        cov_success = False
        if COVERAGE_PREFIX in test_output:
            coverage_output = test_output.split(COVERAGE_PREFIX)[1]
            _, coverage = check_coverage(coverage_output, test_spec.code_file)
            cov_success = True
        # test_stats['filtered_suite'] = test_suite
        return cov_success, coverage, unit_test_output, coverage_output, test_stats

    cov_success = False
    coverage = 0
    # Second pass - run coverage on passing tests
    if filtered_content:
        with tempfile.TemporaryDirectory() as temp_dir:
            test_suite_path = os.path.join(temp_dir, 'test_suite.py')
            with open(test_suite_path, 'w') as f:
                f.write(filtered_content)
            runtime.copy_to(test_suite_path, '/tmp')

        run_command(runtime, f'cp /tmp/test_suite.py /testbed/{test_spec.test_file}')
        _, test_output_second_pass, _ = run_tests(runtime, instance, '/tmp/test.sh')

        coverage, coverage_output, unit_test_output = 0, '', test_output_second_pass
        if COVERAGE_PREFIX in test_output_second_pass:
            coverage_output = test_output_second_pass.split(COVERAGE_PREFIX)[1]
            unit_test_output = test_output_second_pass.split(TESTS_SUFFIX)[0]
            _, coverage = check_coverage(coverage_output, test_spec.code_file)
            cov_success = True

    # test_stats['filtered_suite'] = filtered_content
    return cov_success, coverage, unit_test_output, coverage_output, test_stats


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
    log_dir: str | None = None,
) -> EvalOutput:
    """
    Evaluate agent performance on a TestGenEval problem instance.

    Note that this signature differs from the expected input to `run_evaluation`. Use
    `functools.partial` to provide optional arguments before passing to the evaluation harness.

    Args:
        log_dir (str | None, default=None): Path to directory where log files will be written. Must
            be provided if `reset_logger` is set.

    Raises:
        AssertionError: if the `reset_logger` flag is set without a provided log directory.
    """
    if reset_logger:
        assert log_dir is not None, (
            "Can't reset logger without a provided log directory."
        )
        os.makedirs(log_dir, exist_ok=True)
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    config = get_config(instance)
    instance_id = instance.instance_id
    logger.info(f'Starting evaluation for instance {instance_id}.')

    instance['test_result']['id'] = instance_id
    instance['test_result']['report'] = {
        'test_output': '',
        # 'coverage_output': '',
        # 'mutation_output': '',
        'empty_generation': False,
        'error_eval': False,
        'all_tests_pass': False,
        'tests_pass': False,
        'test_timeout': False,
        'mutation_timeout': False,
        'coverage_success': False,
        'mutation_success': False,
        'coverage': 0,
        'mutation_score': 0,
        'mutation_error_interval': -1,
        'num_mutants': -1,
    }
    instance['test_result']['lexical'] = {
        'pred_loc': -1,
        'gold_loc': -1,
        'pred_readability': -1,
        'gold_readability': -1,
        'pred_methods': -1,
        'gold_methods': -1,
        'bleu': -1,
        'xmatch': -1,
        'edit_sim': -1,
        'rouge_f': -1,
        'rouge_p': -1,
        'rouge_r': -1,
    }

    if instance['test_suite'] == '' or instance['test_suite'] is None:
        instance['test_result']['report']['empty_generation'] = True
        return EvalOutput(
            instance_id=instance.instance_id, test_result=instance['test_result']
        )

    if not args.skip_lexical:
        lexical_metrics = compute_lexical_metrics(
            instance['test_suite'], instance['instance']['test_src']
        )
        instance['test_result']['lexical'] = lexical_metrics
    test_suite = instance['test_suite']
    test_spec: TestSpec = instance['test_spec']

    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)

    with tempfile.TemporaryDirectory() as temp_dir:
        test_suite_path = os.path.join(temp_dir, 'test_suite.py')
        with open(test_suite_path, 'w') as f:
            f.write(test_suite)
        runtime.copy_to(test_suite_path, '/tmp')

        test_script_path = os.path.join(temp_dir, 'test.sh')
        with open(test_script_path, 'w') as f:
            f.write(test_spec.test_script)
        runtime.copy_to(test_script_path, '/tmp')

        mutation_script_path = os.path.join(temp_dir, 'mutation.sh')
        with open(mutation_script_path, 'w') as f:
            f.write(test_spec.mutation_script)
        runtime.copy_to(mutation_script_path, '/tmp')

    try:
        run_command(runtime, 'chmod +x /tmp/test.sh /tmp/mutation.sh')
        run_command(runtime, f'cp /tmp/test_suite.py /testbed/{test_spec.test_file}')

        # First pass - run all tests
        _, test_output, test_time = run_tests(runtime, instance, '/tmp/test.sh')

        # Grade tests with two-pass approach
        coverage_success, coverage, unit_test_output, coverage_output, test_stats = (
            grade_test_output(test_suite, instance, test_output, test_spec, runtime)
        )

        # Update report with test statistics
        instance['test_result']['report'].update(
            {
                'test_output': unit_test_output,
                # 'coverage_output': coverage_output,
                'tests_pass': test_stats['any_pass'],  # Changed to use any_pass
                'all_tests_pass': test_stats['all_pass'],  # Added all_pass metric
                'coverage_success': coverage_success,
                'coverage': coverage if coverage_success else 0,
                'test_stats': test_stats,
            }
        )

        # Only run mutation testing if we have passing tests and coverage
        if (
            not args.skip_mutation
            and coverage_success
            and test_stats['any_pass']
            and coverage > 0
        ):
            mutation_timeout = max(10, 1.5 * test_time)
            mutation_toml = MUTATION_TEMPLATE.format(
                test_cmd=test_spec.test_cmd,
                source_fp=test_spec.code_file,
                timeout=mutation_timeout,
            )
            with tempfile.TemporaryDirectory() as temp_dir:
                mutation_toml_path = os.path.join(temp_dir, 'mutation.toml')
                with open(mutation_toml_path, 'w') as f:
                    f.write(mutation_toml)
                runtime.copy_to(mutation_toml_path, '/tmp')

            run_command(runtime, 'cp /tmp/mutation.toml /testbed/mutation.toml')
            mutation_code, mutation_output = run_mutation_testing(
                runtime, instance, '/tmp/mutation.sh'
            )

            # instance['test_result']['report']['mutation_output'] = mutation_output
            if mutation_output and mutation_code == 0:
                (
                    mutation_success,
                    num_mutants,
                    mutation_score,
                    mutation_confidence_interval,
                ) = check_mutation(mutation_output)
                instance['test_result']['report']['num_mutants'] = num_mutants
                instance['test_result']['report']['mutation_success'] = mutation_success
                instance['test_result']['report']['mutation_score'] = mutation_score
                instance['test_result']['report']['mutation_error_interval'] = (
                    mutation_confidence_interval
                )

        return EvalOutput(
            instance_id=instance.instance_id, test_result=instance['test_result']
        )
    except Exception as e:
        logger.error(f'Error processing instance {instance.instance_id}: {e}')
        raise RuntimeError(
            instance.instance_id,
            'Unexpected output...',
            logger,
        )
    finally:
        runtime.close()


def count_and_log_fields(evaluated_predictions, fields, key):
    """
    For each field, log the number of valid entries and their mean, ignoring
    values of -1. If all values for a field are -1, log -1 instead.

    :param evaluated_predictions: DataFrame containing evaluation results
    :param fields: List of field names to aggregate
    :param key: Key to access the field values ('report' or 'lexical')
    """

    def count_field(row, field):
        value = row['test_result'][key][field]
        return (
            value if value != -1 else None
        )  # Ignore -1 fields by treating them as None

    for field in fields:
        # Extract the valid values for the field, ignoring -1
        valid_values = evaluated_predictions.apply(
            count_field, args=(field,), axis=1
        ).dropna()

        if valid_values.empty:  # If all values are -1
            logger.info(f'# {field}: -1 (All values are -1)')
        else:
            count = valid_values.sum()  # Sum of valid values
            length = len(valid_values)  # Count of valid entries
            logger.info(f'# {field}: {length}. ({count / length:.2f})')
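
# Example log line (illustrative values): '# coverage: 120. (0.47)' means 120 rows
# had a valid (non -1) coverage value, with a mean of 0.47.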


if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--input-file', type=str, required=True, help='Path to input predictions file'
    )
    parser.add_argument(
        '--dataset',
        type=str,
        default='kjain14/testgeneval',
        help='Dataset to evaluate on',
    )
    parser.add_argument(
        '--split', type=str, default='test', help='Split to evaluate on'
    )
    parser.add_argument(
        '--skip_mutation', action='store_true', help='Skip mutation testing'
    )
    parser.add_argument(
        '--skip_lexical', action='store_true', help='Skip lexical metrics'
    )
    parser.add_argument(
        '--mutation_timeout',
        type=int,
        default=MUTATION_TIMEOUT,
        help='Mutation timeout',
    )
    parser.add_argument(
        '--mutation_buffer',
        type=int,
        default=MUTATION_BUFFER,
        help='Mutation buffer',
    )
    args, _ = parser.parse_known_args()

    dataset: list[TestGenEvalInstance] = load_testgeneval_dataset(
        args.dataset, args.split
    )
    logger.info(f'Loaded dataset {args.dataset} with split {args.split} for evaluation.')

    # Load predictions
    assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
    predictions = pd.read_json(args.input_file, lines=True)
    assert 'instance_id' in predictions.columns, (
        'Input file must contain instance_id column.'
    )

    if 'test_suite' not in predictions.columns and not (
        'test_result' in predictions.columns
        and 'test_suite' in predictions['test_result'].iloc[0]
    ):
        raise ValueError(
            'Input file must contain test_suite column OR test_result column with test_suite field.'
        )

    if 'instance_id_swebench' not in predictions.columns:
        predictions['instance_id_swebench'] = predictions['instance'].apply(
            lambda x: x['instance_id_swebench']
        )

    if 'instance_id' not in predictions.columns and (
        'instance_id' not in predictions['instance'].iloc[0]
    ):
        raise ValueError(
            'Input file must contain instance_id column OR instance column with instance_id field.'
        )

    if 'instance_id' not in predictions.columns:
        predictions['instance_id'] = predictions['instance'].apply(
            lambda x: x['instance_id']
        )

    if 'test_suite' not in predictions.columns:
        predictions['test_suite'] = predictions['test_result'].apply(
            lambda x: x['test_suite']
        )

    assert len(predictions['instance_id'].unique()) == len(predictions), (
        'instance_id column must be unique.'
    )
    assert {'instance_id_swebench', 'test_suite', 'instance_id'}.issubset(
        set(predictions.columns)
    ), 'Input file must contain instance_id, instance_id_swebench and test_suite columns.'

    predictions['test_spec'] = predictions['instance'].apply(
        lambda x: make_test_spec(x, args.mutation_timeout, args.mutation_buffer)
    )

    output_file = args.input_file.replace('.jsonl', '.testgeneval.jsonl')
    instances = prepare_dataset(predictions, output_file, args.eval_n_limit)

    # If possible, load the relevant metadata to avoid issues with `run_evaluation`.
    metadata: EvalMetadata | None = None
    metadata_filepath = os.path.join(os.path.dirname(args.input_file), 'metadata.json')
    if os.path.exists(metadata_filepath):
        with open(metadata_filepath, 'r') as metadata_file:
            data = metadata_file.read()
            metadata = EvalMetadata.model_validate_json(data)

    # The evaluation harness constrains the signature of `process_instance_func` but we need to
    # pass extra information. Build a new function object to avoid issues with multiprocessing.
    process_instance_func = partial(
        process_instance, log_dir=output_file.replace('.jsonl', '.logs')
    )

    run_evaluation(
        instances,
        metadata=metadata,
        output_file=output_file,
        num_workers=args.eval_num_workers,
        process_instance_func=process_instance_func,
    )

    # Load evaluated predictions & log aggregate metrics per field
    evaluated_predictions = pd.read_json(output_file, lines=True)

    report_fields = [
        'coverage',
        'mutation_score',
        'tests_pass',
        'all_tests_pass',
        'empty_generation',
        'coverage_success',
        'test_timeout',
        'error_eval',
    ]
    lexical_fields = [
        'pred_loc',
        'gold_loc',
        'pred_methods',
        'gold_methods',
        'bleu',
        'xmatch',
        'edit_sim',
        'rouge_f',
        'rouge_p',
        'rouge_r',
    ]

    # Log report and lexical fields
    count_and_log_fields(evaluated_predictions, report_fields, key='report')
    count_and_log_fields(evaluated_predictions, lexical_fields, key='lexical')