import os
import tempfile
import time
from functools import partial

import pandas as pd
from report_utils import (
    check_coverage,
    check_mutation,
    count_methods,
    get_lines_of_code,
)

from evaluation.benchmarks.testgeneval.compute_readability import compute_readability
from evaluation.benchmarks.testgeneval.constants import (
    COVERAGE_PREFIX,
    MUTATION_BUFFER,
    MUTATION_TEMPLATE,
    MUTATION_TIMEOUT,
    TESTS_SUFFIX,
)
from evaluation.benchmarks.testgeneval.metrics import (
    bleu,
    edit_sim,
    exact_match,
    rouge_l,
)
from evaluation.benchmarks.testgeneval.pygments_utils import tokenize_code
from evaluation.benchmarks.testgeneval.run_infer import get_instance_docker_image
from evaluation.benchmarks.testgeneval.test_filter import filter_tests
from evaluation.benchmarks.testgeneval.test_spec import (
    TestGenEvalInstance,
    TestSpec,
    make_test_spec,
)
from evaluation.benchmarks.testgeneval.utils import load_testgeneval_dataset
from evaluation.utils.shared import (
    EvalMetadata,
    EvalOutput,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
)
from openhands.core.config import OpenHandsConfig, SandboxConfig, get_parser
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime
from openhands.events.action import CmdRunAction
from openhands.events.observation import CmdOutputObservation
from openhands.utils.async_utils import call_async_from_sync

DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/kdjain/')
logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')


def get_config(instance: pd.Series) -> OpenHandsConfig:
    base_container_image = get_instance_docker_image(instance['instance_id_swebench'])
    assert base_container_image, (
        f'Invalid container image for instance {instance["instance_id_swebench"]}.'
    )
    logger.info(f'Using instance container image: {base_container_image}.')
    return OpenHandsConfig(
        run_as_openhands=False,
        runtime=os.environ.get('RUNTIME', 'eventstream'),
        sandbox=SandboxConfig(
            base_container_image=base_container_image,
            use_host_network=False,
            timeout=1800,
            api_key=os.environ.get('ALLHANDS_API_KEY'),
            remote_runtime_api_url=os.environ.get(
                'SANDBOX_REMOTE_RUNTIME_API_URL', 'http://localhost:8000'
            ),
        ),
        workspace_base=None,
        workspace_mount_path=None,
    )
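
# Environment variables consulted above and in get_config() (all optional):
#   EVAL_DOCKER_IMAGE_PREFIX        registry prefix for instance images
#   RUNTIME                         OpenHands runtime backend ('eventstream' by default)
#   ALLHANDS_API_KEY                API key for the remote sandbox runtime
#   SANDBOX_REMOTE_RUNTIME_API_URL  remote runtime endpoint ('http://localhost:8000' by default)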


def compute_lexical_metrics(pred_suite, gold_suite):
    pred_loc = get_lines_of_code(pred_suite)
    gold_loc = get_lines_of_code(gold_suite)

    pred_methods = count_methods(pred_suite)
    gold_methods = count_methods(gold_suite)

    readability_pred = compute_readability(pred_suite)
    readability_gold = compute_readability(gold_suite)

    preds = tokenize_code(pred_suite)
    golds = tokenize_code(gold_suite)

    return {
        'pred_loc': pred_loc,
        'gold_loc': gold_loc,
        'pred_readability': readability_pred,
        'gold_readability': readability_gold,
        'pred_methods': pred_methods,
        'gold_methods': gold_methods,
        'bleu': bleu(preds, golds),
        'xmatch': exact_match(preds, golds),
        'edit_sim': edit_sim(preds, golds),
        'rouge_f': rouge_l(golds, preds)['f'],
        'rouge_p': rouge_l(golds, preds)['p'],
        'rouge_r': rouge_l(golds, preds)['r'],
    }


def run_command(runtime, command, timeout=600):
    action = CmdRunAction(command=command)
    action.set_hard_timeout(timeout)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0
    return obs


def run_tests(runtime, instance, test_script, log_file='/tmp/test_output.log'):
    # Launch the test script in the background and capture its PID.
    action = CmdRunAction(command=f'bash {test_script} > {log_file} 2>&1 & echo $!')
    action.set_hard_timeout(60)
    obs = runtime.run_action(action)
    assert isinstance(obs, CmdOutputObservation), 'Failed to start test script.'

    pid = obs.content.split()[-1].strip()
    logger.info(f'[{instance.instance_id}] Test process started with PID: {pid}')

    start_time = time.time()
    timeout = 1800
    while True:
        elapsed_time = time.time() - start_time
        if elapsed_time > timeout:
            logger.info(f'[{instance.instance_id}] Test process timed out.')
            instance['test_result']['report']['test_timeout'] = True
            break

        check_action = CmdRunAction(command=f'ps -p {pid} > /dev/null; echo $?')
        check_obs = runtime.run_action(check_action)
        if (
            isinstance(check_obs, CmdOutputObservation)
            and len(check_obs.content.split()) > 0
            and check_obs.content.split()[-1].strip() == '1'
        ):
            logger.info(f'[{instance.instance_id}] Test process completed.')
            break
        time.sleep(30)

    test_action = CmdRunAction(command=f'cat {log_file}')
    test_action.set_hard_timeout(300)
    test_obs = runtime.run_action(test_action)
    assert isinstance(test_obs, CmdOutputObservation), 'Failed to retrieve test output.'

    return test_obs.exit_code, test_obs.content, elapsed_time


def run_mutation_testing(
    runtime, instance, mutation_script, log_file='/tmp/mutation_output.log'
):
    # Launch the mutation script in the background and capture its PID.
    action = CmdRunAction(command=f'bash {mutation_script} > {log_file} 2>&1 & echo $!')
    action.set_hard_timeout(60)
    obs = runtime.run_action(action)
    assert isinstance(obs, CmdOutputObservation), 'Failed to start mutation script.'

    pid = obs.content.split()[-1].strip()
    logger.info(f'[{instance.instance_id}] Mutation process started with PID: {pid}')

    start_time = time.time()
    timeout = 4000
    while True:
        elapsed_time = time.time() - start_time
        if elapsed_time > timeout:
            logger.info(f'[{instance.instance_id}] Mutation process timed out.')
            instance['test_result']['report']['mutation_timeout'] = True
            break

        check_action = CmdRunAction(command=f'ps -p {pid} > /dev/null; echo $?')
        check_obs = runtime.run_action(check_action)
        if (
            isinstance(check_obs, CmdOutputObservation)
            and len(check_obs.content.split()) > 0
            and check_obs.content.split()[-1].strip() == '1'
        ):
            logger.info(f'[{instance.instance_id}] Mutation process completed.')
            break
        time.sleep(30)

    assert isinstance(obs, CmdOutputObservation), 'Failed to run mutation script.'

    mutation_action = CmdRunAction(command=f'cat {log_file}')
    mutation_action.set_hard_timeout(300)
    mutation_obs = runtime.run_action(mutation_action)
    assert isinstance(mutation_obs, CmdOutputObservation), (
        'Failed to retrieve mutation output.'
    )

    return mutation_obs.exit_code, mutation_obs.content
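
# Both runners above share the same pattern: launch the script detached
# (`bash <script> > <log> 2>&1 & echo $!`), remember the echoed PID, poll
# `ps -p <pid>` every 30 s until the process exits (ps returns 1) or the
# wall-clock budget runs out (1800 s for tests, 4000 s for mutation), then
# `cat` the log file back out of the sandbox.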


def grade_test_output(
    test_suite: str, instance: pd.Series, test_output: str, test_spec: TestSpec, runtime
):
    """
    Two-pass test grading with short-circuiting:
    1. Run all tests to identify passing/failing tests
    2. If no failing tests, evaluate coverage immediately
    3. Otherwise, run only passing tests for coverage analysis
    """
    unit_test_output, coverage_output = '', ''
    if TESTS_SUFFIX in test_output:
        unit_test_output = test_output.split(TESTS_SUFFIX)[0]

    if not unit_test_output:
        return (
            False,
            0,
            '',
            '',
            {
                'total_tests': 0,
                'passing_tests': 0,
                'failing_tests': 0,
                'any_pass': False,
                'all_pass': False,
                'passing_test_names': [],
                'failing_test_names': [],
            },
        )

    logger.info('Calling filter unit tests')
    filtered_content, passing_tests, failing_tests = filter_tests(
        test_suite, unit_test_output, test_spec.repo
    )

    total_tests = len(passing_tests) + len(failing_tests)
    test_stats = {
        'total_tests': total_tests,
        'passing_tests': len(passing_tests),
        'failing_tests': len(failing_tests),
        'any_pass': len(passing_tests) > 0,
        'all_pass': len(failing_tests) == 0 and total_tests > 0,
        'passing_test_names': passing_tests,
        'failing_test_names': failing_tests,
    }

    if not passing_tests:
        return False, 0, unit_test_output, coverage_output, test_stats

    # If all tests pass, evaluate coverage immediately
    if not failing_tests:
        coverage = 0
        cov_success = False
        if COVERAGE_PREFIX in test_output:
            coverage_output = test_output.split(COVERAGE_PREFIX)[1]
            _, coverage = check_coverage(coverage_output, test_spec.code_file)
            cov_success = True
        # test_stats['filtered_suite'] = test_suite
        return cov_success, coverage, unit_test_output, coverage_output, test_stats

    cov_success = False
    coverage = 0

    # Second pass - run coverage on passing tests
    if filtered_content:
        with tempfile.TemporaryDirectory() as temp_dir:
            test_suite_path = os.path.join(temp_dir, 'test_suite.py')
            with open(test_suite_path, 'w') as f:
                f.write(filtered_content)
            runtime.copy_to(test_suite_path, '/tmp')

        run_command(runtime, f'cp /tmp/test_suite.py /testbed/{test_spec.test_file}')
        _, test_output_second_pass, _ = run_tests(runtime, instance, '/tmp/test.sh')

        coverage, coverage_output, unit_test_output = 0, '', test_output_second_pass
        if COVERAGE_PREFIX in test_output_second_pass:
            coverage_output = test_output_second_pass.split(COVERAGE_PREFIX)[1]
            unit_test_output = test_output_second_pass.split(TESTS_SUFFIX)[0]
            _, coverage = check_coverage(coverage_output, test_spec.code_file)
            cov_success = True

    # test_stats['filtered_suite'] = filtered_content
    return cov_success, coverage, unit_test_output, coverage_output, test_stats
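
# For reference, grade_test_output returns a 5-tuple:
#   (cov_success, coverage, unit_test_output, coverage_output, test_stats)
# where test_stats carries total_tests, passing_tests, failing_tests, any_pass,
# all_pass, passing_test_names and failing_test_names, as built above.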


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
    log_dir: str | None = None,
) -> EvalOutput:
    """
    Evaluate agent performance on a TestGenEval problem instance.

    Note that this signature differs from the expected input to `run_evaluation`.
    Use `functools.partial` to provide optional arguments before passing to the
    evaluation harness.

    Args:
        log_dir (str | None, default=None): Path to directory where log files will
            be written. Must be provided if `reset_logger` is set.

    Raises:
        AssertionError: if the `reset_logger` flag is set without a provided log directory.
    """
    if reset_logger:
        assert log_dir is not None, (
            "Can't reset logger without a provided log directory."
        )
        os.makedirs(log_dir, exist_ok=True)
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    config = get_config(instance)
    id = instance.instance_id

    logger.info(f'Starting evaluation for instance {id}.')

    instance['test_result']['id'] = id
    instance['test_result']['report'] = {
        'test_output': '',
        # 'coverage_output': '',
        # 'mutation_output': '',
        'empty_generation': False,
        'error_eval': False,
        'all_tests_pass': False,
        'tests_pass': False,
        'test_timeout': False,
        'mutation_timeout': False,
        'coverage_success': False,
        'mutation_success': False,
        'coverage': 0,
        'mutation_score': 0,
        'mutation_error_interval': -1,
        'num_mutants': -1,
    }

    instance['test_result']['lexical'] = {
        'pred_loc': -1,
        'gold_loc': -1,
        'pred_readability': -1,
        'gold_readability': -1,
        'pred_methods': -1,
        'gold_methods': -1,
        'bleu': -1,
        'xmatch': -1,
        'edit_sim': -1,
        'rouge_f': -1,
        'rouge_p': -1,
        'rouge_r': -1,
    }

    if instance['test_suite'] == '' or instance['test_suite'] is None:
        instance['test_result']['report']['empty_generation'] = True
        return EvalOutput(
            instance_id=instance.instance_id, test_result=instance['test_result']
        )

    if not args.skip_lexical:
        lexical_metrics = compute_lexical_metrics(
            instance['test_suite'], instance['instance']['test_src']
        )
        instance['test_result']['lexical'] = lexical_metrics

    test_suite = instance['test_suite']
    test_spec: TestSpec = instance['test_spec']

    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)

    with tempfile.TemporaryDirectory() as temp_dir:
        test_suite_path = os.path.join(temp_dir, 'test_suite.py')
        with open(test_suite_path, 'w') as f:
            f.write(test_suite)
        runtime.copy_to(test_suite_path, '/tmp')

        test_script_path = os.path.join(temp_dir, 'test.sh')
        with open(test_script_path, 'w') as f:
            f.write(test_spec.test_script)
        runtime.copy_to(test_script_path, '/tmp')

        mutation_script_path = os.path.join(temp_dir, 'mutation.sh')
        with open(mutation_script_path, 'w') as f:
            f.write(test_spec.mutation_script)
        runtime.copy_to(mutation_script_path, '/tmp')

    try:
        run_command(runtime, 'chmod +x /tmp/test.sh /tmp/mutation.sh')
        run_command(runtime, f'cp /tmp/test_suite.py /testbed/{test_spec.test_file}')

        # First pass - run all tests
        _, test_output, test_time = run_tests(runtime, instance, '/tmp/test.sh')

        # Grade tests with two-pass approach
        coverage_success, coverage, unit_test_output, coverage_output, test_stats = (
            grade_test_output(test_suite, instance, test_output, test_spec, runtime)
        )

        # Update report with test statistics
        instance['test_result']['report'].update(
            {
                'test_output': unit_test_output,
                # 'coverage_output': coverage_output,
                'tests_pass': test_stats['any_pass'],  # Changed to use any_pass
                'all_tests_pass': test_stats['all_pass'],  # Added all_pass metric
                'coverage_success': coverage_success,
                'coverage': coverage if coverage_success else 0,
                'test_stats': test_stats,
            }
        )

        # Only run mutation testing if we have passing tests and coverage
        if (
            not args.skip_mutation
            and coverage_success
            and test_stats['any_pass']
            and coverage > 0
        ):
            mutation_timeout = max(10, 1.5 * test_time)
            mutation_toml = MUTATION_TEMPLATE.format(
                test_cmd=test_spec.test_cmd,
                source_fp=test_spec.code_file,
                timeout=mutation_timeout,
            )
            with tempfile.TemporaryDirectory() as temp_dir:
                mutation_toml_path = os.path.join(temp_dir, 'mutation.toml')
                with open(mutation_toml_path, 'w') as f:
                    f.write(mutation_toml)
                runtime.copy_to(mutation_toml_path, '/tmp')

            run_command(runtime, 'cp /tmp/mutation.toml /testbed/mutation.toml')

            mutation_code, mutation_output = run_mutation_testing(
                runtime, instance, '/tmp/mutation.sh'
            )
            # instance['test_result']['report']['mutation_output'] = mutation_output

            if mutation_output and mutation_code == 0:
                (
                    mutation_success,
                    num_mutants,
                    mutation_score,
                    mutation_confidence_interval,
                ) = check_mutation(mutation_output)
                instance['test_result']['report']['num_mutants'] = num_mutants
                instance['test_result']['report']['mutation_success'] = mutation_success
                instance['test_result']['report']['mutation_score'] = mutation_score
                instance['test_result']['report']['mutation_error_interval'] = (
                    mutation_confidence_interval
                )

        return EvalOutput(
            instance_id=instance.instance_id, test_result=instance['test_result']
        )
    except Exception as e:
        logger.error(f'Error processing instance {instance.instance_id}: {e}')
        raise RuntimeError(
            instance.instance_id,
            'Unexpected output...',
            logger,
        )
    finally:
        runtime.close()
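
# process_instance is wired into the harness via functools.partial in the
# __main__ block below, since run_evaluation expects a fixed call signature.
# Note that it also reads the module-level `args` parsed in __main__
# (skip_lexical, skip_mutation), so it is only usable after argument parsing.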


def count_and_log_fields(evaluated_predictions, fields, key):
    """
    Count and log the sum of specified fields in the evaluated predictions,
    ignoring fields with a value of -1. If all values for a field are -1, log -1.

    :param evaluated_predictions: DataFrame containing evaluation results
    :param fields: List of field names to count
    :param key: Key to access the field values ('report' or 'lexical')
    """

    def count_field(row, field):
        value = row['test_result'][key][field]
        return (
            value if value != -1 else None
        )  # Ignore -1 fields by treating them as None

    for field in fields:
        # Extract the valid values for the field, ignoring -1
        valid_values = evaluated_predictions.apply(
            count_field, args=(field,), axis=1
        ).dropna()

        if valid_values.empty:  # If all values are -1
            logger.info(f'# {field}: -1 (All values are -1)')
        else:
            count = valid_values.sum()  # Sum of valid values
            length = len(valid_values)  # Count of valid entries
            logger.info(f'# {field}: {length}. ({count / length:.2f})')
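
# Example of the -1 sentinel handling above: for values [0.8, -1, 0.4] the -1
# entry is dropped, so the log line reports 2 valid entries with a mean of
# (0.8 + 0.4) / 2 = 0.60.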


if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--input-file', type=str, required=True, help='Path to input predictions file'
    )
    parser.add_argument(
        '--dataset',
        type=str,
        default='kjain14/testgeneval',
        help='Dataset to evaluate on',
    )
    parser.add_argument(
        '--split', type=str, default='test', help='Split to evaluate on'
    )
    parser.add_argument(
        '--skip_mutation', action='store_true', help='Skip mutation testing'
    )
    parser.add_argument(
        '--skip_lexical', action='store_true', help='Skip lexical metrics'
    )
    parser.add_argument(
        '--mutation_timeout',
        type=int,
        default=MUTATION_TIMEOUT,
        help='Mutation timeout',
    )
    parser.add_argument(
        '--mutation_buffer',
        type=int,
        default=MUTATION_BUFFER,
        help='Mutation buffer',
    )
    args, _ = parser.parse_known_args()

    dataset: list[TestGenEvalInstance] = load_testgeneval_dataset(
        args.dataset, args.split
    )
    logger.info(
        f'Loaded dataset {args.dataset} with split {args.split} to run inference on.'
    )

    # Load predictions
    assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
    predictions = pd.read_json(args.input_file, lines=True)
    assert 'instance_id' in predictions.columns, (
        'Input file must contain instance_id column.'
    )

    if 'test_suite' not in predictions.columns and not (
        'test_result' in predictions.columns
        and 'test_suite' in predictions['test_result'].iloc[0]
    ):
        raise ValueError(
            'Input file must contain test_suite column OR test_result column with test_suite field.'
        )

    if 'instance_id_swebench' not in predictions.columns:
        predictions['instance_id_swebench'] = predictions['instance'].apply(
            lambda x: x['instance_id_swebench']
        )

    if 'instance_id' not in predictions.columns and not (
        'instance_id' in predictions['instance'].iloc[0]
    ):
        raise ValueError(
            'Input file must contain id column OR instance column with id field.'
        )

    if 'instance_id' not in predictions.columns:
        predictions['instance_id'] = predictions['instance'].apply(
            lambda x: x['instance_id']
        )

    if 'test_suite' not in predictions.columns:
        predictions['test_suite'] = predictions['test_result'].apply(
            lambda x: x['test_suite']
        )

    assert len(predictions['instance_id'].unique()) == len(predictions), (
        'instance_id column must be unique.'
    )

    assert {'instance_id_swebench', 'test_suite', 'instance_id'}.issubset(
        set(predictions.columns)
    ), 'Input file must contain id, instance_id and test_suite columns.'

    predictions['test_spec'] = predictions['instance'].apply(
        lambda x: make_test_spec(x, args.mutation_timeout, args.mutation_buffer)
    )
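
    # For reference, each line of the input JSONL parsed above is expected to look
    # roughly like (illustrative sketch, field contents abbreviated):
    #   {"instance_id": "...",
    #    "instance": {"instance_id_swebench": "...", "instance_id": "...", "test_src": "..."},
    #    "test_suite": "import pytest\n...",
    #    "test_result": {"test_suite": "..."}}
    # with test_suite accepted either at the top level or nested under test_result.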

    output_file = args.input_file.replace('.jsonl', '.testgeneval.jsonl')
    instances = prepare_dataset(predictions, output_file, args.eval_n_limit)

    # If possible, load the relevant metadata to avoid issues with `run_evaluation`.
    metadata: EvalMetadata | None = None
    metadata_filepath = os.path.join(os.path.dirname(args.input_file), 'metadata.json')
    if os.path.exists(metadata_filepath):
        with open(metadata_filepath, 'r') as metadata_file:
            data = metadata_file.read()
            metadata = EvalMetadata.model_validate_json(data)

    # The evaluation harness constrains the signature of `process_instance_func` but
    # we need to pass extra information. Build a new function object to avoid issues
    # with multiprocessing.
    process_instance_func = partial(
        process_instance, log_dir=output_file.replace('.jsonl', '.logs')
    )

    run_evaluation(
        instances,
        metadata=metadata,
        output_file=output_file,
        num_workers=args.eval_num_workers,
        process_instance_func=process_instance_func,
    )

    # Load evaluated predictions & print number of resolved predictions
    evaluated_predictions = pd.read_json(output_file, lines=True)

    report_fields = [
        'coverage',
        'mutation_score',
        'tests_pass',
        'all_tests_pass',
        'empty_generation',
        'coverage_success',
        'test_timeout',
        'error_eval',
    ]
    lexical_fields = [
        'pred_loc',
        'gold_loc',
        'pred_methods',
        'gold_methods',
        'bleu',
        'xmatch',
        'edit_sim',
        'rouge_f',
        'rouge_p',
        'rouge_r',
    ]

    # Log report and lexical fields
    count_and_log_fields(evaluated_predictions, report_fields, key='report')
    count_and_log_fields(evaluated_predictions, lexical_fields, key='lexical')
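
# Example invocation (illustrative only: the flags shown are the ones defined in
# this script; the eval_n_limit / eval_num_workers options referenced above come
# from the shared OpenHands get_parser() and their exact flag spellings are not
# shown here):
#   python <path/to/this/script>.py \
#       --input-file predictions.jsonl \
#       --dataset kjain14/testgeneval --split test \
#       --skip_mutation --skip_lexical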