Spaces:

Backup-bdg
/

OpenHands

Build error

App Files Files Community

OpenHands / evaluation /benchmarks /testgeneval /scripts /eval /summarize_outputs.py

Backup-bdg

Upload 964 files

51ff9e5 verified 8 days ago

raw

history blame

4 kB

	#!/usr/bin/env python3
	import argparse
	import json
	from collections import Counter

	from openhands.events.serialization import event_from_dict
	from openhands.events.utils import get_pairs_from_events

	# Substrings looked for in the raw text of each output line when the record
	# carries no string `error` field. Order matters: the scan stops at the
	# first keyword found, so earlier entries take precedence.
	ERROR_KEYWORDS = [
	    'Agent encountered an error while processing the last action',
	    'APIError',
	    'Action execution failed',
	]

	if __name__ == '__main__':
	    # Summarize an evaluation output file that holds one JSON record per line.
	    # Prints average coverage / mutation score, counts of empty suites and
	    # errors, per-instance turn and cost averages, and an error breakdown.
	    parser = argparse.ArgumentParser()
	    parser.add_argument('output_file', type=str, help='The file to summarize')
	    args = parser.parse_args()

	    with open(args.output_file, 'r') as file:
	        # Drop blank lines (e.g. a trailing newline at EOF): json.loads('')
	        # would raise, and they must not be counted as instances.
	        lines = [line for line in file if line.strip()]

	    num_lines = len(lines)
	    if num_lines == 0:
	        # Every statistic below divides by num_lines; exit with a clear
	        # message instead of failing with ZeroDivisionError.
	        raise SystemExit(f'No records found in {args.output_file}')

	    num_error_lines = 0
	    num_agent_stuck_in_loop = 0

	    coverage = 0
	    mutation_score = 0
	    num_empty_suite = 0

	    error_counter = Counter()

	    main_agent_cost = []
	    editor_cost = []
	    num_turns = []

	    for line in lines:
	        record = json.loads(line)

	        # Cost: split the accumulated cost between the draft editor model
	        # and everything else (the main agent).
	        costs = (record.get('metrics') or {}).get('costs') or []
	        agent_cost = 0
	        draft_editor_cost = 0
	        for cost in costs:
	            if isinstance(cost, (int, float)):
	                # Backward compatible: older records store bare numbers.
	                # Integers (e.g. 0) are accepted as well as floats.
	                agent_cost += cost
	            elif 'draft_editor' in cost['model']:
	                draft_editor_cost += cost['cost']
	            else:
	                agent_cost += cost['cost']

	        main_agent_cost.append(agent_cost)
	        editor_cost.append(draft_editor_cost)

	        # Turn status: one turn per pair returned by get_pairs_from_events.
	        history = record.get('history') or []
	        events = [event_from_dict(event) for event in history]
	        num_turns.append(len(get_pairs_from_events(events)))

	        # Suite & resolve status. A record without a generated test suite
	        # contributes nothing to coverage/mutation score and is excluded
	        # from the error accounting below.
	        suite = (record.get('test_result') or {}).get('test_suite', '')
	        if not suite:
	            num_empty_suite += 1
	            continue

	        report = record.get('report') or {}
	        # `or 0` guards against explicit nulls in the report.
	        coverage += report.get('coverage') or 0
	        mutation_score += report.get('mutation_score') or 0

	        # Error: a string `error` field is authoritative. Loop-stuck errors
	        # are grouped under one key; any other non-empty message is counted
	        # verbatim. Either way the keyword scan below is skipped.
	        error = record.get('error', None)
	        if isinstance(error, str):
	            if 'Agent got stuck in a loop' in error:
	                error_counter['Agent got stuck in a loop'] += 1
	                num_agent_stuck_in_loop += 1
	            elif error:
	                error_counter[error] += 1
	            continue

	        # No explicit error field: look for known error keywords in the raw
	        # line and count only the first one that matches.
	        matched = next((kw for kw in ERROR_KEYWORDS if kw in line), None)
	        if matched is not None:
	            error_counter[matched] += 1
	            num_error_lines += 1

	    # Averages are taken over all instances; instances with an empty suite
	    # contribute 0 to coverage and mutation score.
	    print(f'Average coverage for {num_lines} ({coverage / num_lines * 100:.2f}%)')
	    print(
	        f'Average mutation score for {num_lines} ({mutation_score / num_lines * 100:.2f}%)'
	    )

	    print(
	        f'Number of empty suite: {num_empty_suite} / {num_lines} ({num_empty_suite / num_lines * 100:.2f}%)'
	    )
	    print(
	        f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
	    )
	    print(
	        f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
	    )

	    # Internal invariants: cost and turn lists get exactly one entry per
	    # record, appended before any `continue` in the loop above.
	    assert len(num_turns) == num_lines
	    assert len(main_agent_cost) == num_lines
	    assert len(editor_cost) == num_lines

	    print('## Statistics')
	    print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}')
	    print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD')
	    print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD')
	    print(
	        f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD'
	    )

	    # Error counter, each entry with its share of all instances.
	    print('## Detailed error breakdown:')
	    for error, count in error_counter.items():
	        print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')