Spaces:
Build error
Build error
#!/usr/bin/env python3 | |
import argparse
import json
import sys
from collections import Counter

from openhands.events.serialization import event_from_dict
from openhands.events.utils import get_pairs_from_events
# Substrings searched for in the raw text of an output line to label a failure
# when the record carries no string `error` field. They are tried in order and
# only the first match is counted for a given line.
ERROR_KEYWORDS = [
    'Agent encountered an error while processing the last action',
    'APIError',
    'Action execution failed',
]
# Marker text that identifies a run aborted because the agent kept looping.
STUCK_IN_LOOP_MESSAGE = 'Agent got stuck in a loop'


def split_costs(costs):
    """Split one record's cost entries into main-agent and editor totals.

    Args:
        costs: Iterable of cost entries, or ``None``. An entry is either a
            bare number (legacy format, attributed to the main agent) or a
            mapping with ``'model'`` and ``'cost'`` keys.

    Returns:
        A ``(main_agent_cost, editor_cost)`` tuple. Entries whose model name
        contains ``'draft_editor'`` are charged to the editor.
    """
    main_total = 0
    editor_total = 0
    for entry in costs or []:
        # Accept ints as well as floats: a bare `0` is still a legacy numeric
        # entry, not a mapping, and must not be indexed with ['model'].
        if isinstance(entry, (int, float)):
            main_total += entry
        elif 'draft_editor' in entry['model']:
            editor_total += entry['cost']
        else:
            main_total += entry['cost']
    return main_total, editor_total


def classify_error(error, raw_line, keywords):
    """Decide which error label, if any, applies to one output record.

    Args:
        error: Value of the record's ``error`` field (may be any type).
        raw_line: The unparsed text of the record, scanned for keywords.
        keywords: Ordered substrings to look for in ``raw_line``.

    Returns:
        A ``(label, category)`` tuple. ``category`` is ``'stuck'`` when the
        error text contains the stuck-in-loop marker, ``'explicit'`` for any
        other non-empty error string, ``'keyword'`` when a keyword matched the
        raw line, and ``None`` (with ``label`` also ``None``) otherwise.
    """
    if isinstance(error, str):
        # A string error field is authoritative: the keyword scan is skipped
        # even when the string is empty.
        if STUCK_IN_LOOP_MESSAGE in error:
            return STUCK_IN_LOOP_MESSAGE, 'stuck'
        if error:
            return error, 'explicit'
        return None, None
    for keyword in keywords:
        if keyword in raw_line:
            return keyword, 'keyword'
    return None, None


def collect_stats(lines):
    """Aggregate statistics over an iterable of JSON-encoded output lines.

    Blank lines are ignored. Every parsed record contributes to the cost and
    turn lists; records with an empty test suite are counted as such and are
    excluded from the coverage, mutation-score and error tallies.

    Args:
        lines: Iterable of strings, each holding one JSON object.

    Returns:
        A dict with the keys ``num_lines``, ``num_error_lines``,
        ``num_agent_stuck_in_loop``, ``coverage``, ``mutation_score``,
        ``num_empty_suite``, ``error_counter``, ``main_agent_cost``,
        ``editor_cost`` and ``num_turns``.
    """
    stats = {
        'num_lines': 0,
        'num_error_lines': 0,
        'num_agent_stuck_in_loop': 0,
        'coverage': 0,
        'mutation_score': 0,
        'num_empty_suite': 0,
        'error_counter': Counter(),
        'main_agent_cost': [],
        'editor_cost': [],
        'num_turns': [],
    }

    for line in lines:
        # A trailing newline or stray blank line is not a record.
        if not line.strip():
            continue
        record = json.loads(line)
        stats['num_lines'] += 1

        # Cost
        metrics = record.get('metrics') or {}
        main_cost, editor_cost = split_costs(metrics.get('costs'))
        stats['main_agent_cost'].append(main_cost)
        stats['editor_cost'].append(editor_cost)

        # Turn status
        history = record.get('history') or []
        events = [event_from_dict(event) for event in history]
        stats['num_turns'].append(len(get_pairs_from_events(events)))

        # Suite & resolve status
        suite = (record.get('test_result') or {}).get('test_suite', '')
        if not suite:
            stats['num_empty_suite'] += 1
            continue

        report = record.get('report') or {}
        stats['coverage'] += report.get('coverage') or 0
        stats['mutation_score'] += report.get('mutation_score') or 0

        # Error
        label, category = classify_error(
            record.get('error'), line, ERROR_KEYWORDS
        )
        if label is None:
            continue
        stats['error_counter'][label] += 1
        if category == 'stuck':
            stats['num_agent_stuck_in_loop'] += 1
        elif category == 'keyword':
            # Only keyword matches count as "error lines"; explicit error
            # strings appear in the breakdown alone.
            stats['num_error_lines'] += 1

    return stats


def print_report(stats):
    """Print the summary for stats produced by ``collect_stats``.

    Args:
        stats: Aggregated statistics; ``stats['num_lines']`` must be non-zero
            because every figure is averaged over it.
    """
    num_lines = stats['num_lines']
    coverage = stats['coverage']
    mutation_score = stats['mutation_score']
    num_empty_suite = stats['num_empty_suite']
    num_error_lines = stats['num_error_lines']
    num_agent_stuck_in_loop = stats['num_agent_stuck_in_loop']
    num_turns = stats['num_turns']
    main_agent_cost = stats['main_agent_cost']
    editor_cost = stats['editor_cost']
    error_counter = stats['error_counter']

    # print the error counter (with percentage)
    print(f'Average coverage for {num_lines} ({coverage / num_lines * 100:.2f}%)')
    print(
        f'Average mutation score for {num_lines} ({mutation_score / num_lines * 100:.2f}%)'
    )
    print(
        f'Number of empty suite: {num_empty_suite} / {num_lines} ({num_empty_suite / num_lines * 100:.2f}%)'
    )
    print(
        f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
    )
    print(
        f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
    )

    print('## Statistics')
    print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}')
    print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD')
    print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD')
    print(
        f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD'
    )

    print('## Detailed error breakdown:')
    for error, count in error_counter.items():
        print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')


def main():
    """Parse the command line, summarize the given output file and print it."""
    parser = argparse.ArgumentParser()
    parser.add_argument('output_file', type=str, help='The file to summarize')
    args = parser.parse_args()

    with open(args.output_file, 'r', encoding='utf-8') as file:
        stats = collect_stats(file)

    # Every reported figure divides by the record count, so bail out with a
    # clear message instead of a ZeroDivisionError on an empty file.
    if stats['num_lines'] == 0:
        sys.exit(f'No records found in {args.output_file}; nothing to summarize.')

    print_report(stats)


if __name__ == '__main__':
    main()