#!/usr/bin/env python3
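"""Summarize OpenHands evaluation output files (output.jsonl).

Given a single output.jsonl file or a directory containing them, this script
reports resolve rate (with a bootstrap confidence interval), empty-patch rate,
unfinished runs, an error breakdown, costs, and average turns per instance.
It takes one positional input path and an optional --output path
(default: summary_results.jsonl).
"""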
import argparse
import glob
import json
import os
import random
from collections import Counter

import numpy as np
import pandas as pd

from openhands.events.serialization import event_from_dict
from openhands.events.utils import get_pairs_from_events
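
# Substrings that mark an instance as errored when scanning raw output lines.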
ERROR_KEYWORDS = [
    'Agent encountered an error while processing the last action',
    'APIError',
    'Action execution failed',
    'litellm.Timeout: APITimeoutError',
]

def get_bootstrap_accuracy_error_bars(
    values: list[float | int | bool], num_samples: int = 1000, p_value: float = 0.05
) -> tuple[float, float]:
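    """Bootstrap a (1 - p_value) confidence interval for the mean of `values`.

    Averages `num_samples` random half-size subsamples (drawn without
    replacement) and returns the empirical (p_value / 2, 1 - p_value / 2)
    quantiles of those means.
    """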
    sorted_vals = np.sort(
        [np.mean(random.sample(values, len(values) // 2)) for _ in range(num_samples)]
    )
    bottom_idx = int(num_samples * p_value / 2)
    top_idx = int(num_samples * (1.0 - p_value / 2))
    return (sorted_vals[bottom_idx], sorted_vals[top_idx])

def process_file(file_path):
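    """Compute summary statistics for a single output.jsonl file.

    Returns a nested dict with resolve, empty-patch, and unfinished counts and
    percentages, an error breakdown, total costs, and per-instance averages.
    """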
    with open(file_path, 'r') as file:
        lines = file.readlines()

    num_lines = len(lines)
    num_error_lines = 0
    num_agent_stuck_in_loop = 0
    num_resolved = 0
    resolved_arr = []
    num_empty_patch = 0
    num_unfinished_runs = 0
    error_counter = Counter()
    main_agent_cost = []
    editor_cost = []
    num_turns = []
    for line in lines:
        _d = json.loads(line)

        if 'metrics' not in _d or _d['metrics'] is None:
            # this is a failed run
            num_unfinished_runs += 1
            continue

        # Cost
        costs = _d['metrics'].get('costs', [])
        _cur_main_agent_cost = 0
        _cur_editor_cost = 0
        for cost in costs:
            if isinstance(cost, float):
                # backward compatible
                _cur_main_agent_cost += cost
            else:
                if 'draft_editor' in cost['model']:
                    _cur_editor_cost += cost['cost']
                else:
                    _cur_main_agent_cost += cost['cost']
        main_agent_cost.append(_cur_main_agent_cost)
        editor_cost.append(_cur_editor_cost)

        # Turn status
        history = _d.get('history', [])
        events = [event_from_dict(event) for event in history]
        pairs = get_pairs_from_events(events)
        num_turns.append(len(pairs))

        # Patch & resolve status
        patch = _d.get('test_result', {}).get('git_patch', '')
        if patch == '':
            num_empty_patch += 1
            continue

        report = _d.get('report', {}) or {}
        resolved = report.get('resolved', False)
        if resolved:
            num_resolved += 1
            resolved_arr.append(1)
        else:
            resolved_arr.append(0)

        # Error
        error = _d.get('error', None)
        if error is not None and isinstance(error, str):
            agent_stuck_in_loop = 'Agent got stuck in a loop' in error
            contains_error = bool(error) and not agent_stuck_in_loop
            if agent_stuck_in_loop:
                error_counter['Agent got stuck in a loop'] += 1
                num_agent_stuck_in_loop += 1
            elif contains_error:
                error_counter[error] += 1
            continue
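        # Fallback: no explicit error field, so scan the raw line for known error keywords.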
        for keyword in ERROR_KEYWORDS:
            if keyword in line:
                error_counter[keyword] += 1
                num_error_lines += 1
                break
    return {
        'file_path': file_path,
        'total_instances': num_lines,
        'resolved': {
            'count': num_resolved,
            'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
            'ci': tuple(
                x * 100 for x in get_bootstrap_accuracy_error_bars(resolved_arr)
            ),
        },
        'empty_patches': {
            'count': num_empty_patch,
            'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0,
        },
        'unfinished_runs': {
            'count': num_unfinished_runs,
            'percentage': (num_unfinished_runs / num_lines * 100)
            if num_lines > 0
            else 0,
        },
        'errors': {
            'total': num_error_lines,
            'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0,
            'stuck_in_loop': {
                'count': num_agent_stuck_in_loop,
                'percentage': (num_agent_stuck_in_loop / num_lines * 100)
                if num_lines > 0
                else 0,
            },
            'breakdown': {
                str(error): {
                    'count': count,
                    'percentage': (count / num_lines * 100) if num_lines > 0 else 0,
                }
                for error, count in error_counter.items()
            },
        },
        'costs': {
            'main_agent': sum(main_agent_cost),
            'editor': sum(editor_cost),
            'total': sum(main_agent_cost) + sum(editor_cost),
        },
        'statistics': {
            'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
            'costs': {
                'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0,
                'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0,
                'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines
                if num_lines > 0
                else 0,
            },
        },
    }

def aggregate_directory(input_path) -> pd.DataFrame:
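    """Summarize every **/output.jsonl under `input_path` into a DataFrame.

    Files that fail to parse are reported and skipped. Rows gain flattened
    convenience columns and are sorted by resolve rate in descending order.
    """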
    # Process all output.jsonl files in subdirectories
    pattern = os.path.join(input_path, '**/output.jsonl')
    files = glob.glob(pattern, recursive=True)
    print(f'Processing {len(files)} files from directory {input_path}')

    # Process each file silently and collect results
    results = []
    for file_path in files:
        try:
            result = process_file(file_path)
            results.append(result)
        except Exception as e:
            print(f'Error processing {file_path}: {str(e)}')
            import traceback

            traceback.print_exc()
            continue

    # Convert results to pandas DataFrame and sort by resolve rate
    df = pd.DataFrame(results)

    # Extract directory name from file path
    df['directory'] = df['file_path'].apply(
        lambda x: os.path.basename(os.path.dirname(x))
    )
    df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
    df['resolve_rate_ci'] = df['resolved'].apply(lambda x: x['ci'])
    df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
    df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
    df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
    df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
    df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])
    df = df.sort_values('resolve_rate', ascending=False)
    return df

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_path', type=str, help='The file or directory to summarize'
    )
    parser.add_argument(
        '--output',
        type=str,
        help='Output JSONL file for results',
        default='summary_results.jsonl',
    )
    args = parser.parse_args()

    if os.path.isdir(args.input_path):
        df = aggregate_directory(args.input_path)

        # Create the summary string
        columns = [
            'directory',
            'resolve_rate',
            'empty_patch_rate',
            'unfinished_rate',
            'error_rate',
            'avg_turns',
            'avg_cost',
            'total_instances',
        ]
        summary_str = df[columns].to_string(
            float_format=lambda x: '{:.2f}'.format(x),
            formatters={
                'directory': lambda x: x[:90]
            },  # Truncate directory names to 90 chars
            index=False,
        )
        # Print to console
        print('\nResults summary (sorted by resolve rate):')
        print(summary_str)

        # Save to text file
        txt_output = args.output.rsplit('.', 1)[0] + '.txt'
        with open(txt_output, 'w') as f:
            f.write('Results summary (sorted by resolve rate):\n')
            f.write(summary_str)

        # Save full results as JSONL and the summary columns as CSV
        df.to_json(args.output, lines=True, orient='records')
        df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False)
    else:
        # Process single file with detailed output
        results = []
        try:
            result = process_file(args.input_path)
            results.append(result)

            # Print detailed results for single file
            print(f'\nResults for {args.input_path}:')
            print(
                f'Number of resolved: {result["resolved"]["count"]} / {result["total_instances"]} ({result["resolved"]["percentage"]:.2f}% [{result["resolved"]["ci"][0]:.2f}%, {result["resolved"]["ci"][1]:.2f}%])'
            )
            print(
                f'Number of empty patch: {result["empty_patches"]["count"]} / {result["total_instances"]} ({result["empty_patches"]["percentage"]:.2f}%)'
            )
            print(
                f'Number of error lines: {result["errors"]["total"]} / {result["total_instances"]} ({result["errors"]["percentage"]:.2f}%)'
            )
            print(
                f'Number of agent stuck in loop: {result["errors"]["stuck_in_loop"]["count"]} / {result["total_instances"]} ({result["errors"]["stuck_in_loop"]["percentage"]:.2f}%)'
            )
            print(
                f'Number of unfinished runs: {result["unfinished_runs"]["count"]} / {result["total_instances"]} ({result["unfinished_runs"]["percentage"]:.2f}%)'
            )
            print(f'Total cost: {result["costs"]["total"]:.2f} USD')

            print('## Statistics')
            print(
                f'Avg. num of turns per instance: {result["statistics"]["avg_turns"]:.2f}'
            )
            print(
                f'Avg. agent cost per instance: {result["statistics"]["costs"]["main_agent"]:.2f} USD'
            )
            print(
                f'Avg. editor cost per instance: {result["statistics"]["costs"]["editor"]:.2f} USD'
            )
            print(
                f'Avg. total cost per instance: {result["statistics"]["costs"]["total"]:.2f} USD'
            )

            print('## Detailed error breakdown:')
            for error, data in result['errors']['breakdown'].items():
                print(f'{error}: {data["count"]} ({data["percentage"]:.2f}%)')
        except Exception as e:
            print(f'Error processing {args.input_path}: {str(e)}')