Spaces:
Build error
Build error
import argparse | |
import numpy as np | |
import pandas as pd | |
def extract_test_results(df: pd.DataFrame) -> tuple[list[str], list[str]]:
    """Split instance ids into those whose test passed and those that failed.

    A row counts as passed only when its ``test_result`` value is a dict
    whose ``exit_code`` entry equals 0. Every other row is reported as
    failed, including rows with no ``test_result`` column, a non-dict
    value such as ``None``/NaN, or no ``exit_code`` key.

    Args:
        df: One row per benchmark instance. Must have an ``instance_id``
            column; ``test_result`` is optional.

    Returns:
        A ``(passed, failed)`` tuple of instance-id lists, each in row order.

    Raises:
        KeyError: If a row has no ``instance_id``.
    """
    passed: list[str] = []
    failed: list[str] = []
    for _, row in df.iterrows():
        test_result = row.get('test_result')
        # Require a real dict: a missing result shows up as None or NaN, and
        # a membership test against those would raise TypeError.
        resolved = (
            isinstance(test_result, dict) and test_result.get('exit_code') == 0
        )
        (passed if resolved else failed).append(row['instance_id'])
    return passed, failed
def visualize_results(df: pd.DataFrame):
    """Print summary statistics for a benchmark run and return the resolve rate.

    Reads three columns from ``df``: ``metrics`` (records with an
    ``accumulated_cost`` entry), ``test_result`` (records with an
    ``exit_code`` entry) and ``history`` (sized sequences). It prints the
    pass count, descriptive statistics and bin counts for the number of
    actions and for the cost.

    A row counts as passed only when its exit code equals 0. A missing or
    non-dict record counts as a failure.

    Args:
        df: One row per benchmark instance. Any index is accepted.

    Returns:
        The percentage of passed rows, rounded to two decimals. Returns
        0.0 for an empty frame.
    """
    total = df.shape[0]
    if total == 0:
        # Nothing to summarise; also avoids dividing by zero below.
        print('Number of passed tests:', '0/0 0.00%')
        return 0.0

    def field(record, key):
        """Return ``record[key]``, or NaN if the record is not a dict or lacks it."""
        value = record.get(key) if isinstance(record, dict) else None
        return np.nan if value is None else value

    # Build on df's own index so every derived column lines up row-for-row,
    # even when df was filtered or re-indexed before being passed in.
    df1 = pd.DataFrame(index=df.index)
    df1['cost'] = df['metrics'].apply(lambda m: field(m, 'accumulated_cost'))
    exit_codes = df['test_result'].apply(lambda r: field(r, 'exit_code'))
    df1['result'] = np.where(exit_codes == 0, 'Pass', 'Fail')
    # One less than the history length, as in the original computation.
    df1['actions'] = df['history'].apply(len) - 1

    passed = int((df1['result'] == 'Pass').sum())
    resolve_rate = round((passed / total) * 100, 2)
    print('Number of passed tests:', f'{passed}/{total} {resolve_rate:.2f}%')

    print('\nDescriptive statistics for number of actions:')
    print(df1['actions'].describe())
    print('\nDescriptive statistics for costs:')
    print(df1['cost'].describe())

    # Bin counts for actions. Bins are right-closed, width 2, over (0, 30];
    # values outside that range fall in no bin and are not counted.
    action_bins = pd.cut(df1['actions'], bins=list(range(0, 32, 2)))
    print('\nAction bin counts:')
    print(action_bins.value_counts().sort_index())

    # Bin counts for costs: ten equal-width bins over the observed range.
    cost_bins = pd.cut(df1['cost'], bins=10)
    print('\nCost bin counts:')
    print(cost_bins.value_counts().sort_index())

    return resolve_rate
if __name__ == '__main__':
    # Command-line entry point: one positional argument, the results file.
    cli = argparse.ArgumentParser(description='Summarize AiderBench results')
    cli.add_argument('input_filepath', type=str, help='Path to the JSONL file')
    input_filepath = cli.parse_args().input_filepath

    # The input is JSON Lines: each line is parsed as one record.
    results_df = pd.read_json(input_filepath, lines=True)

    passed_tests, failed_tests = extract_test_results(results_df)
    resolve_rate = visualize_results(results_df)

    print(
        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {resolve_rate:.2f}%'
    )
    # List the instance ids in each group, heading first.
    for heading, instance_ids in (
        ('PASSED TESTS:', passed_tests),
        ('FAILED TESTS:', failed_tests),
    ):
        print(heading)
        print(instance_ids)