File size: 2,774 Bytes
246d201 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import json
import pprint
import sys
def extract_test_results(
    res_file_path: str,
) -> tuple[list[dict], list[dict], list[float]]:
    """Split the records of a JSONL results file into passed and failed runs.

    Each non-blank line of the file is parsed as a JSON object that must
    contain the keys ``instance_id``, ``repo``, ``instruction``,
    ``eval_script``, ``eval_exit_code``, ``eval_output`` and a ``metrics``
    object holding ``success`` and ``accumulated_cost``.

    Only the first record seen for a given ``instance_id`` is kept; later
    duplicates trigger a warning on stdout and are dropped.

    Side effect: the file at ``res_file_path`` is rewritten in place with the
    de-duplicated records sorted by ``instance_id``, one JSON object per line.

    Args:
        res_file_path: Path to the JSONL results file.

    Returns:
        A ``(passed, failed, costs)`` tuple. ``passed`` and ``failed`` hold
        one summary dict per kept record (in file order), and ``costs`` holds
        the ``accumulated_cost`` of every kept record (in file order).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    # Fields copied verbatim from each record into its summary entry.
    summary_fields = (
        'instance_id',
        'repo',
        'instruction',
        'eval_script',
        'eval_exit_code',
        'eval_output',
    )
    passed = []
    failed = []
    costs = []
    seen_ids = set()
    instances = []
    with open(res_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no record; json.loads('') would raise on them.
                continue
            data = json.loads(line)
            metrics = data['metrics']
            success = metrics['success']
            instance_id = data['instance_id']
            if instance_id in seen_ids:
                print(f'WARNING: Duplicate instance_id found: {instance_id}')
                continue
            seen_ids.add(instance_id)
            instances.append(data)

            summary = {field: data[field] for field in summary_fields}
            summary['accumulated_cost'] = metrics['accumulated_cost']
            (passed if success else failed).append(summary)
            costs.append(metrics['accumulated_cost'])

    # Persist the de-duplicated records back to the same file, ordered by
    # instance_id so repeated runs produce a stable file.
    instances.sort(key=lambda record: record['instance_id'])
    with open(res_file_path, 'w', encoding='utf-8') as file:
        file.writelines(json.dumps(instance) + '\n' for instance in instances)
    return passed, failed, costs
if __name__ == '__main__':
    # Script entry point: summarise the JSONL results file named on the
    # command line, printing the passed/failed records and overall statistics.
    if len(sys.argv) != 2:
        print(
            'Usage: poetry run python summarise_results.py <path_to_output_jsonl_file>'
        )
        sys.exit(1)

    json_file_path = sys.argv[1]
    passed_tests, failed_tests, costs = extract_test_results(json_file_path)

    # A results file with no records would otherwise divide by zero; report
    # 0.0 for both figures in that case instead of crashing.
    total_tests = len(passed_tests) + len(failed_tests)
    success_rate = len(passed_tests) / total_tests if total_tests else 0.0
    average_cost = sum(costs) / len(costs) if costs else 0.0

    print('PASSED TESTS:')
    pprint.pprint(passed_tests)
    print('FAILED TESTS:')
    pprint.pprint(failed_tests)
    print(
        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, success rate = {success_rate}, average cost = {average_cost}'
    )
|