Spaces:
Build error
Build error
import json | |
import pprint | |
import sys | |
def extract_test_results(
    res_file_path: str,
) -> tuple[list[dict], list[dict], list[float]]:
    """Summarise a JSONL results file and rewrite it sorted and de-duplicated.

    Each non-blank line of the file is parsed as one JSON object. The first
    occurrence of every ``instance_id`` is kept; later duplicates are reported
    with a warning on stdout and ignored. After reading, the file at
    ``res_file_path`` is overwritten with the kept instances, one JSON object
    per line, ordered by ``instance_id``.

    Args:
        res_file_path: Path to the JSONL results file. The file is modified
            in place.

    Returns:
        A ``(passed, failed, costs)`` tuple. ``passed`` and ``failed`` hold a
        summary dict for each instance whose ``metrics['success']`` is truthy
        or falsy respectively; ``costs`` holds ``metrics['accumulated_cost']``
        for every kept instance, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the fields read here.
    """
    passed = []
    failed = []
    costs = []
    instance_ids = set()
    instances = []

    with open(res_file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record and
                # would make json.loads raise.
                continue
            data = json.loads(line)
            success = data['metrics']['success']
            instance_id = data['instance_id']
            if instance_id in instance_ids:
                print(f'WARNING: Duplicate instance_id found: {instance_id}')
                continue
            instance_ids.add(instance_id)
            instances.append(data)

            accumulated_cost = data['metrics']['accumulated_cost']
            record = {
                'instance_id': instance_id,
                'repo': data['repo'],
                'instruction': data['instruction'],
                'eval_script': data['eval_script'],
                'eval_exit_code': data['eval_exit_code'],
                'eval_output': data['eval_output'],
                'accumulated_cost': accumulated_cost,
            }
            (passed if success else failed).append(record)
            costs.append(accumulated_cost)

    # Persist the de-duplicated instances back to the same file, ordered by
    # instance_id.
    instances.sort(key=lambda x: x['instance_id'])
    with open(res_file_path, 'w') as file:
        for instance in instances:
            file.write(json.dumps(instance) + '\n')

    return passed, failed, costs
if __name__ == '__main__':
    # Command-line entry point: expects exactly one argument, the path to the
    # JSONL results file, then prints the passed/failed instances and totals.
    if len(sys.argv) != 2:
        print(
            'Usage: poetry run python summarise_results.py <path_to_output_jsonl_file>'
        )
        sys.exit(1)

    json_file_path = sys.argv[1]
    passed_tests, failed_tests, costs = extract_test_results(json_file_path)

    # Guard both ratios: a results file with no records would otherwise
    # raise ZeroDivisionError instead of printing an (empty) summary.
    total_tests = len(passed_tests) + len(failed_tests)
    success_rate = len(passed_tests) / total_tests if total_tests else 0.0
    average_cost = sum(costs) / len(costs) if costs else 0.0

    print('PASSED TESTS:')
    pprint.pprint(passed_tests)
    print('FAILED TESTS:')
    pprint.pprint(failed_tests)
    print(
        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, success rate = {success_rate}, average cost = {average_cost}'
    )