File size: 2,774 Bytes
246d201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import json
import pprint
import sys


def extract_test_results(
    res_file_path: str,
) -> tuple[list[dict], list[dict], list[float]]:
    """Summarise a JSONL results file and rewrite it de-duplicated and sorted.

    Each non-blank line of the file is parsed as one JSON record. Records
    whose ``instance_id`` was already seen are reported on stdout and dropped.
    After reading, the file at ``res_file_path`` is overwritten with the
    remaining records sorted by ``instance_id`` (one JSON object per line).

    Args:
        res_file_path: Path to the JSONL file to read and then rewrite.

    Returns:
        A ``(passed, failed, costs)`` tuple. ``passed`` and ``failed`` hold a
        summary dict per record, split on ``metrics.success``; ``costs`` holds
        ``metrics.accumulated_cost`` for every kept record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
    """
    passed = []
    failed = []
    costs = []
    instance_ids = set()
    instances = []
    with open(res_file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            data = json.loads(line)
            metrics = data['metrics']
            success = metrics['success']
            instance_id = data['instance_id']
            if instance_id in instance_ids:
                print(f'WARNING: Duplicate instance_id found: {instance_id}')
                continue
            instance_ids.add(instance_id)
            instances.append(data)
            summary = {
                'instance_id': instance_id,
                'repo': data['repo'],
                'instruction': data['instruction'],
                'eval_script': data['eval_script'],
                'eval_exit_code': data['eval_exit_code'],
                'eval_output': data['eval_output'],
                'accumulated_cost': metrics['accumulated_cost'],
            }
            (passed if success else failed).append(summary)
            costs.append(metrics['accumulated_cost'])

    # The read handle is closed before the same path is truncated for writing.
    instances.sort(key=lambda x: x['instance_id'])
    with open(res_file_path, 'w') as file:
        file.writelines(json.dumps(instance) + '\n' for instance in instances)
    return passed, failed, costs


if __name__ == '__main__':
    # Command-line entry point: summarise the JSONL file given as the sole
    # argument, printing the passed/failed records and aggregate statistics.
    if len(sys.argv) != 2:
        print(
            'Usage: poetry run python summarise_results.py <path_to_output_jsonl_file>'
        )
        sys.exit(1)
    json_file_path = sys.argv[1]
    passed_tests, failed_tests, costs = extract_test_results(json_file_path)
    total_tests = len(passed_tests) + len(failed_tests)
    # Guard both ratios: an empty results file would otherwise raise
    # ZeroDivisionError instead of printing a summary.
    success_rate = len(passed_tests) / total_tests if total_tests else 0.0
    average_cost = sum(costs) / len(costs) if costs else 0.0
    print('PASSED TESTS:')
    pprint.pprint(passed_tests)
    print('FAILED TESTS:')
    pprint.pprint(failed_tests)
    print(
        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, success rate = {success_rate}, average cost = {average_cost}'
    )