File size: 6,503 Bytes
d26280a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
from pathlib import Path

from gpt_engineer.benchmark.benchmarks.gpteng.eval_tools import (
    check_evaluation_component,
)
from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
from gpt_engineer.core.chat_to_files import chat_to_files_dict

# Declarative list of benchmark cases; each dict is turned into a Task by
# eval_to_task() further down in this module.
#
# Keys read by eval_to_task():
#   "name"                - passed through as the task name.
#   "code_blob"           - path of a text file whose contents are parsed with
#                           chat_to_files_dict() to form the task's initial code.
#                           The path is used as written, so it is resolved
#                           against the current working directory.
#   "improve_code_prompt" - the task prompt ("code_prompt" is used instead when
#                           this key is absent).
#   "expected_results"    - list of check specs; each spec is handed unchanged
#                           to check_evaluation_component(), and its "type" is
#                           also used to build the assertion's name.
# "project_root" is carried in the data but is not read anywhere in this module.
evaluations = [
    # Snake game: resize the grid and verify it both textually and via the
    # "run_code_*" checks.
    {
        "name": "simple_code_modify",
        "project_root": "projects/snake_game_eval",
        "code_blob": "gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/snake_game_files.txt",
        "improve_code_prompt": "The grid is currently 10x10, change the grid to be 42x42.",
        "expected_results": [
            {
                "type": "assert_exists_in_source_code",
                "source_file": "grid.py",
                "existing_string": "width=42",
            },
            {
                "type": "assert_exists_in_source_code",
                "source_file": "grid.py",
                "existing_string": "height=42",
            },
            {
                "type": "run_code_class_has_property",
                "language": "python",
                "source_file": "grid.py",
                "class_name": "Grid",
                "property_name": "height",
            },
            {
                "type": "run_code_class_has_property",
                "language": "python",
                "source_file": "grid.py",
                "class_name": "Grid",
                "property_name": "width",
            },
            {
                "type": "run_code_class_has_property_w_value",
                "language": "python",
                "source_file": "grid.py",
                "class_name": "Grid",
                "property_name": "height",
                "expected_value": 42,
            },
            {
                "type": "run_code_class_has_property_w_value",
                "language": "python",
                "source_file": "grid.py",
                "class_name": "Grid",
                "property_name": "width",
                "expected_value": 42,
            },
        ],
    },
    # Web to-do list: restyle the form via CSS.
    {
        "name": "modify_web_app_appearance",
        "project_root": "projects/web_todo_list",
        "code_blob": "gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/web_todo_files.txt",
        "improve_code_prompt": "Fix the margins around the form to be 45px, and make the background color orange.",
        "expected_results": [
            {
                "type": "assert_exists_in_source_code",
                "source_file": "styles.css",
                # NOTE(review): "\\n" is a literal backslash followed by "n",
                # not a newline character - confirm this is the form that
                # check_evaluation_component() expects for this check.
                "existing_string": "#task-form {\\n    margin: 45px;",
            },
            {
                "type": "assert_exists_in_source_code",
                "source_file": "styles.css",
                "existing_string": "background-color: orange;",
            },
        ],
    },
    # Snake game again: add a start-up delay and look for the sleep call.
    {
        "name": "modify_functionality",
        "project_root": "projects/snake_game_eval",
        "code_blob": "gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/snake_game_files.txt",
        "improve_code_prompt": "Add a 2 second delay before the game starts.",
        "expected_results": [
            {
                "type": "assert_exists_in_source_code",
                "source_file": "game.py",
                "existing_string": "time.sleep(2)",
            }
        ],
    },
]

# Disabled: the evaluations below rely on execution-based checks (the
# "check_executable_*" types) that used to exist and are no longer supported.
# evaluations = [
#     {
#         "name": "currency_converter",
#         "project_root": "projects/currency_converter",
#         "code_prompt": "Build a currency converter CLI tool in Python using an API for exchange rates.  The currency converter should be a python program named currency.py with three required arguments: base currency symbol, target currency symbol and base currency amount.  The currency converter will convert the amount in base currency amount to the target currency.  The output of the program should only be the amount of target currency.  For example the following command: `python currency.py USD CNY 1` should return a number like 7.5.",
#         "expected_results": [
#             {
#                 "type": "check_executable_exits_normally",
#                 "executable_name": "python currency.py",
#                 "executable_arguments": "USD CAD 10"
#             },
#             {
#                 "type": "check_executable_satisfies_function",
#                 "executable_name": "python currency.py",
#                 "executable_arguments": "USD CAD 10",
#                 "output_satisfies": "tf = lambda a : a.replace('.', '').isnumeric()"
#             }
#         ]
#     },
#     {
#         "name": "password_gen",
#         "project_root": "projects/password_gen_eval",
#         "code_prompt": "Create a password generator CLI tool in Python that generates strong, random passwords based on user-specified criteria, such as length and character types (letters, numbers, symbols).  The password generator should be a python program named passwordgenerator.py with two arguments: length, and character types.  The character types argument can be one or more of the following: l for lowercase, u for uppercase, d for digits, and s for symbols.",
#         "expected_results": [
#             {
#                 "type": "check_executable_exits_normally",
#                 "executable_name": "python passwordgenerator.py",
#                 "executable_arguments": "10 d"
#             },
#             {
#                 "type": "check_executable_satisfies_function",
#                 "executable_name": "python passwordgenerator.py",
#                 "executable_arguments": "10 d",
#                 "output_satisfies": "tf = lambda a : len(a) == 10"
#             }
#         ]
#     }
# ]
#


def expect_to_assertion(expected_result):
    """Wrap a single expected-result spec in a callable assertion.

    Args:
        expected_result: One entry of a case's ``"expected_results"`` list.

    Returns:
        A function that accepts an ``Assertable``, passes ``expected_result``
        and the assertable's ``files`` to ``check_evaluation_component`` and
        returns that call's result.
    """

    def _run_check(assertable: Assertable):
        produced_files = assertable.files
        return check_evaluation_component(expected_result, produced_files)

    return _run_check


def eval_to_task(case):
    """Convert one evaluation case dict into a ``Task``.

    Args:
        case: Mapping with ``"name"``, ``"code_blob"``, ``"expected_results"``
            and either ``"improve_code_prompt"`` or ``"code_prompt"``.

    Returns:
        A ``Task`` whose initial code is parsed from the file named by
        ``case["code_blob"]`` and whose assertions are keyed
        ``"<type>_<index>"``, one per expected result.

    Raises:
        KeyError: If a required key is missing from ``case``.
        OSError: If the code blob file cannot be read.
    """
    # "improve_code_prompt" takes precedence; "code_prompt" is the fallback.
    prompt = (
        case["improve_code_prompt"]
        if "improve_code_prompt" in case
        else case["code_prompt"]
    )

    task_name = case["name"]
    blob_text = Path(case["code_blob"]).read_text()
    starting_files = chat_to_files_dict(blob_text)

    # The index suffix keeps keys unique when a check type repeats in a case.
    checks = {}
    for index, expected in enumerate(case["expected_results"]):
        checks[f"{expected['type']}_{index}"] = expect_to_assertion(expected)

    return Task(
        name=task_name,
        initial_code=starting_files,
        prompt=prompt,
        command=None,
        assertions=checks,
    )


def load_gpteng():
    """Assemble the ``gpte_eval`` benchmark.

    Returns:
        A ``Benchmark`` named ``"gpte_eval"`` holding one task per entry of
        the module-level ``evaluations`` list, in list order.
    """
    tasks = list(map(eval_to_task, evaluations))
    return Benchmark(name="gpte_eval", tasks=tasks)