File size: 6,503 Bytes
d26280a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
from pathlib import Path
from gpt_engineer.benchmark.benchmarks.gpteng.eval_tools import (
check_evaluation_component,
)
from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
from gpt_engineer.core.chat_to_files import chat_to_files_dict
# Table of benchmark cases. Each entry is a plain dict; eval_to_task() reads
# "name", "code_blob", the prompt key and "expected_results" from it.
# Every item of "expected_results" is a spec dict whose "type" names the check
# and whose remaining keys are the arguments of that check.
evaluations = [
    # Case 1: resize the snake game grid.
    {
        "name": "simple_code_modify",
        "project_root": "projects/snake_game_eval",
        "code_blob": "gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/snake_game_files.txt",
        "improve_code_prompt": "The grid is currently 10x10, change the grid to be 42x42.",
        "expected_results": [
            # Textual checks on grid.py.
            {
                "type": "assert_exists_in_source_code",
                "source_file": "grid.py",
                "existing_string": "width=42",
            },
            {
                "type": "assert_exists_in_source_code",
                "source_file": "grid.py",
                "existing_string": "height=42",
            },
            # Checks that the Grid class exposes both properties.
            {
                "type": "run_code_class_has_property",
                "language": "python",
                "source_file": "grid.py",
                "class_name": "Grid",
                "property_name": "height",
            },
            {
                "type": "run_code_class_has_property",
                "language": "python",
                "source_file": "grid.py",
                "class_name": "Grid",
                "property_name": "width",
            },
            # Checks that both properties carry the requested value.
            {
                "type": "run_code_class_has_property_w_value",
                "language": "python",
                "source_file": "grid.py",
                "class_name": "Grid",
                "property_name": "height",
                "expected_value": 42,
            },
            {
                "type": "run_code_class_has_property_w_value",
                "language": "python",
                "source_file": "grid.py",
                "class_name": "Grid",
                "property_name": "width",
                "expected_value": 42,
            },
        ],
    },
    # Case 2: restyle the web todo list.
    {
        "name": "modify_web_app_appearance",
        "project_root": "projects/web_todo_list",
        "code_blob": "gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/web_todo_files.txt",
        "improve_code_prompt": "Fix the margins around the form to be 45px, and make the background color orange.",
        "expected_results": [
            # NOTE(review): the string below contains a literal backslash
            # followed by "n", not a newline character - confirm that this is
            # what the checker is meant to look for.
            {
                "type": "assert_exists_in_source_code",
                "source_file": "styles.css",
                "existing_string": "#task-form {\\n margin: 45px;",
            },
            {
                "type": "assert_exists_in_source_code",
                "source_file": "styles.css",
                "existing_string": "background-color: orange;",
            },
        ],
    },
    # Case 3: add a start-up delay to the snake game.
    {
        "name": "modify_functionality",
        "project_root": "projects/snake_game_eval",
        "code_blob": "gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/snake_game_files.txt",
        "improve_code_prompt": "Add a 2 second delay before the game starts.",
        "expected_results": [
            {
                "type": "assert_exists_in_source_code",
                "source_file": "game.py",
                "existing_string": "time.sleep(2)",
            }
        ],
    },
]
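# NOTE: The evaluations below depend on execution paths that used to exist but
# are no longer supported; they are kept here, commented out, for reference only.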
# evaluations = [
# {
# "name": "currency_converter",
# "project_root": "projects/currency_converter",
# "code_prompt": "Build a currency converter CLI tool in Python using an API for exchange rates. The currency converter should be a python program named currency.py with three required arguments: base currency symbol, target currency symbol and base currency amount. The currency converter will convert the amount in base currency amount to the target currency. The output of the program should only be the amount of target currency. For example the following command: `python currency.py USD CNY 1` should return a number like 7.5.",
# "expected_results": [
# {
# "type": "check_executable_exits_normally",
# "executable_name": "python currency.py",
# "executable_arguments": "USD CAD 10"
# },
# {
# "type": "check_executable_satisfies_function",
# "executable_name": "python currency.py",
# "executable_arguments": "USD CAD 10",
# "output_satisfies": "tf = lambda a : a.replace('.', '').isnumeric()"
# }
# ]
# },
# {
# "name": "password_gen",
# "project_root": "projects/password_gen_eval",
# "code_prompt": "Create a password generator CLI tool in Python that generates strong, random passwords based on user-specified criteria, such as length and character types (letters, numbers, symbols). The password generator should be a python program named passwordgenerator.py with two arguments: length, and character types. The character types argument can be one or more of the following: l for lowercase, u for uppercase, d for digits, and s for symbols.",
# "expected_results": [
# {
# "type": "check_executable_exits_normally",
# "executable_name": "python passwordgenerator.py",
# "executable_arguments": "10 d"
# },
# {
# "type": "check_executable_satisfies_function",
# "executable_name": "python passwordgenerator.py",
# "executable_arguments": "10 d",
# "output_satisfies": "tf = lambda a : len(a) == 10"
# }
# ]
# }
# ]
#
def expect_to_assertion(expected_result):
    """Turn one expected-result spec into a callable assertion.

    Args:
        expected_result: The spec that is forwarded, unchanged, as the first
            argument of ``check_evaluation_component``.

    Returns:
        A one-argument function. Given an ``Assertable`` it returns whatever
        ``check_evaluation_component`` returns for ``expected_result`` and the
        assertable's ``files`` attribute.
    """

    def assertion(assertable: Assertable):
        # Only the files of the assertable are handed to the checker.
        produced_files = assertable.files
        return check_evaluation_component(expected_result, produced_files)

    return assertion
def eval_to_task(case):
    """Convert one evaluation dict into a ``Task``.

    Args:
        case: A mapping with the keys ``"name"``, ``"code_blob"`` (path of a
            text file holding the initial code), ``"expected_results"`` (a
            sequence of spec dicts that each have a ``"type"`` key) and either
            ``"improve_code_prompt"`` or ``"code_prompt"``.

    Returns:
        A ``Task`` whose initial code is parsed from the code blob file and
        whose assertions are keyed ``"<type>_<index>"``.

    Raises:
        KeyError: If a required key is missing from ``case``.
        OSError: If the code blob file cannot be read.
    """
    # "improve_code_prompt" takes precedence; "code_prompt" is the fallback.
    prompt = (
        case["improve_code_prompt"]
        if "improve_code_prompt" in case
        else case["code_prompt"]
    )

    # Decode explicitly as UTF-8 so the blob is read the same way on every
    # platform instead of depending on the locale's default encoding.
    blob_text = Path(case["code_blob"]).read_text(encoding="utf-8")

    # The index suffix keeps keys unique when several checks share a type.
    assertions = {
        f"{e['type']}_{i}": expect_to_assertion(e)
        for i, e in enumerate(case["expected_results"])
    }

    return Task(
        name=case["name"],
        initial_code=chat_to_files_dict(blob_text),
        prompt=prompt,
        command=None,
        assertions=assertions,
    )
def load_gpteng():
    """Build the ``gpte_eval`` benchmark.

    Returns:
        A ``Benchmark`` named ``"gpte_eval"`` with one task per entry of the
        module-level ``evaluations`` list, in the same order.
    """
    tasks = []
    for case in evaluations:
        tasks.append(eval_to_task(case))
    return Benchmark(name="gpte_eval", tasks=tasks)
|