"""CLI for running a gpt-engineer agent against one or more benchmarks."""
import importlib

from typing import Annotated, Optional

import typer

from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache

from gpt_engineer.benchmark.benchmarks.load import get_benchmark
from gpt_engineer.benchmark.run import print_results, run


def get_agent(path):
    """Import the module at `path` and return its `default_config_agent()`."""
    # "agents/my_agent.py" -> "agents.my_agent", the dotted path importlib expects
    agent_module = importlib.import_module(path.replace("/", ".").replace(".py", ""))
    return agent_module.default_config_agent()
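
# The file passed as `path_to_agent` only needs to expose a zero-argument
# `default_config_agent` function. A minimal sketch of such a file, where
# `MyAgent` stands in for whatever agent type `run` accepts (hypothetical,
# not part of this repo):
#
#     # agents/my_agent.py
#     from somewhere import MyAgent  # hypothetical import
#
#     def default_config_agent():
#         return MyAgent()
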
def main(
    path_to_agent: Annotated[
        str,
        typer.Argument(
            help="Python file that contains a function called 'default_config_agent'"
        ),
    ],
    benchmarks: Annotated[
        str, typer.Argument(help="benchmark name(s) separated by ','")
    ],
    task_name: Annotated[
        Optional[str], typer.Argument(help="optional task name in benchmark")
    ] = None,
    verbose: Annotated[
        bool, typer.Option(help="print results for each task", show_default=False)
    ] = False,
):
    """Run every benchmark in the comma-separated `benchmarks` list."""
    # Cache LLM responses in a local SQLite file so repeated runs reuse results.
    set_llm_cache(SQLiteCache(database_path=".langchain.db"))

    benchmarks = benchmarks.split(",")
    for benchmark_name in benchmarks:
        benchmark = get_benchmark(benchmark_name)
        # Construct a fresh agent for each benchmark.
        agent = get_agent(path_to_agent)

        results = run(agent, benchmark, task_name, verbose=verbose)
        print(
            f"\n--- Results for agent {path_to_agent}, benchmark: {benchmark_name} ---"
        )
        print_results(results)
        print()


if __name__ == "__main__":
    typer.run(main)
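
# Example invocation (agent path and benchmark names are illustrative):
#
#     python -m gpt_engineer.benchmark agents/my_agent.py bench_a,bench_b --verbose
#
# This assumes the module lives at gpt_engineer/benchmark/__main__.py;
# otherwise, run this file directly with the same arguments.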