File size: 14,929 Bytes
d1ceb73 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 |
#!/usr/bin/env python
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .agents import BASE_PYTHON_TOOLS
from .python_interpreter import InterpreterError, evaluate
### Fake tools for test
def classifier(text, labels):
return f"This is the classification of {text} along {labels}."
def translator(text, src_lang, tgt_lang):
return f"This is the translation of {text} from {src_lang} to {tgt_lang}."
def speaker(text):
return f"This is actually a sound reading {text}."
def transcriber(audio):
if "sound" not in audio:
raise ValueError(f"`audio` ({audio}) is not a sound.")
return f"This is the transcribed text from {audio}."
def image_generator(prompt):
return f"This is actually an image representing {prompt}."
def image_captioner(image):
if "image" not in image:
raise ValueError(f"`image` ({image}) is not an image.")
return f"This is a description of {image}."
def image_transformer(image, prompt):
if "image" not in image:
raise ValueError(f"`image` ({image}) is not an image.")
return f"This is a transformation of {image} according to {prompt}."
def question_answerer(text, question):
return f"This is the answer to {question} from {text}."
def image_qa(image, question):
if "image" not in image:
raise ValueError(f"`image` ({image}) is not an image.")
return f"This is the answer to {question} from {image}."
def text_downloader(url):
return f"This is the content of {url}."
def summarizer(text):
return f"This is a summary of {text}."
def video_generator(prompt, seconds=2):
return f"A video of {prompt}"
def document_qa(image, question):
return f"This is the answer to {question} from the document {image}."
def image_segmenter(image, prompt):
return f"This is the mask of {prompt} in {image}"
TEST_TOOLS = {
"text_classifier": classifier,
"translator": translator,
"text_reader": speaker,
"summarizer": summarizer,
"transcriber": transcriber,
"image_generator": image_generator,
"image_captioner": image_captioner,
"image_transformer": image_transformer,
"text_qa": question_answerer,
"text_downloader": text_downloader,
"image_qa": image_qa,
"video_generator": video_generator,
"document_qa": document_qa,
"image_segmenter": image_segmenter,
}
class Problem:
"""
A class regrouping all the information to solve a problem on which we will evaluate agents.
Args:
task (`str` ou `list[str]`):
One or several descriptions of the task to perform. If a list, it should contain variations on the
phrasing, but for the same task.
inputs (`list[str]` or `dict[str, str]`):
The inputs that will be fed to the tools. For this testing environment, only strings are accepted as
values. Pass along a dictionary when you want to specify the values of each inputs, or just the list of
inputs expected (the value used will be `<<input_name>>` in this case).
answer (`str` or `list[str]`):
The theoretical answer (or list of possible valid answers) to the problem, as code.
"""
def __init__(self, task, inputs, answer):
self.task = task
self.inputs = inputs
self.answer = answer
### The list of problems the agent will be evaluated on.
EVALUATION_TASKS = [
Problem(
task=[
"Is the following `text` (in Spanish) positive or negative?",
"Is the text in the variable `text` (in Spanish) positive or negative?",
"Translate the following `text` from Spanish to English then tell me if its positive or negative.",
],
inputs=["text"],
answer="""text_classifier(translator(text, src_lang="Spanish", tgt_lang="English"), labels=["positive", "negative"])""",
),
Problem(
task=[
"Tell me out loud what the `image` contains.",
"Describe the following `image` out loud.",
"Find what is in the picture stored in `image` then read it out loud.",
],
inputs=["image"],
answer=[
"text_reader(image_captioner(image))",
"text_reader(image_qa(image, question='What is in the image?'))",
],
),
Problem(
task=[
"Generate an image from the text given in `text_input`. Then transform it according to the text in `prompt`.",
"Use the following `text_input` to generate an image, then transform it by using the text in `prompt`.",
],
inputs=["text_input", "prompt"],
answer="image_transformer(image_generator(text_input), prompt)",
),
Problem(
task=[
"Download the content of `url`, summarize it then generate an image from its content.",
"Use a summary of the web page at `url` to generate an image.",
"Summarize the content of the web page at `url`, and use the result to generate an image.",
],
inputs=["url"],
answer="image_generator(summarizer(text_downloader(url)))",
),
Problem(
task=[
"Transform the following `image` using the prompt in `text`. The prompt is in Spanish.",
"Use the text prompt in `text` (in Spanish) to transform the following `image`.",
"Translate the `text` from Spanish to English then use it to transform the picture in `image`.",
],
inputs=["text", "image"],
answer="image_transformer(image, translator(text, src_lang='Spanish', tgt_lang='English'))",
),
Problem(
task=[
"Download the content of `url`, summarize it then read it out loud to me.",
"Read me a summary of the web page at `url`.",
],
inputs=["url"],
answer="text_reader(summarizer(text_downloader(url)))",
),
Problem(
task=[
"Generate an image from the text given in `text_input`.",
],
inputs=["text_input"],
answer="image_generator(text_input)",
),
Problem(
task=[
"Replace the beaver in the `image` by the `prompt`.",
"Transform the `image` so that it contains the `prompt`.",
"Use `prompt` to transform this `image`.",
],
inputs=["image", "prompt"],
answer="image_transformer(image, prompt)",
),
Problem(
task=[
"Provide me the summary of the `text`, then read it to me before transcribing it and translating it in French.",
"Summarize `text`, read it out loud then transcribe the audio and translate it in French.",
"Read me a summary of the `text` out loud. Transcribe this and translate it in French.",
],
inputs=["text"],
answer="translator(transcriber(text_reader(summarizer(text))), src_lang='English', tgt_lang='French')",
),
Problem(
task=["Generate a video of the `prompt`", "Animate a `prompt`", "Make me a short video using `prompt`."],
inputs={"prompt": "A lobster swimming"},
answer="video_generator('A lobster swimming')",
),
Problem(
task=[
"Download the following file `url`, summarize it in a few words and generate a video from it."
"Fetch the file at this `url`, summarize it, and create an animation out of it."
],
inputs=["url"],
answer="video_generator(summarizer(text_downloader(url)))",
),
]
def get_theoretical_tools(agent_answer, theoretical_answer, code_answer):
if not isinstance(theoretical_answer, list):
return {name for name in TEST_TOOLS if name in code_answer}
if isinstance(agent_answer, dict):
for one_answer, one_code in zip(theoretical_answer, code_answer):
if one_answer in agent_answer.values():
return {name for name in TEST_TOOLS if name in one_code}
for one_answer, one_code in zip(theoretical_answer, code_answer):
if agent_answer == one_answer:
return {name for name in TEST_TOOLS if name in one_code}
return {name for name in TEST_TOOLS if name in code_answer[0]}
def evaluate_code(code, inputs=None, state=None, verbose=False, return_interpretor_error=False):
tools = BASE_PYTHON_TOOLS.copy()
for name, tool in TEST_TOOLS.items():
if name not in code:
continue
tools[name] = tool
if isinstance(inputs, dict):
inputs = inputs.copy()
elif inputs is not None:
inputs = {inp: f"<<{inp}>>" for inp in inputs}
if state is not None:
state.update(inputs)
else:
state = inputs
try:
return evaluate(code, tools, state)
except InterpreterError as e:
return str(e)
except Exception as e:
if verbose:
print(e)
return None
def score_code(agent_answer, theoretical_answer, verbose: bool = False):
if verbose:
print(agent_answer, theoretical_answer)
theoretical_answer = theoretical_answer if isinstance(theoretical_answer, list) else [theoretical_answer]
if agent_answer in theoretical_answer:
if verbose:
print("Perfect!")
return 1
elif isinstance(agent_answer, dict) and any(v in theoretical_answer for v in agent_answer.values()):
if verbose:
print("Almsot perfect, result in state!")
return 0.75
else:
if verbose:
print("Result is not the right one but code executed.")
return 0.3
def evaluate_one_result(code, agent_answer, theoretical_answer, answer, verbose=False):
tools_in_code = {name for name in TEST_TOOLS if f"`{name}`" in code}
theoretical_tools = get_theoretical_tools(agent_answer, theoretical_answer, answer)
if tools_in_code == theoretical_tools:
tool_selection_score = 1.0
tool_selection_errors = None
else:
missing_tools = len(theoretical_tools - tools_in_code)
unexpected_tools = len(tools_in_code - theoretical_tools)
tool_selection_score = max(0, 1.0 - 0.25 * missing_tools - 0.25 * unexpected_tools)
tool_selection_errors = {
"selected_tools": tools_in_code,
"theoretical_tools": theoretical_tools,
}
tools_in_code = {name for name in TEST_TOOLS if name in code}
if tools_in_code == theoretical_tools:
tool_used_score = 1.0
tool_used_errors = None
else:
missing_tools = len(theoretical_tools - tools_in_code)
unexpected_tools = len(tools_in_code - theoretical_tools)
tool_used_score = max(0, 1.0 - 0.25 * missing_tools - 0.25 * unexpected_tools)
tool_used_errors = {
"selected_tools": tools_in_code,
"theoretical_tools": theoretical_tools,
}
score = score_code(agent_answer, theoretical_answer, verbose=verbose)
if score < 1.0:
code_errors = {
"code_produced": code,
"evaluation": agent_answer,
"theoretical_answer": theoretical_answer,
}
else:
code_errors = None
return (tool_selection_score, tool_used_score, score), (tool_selection_errors, tool_used_errors, code_errors)
def evaluate_agent(agent, batch_size=8, verbose=False, return_errors=False):
"""
Evaluates a new agent on all `EVALUATION_TASKS`.
Example:
```py
agent = NewOpenAiAgent(model="text-davinci-003", api_key=your_api_key)
bads = new_evaluate_agent(agent)
for bad in bads:
print(bad)
```
"""
# Sanity check
agent_tools = set(agent.toolbox.keys())
if agent_tools != set(TEST_TOOLS):
missing_tools = set(TEST_TOOLS) - agent_tools
unexpected_tools = set(agent_tools) - TEST_TOOLS
raise ValueError(
f"Fix the test tools in the evaluate_agent module. Tools mising: {missing_tools}. Extra tools: {unexpected_tools}."
)
eval_tasks = []
eval_idx = []
for idx, pb in enumerate(EVALUATION_TASKS):
if isinstance(pb.task, list):
eval_tasks.extend(pb.task)
eval_idx.extend([idx] * len(pb.task))
else:
eval_tasks.append(pb.task)
eval_idx.append(idx)
tool_selection_score = 0
tool_used_score = 0
code_score = 0
if return_errors:
tool_selection_errors = {}
tool_used_errors = {}
code_errors = {}
for start_idx in range(0, len(eval_tasks), batch_size):
end_idx = min(start_idx + batch_size, len(eval_tasks))
batch_tasks = eval_tasks[start_idx:end_idx]
results = [agent.run(task, return_generated_code=True) for task in batch_tasks]
for idx, result in enumerate(results):
problem = EVALUATION_TASKS[eval_idx[start_idx + idx]]
if verbose:
print(f"====Task {start_idx + idx}====\n{batch_tasks[idx]}\n")
code = agent.extract_action(result, split_token="Answer:")
# Evaluate agent answer and code answer
agent_answer = evaluate_code(code, problem.inputs, verbose=verbose)
if isinstance(problem.answer, list):
theoretical_answer = [evaluate_code(answer, problem.inputs) for answer in problem.answer]
else:
theoretical_answer = evaluate_code(problem.answer, problem.inputs)
scores, errors = evaluate_one_result(
code, agent_answer, theoretical_answer, problem.answer, verbose=verbose
)
tool_selection_score += scores[0]
tool_used_score += scores[1]
code_score += scores[2]
if return_errors:
if errors[0] is not None:
tool_selection_errors[batch_tasks[idx]] = errors[0]
if errors[1] is not None:
tool_used_errors[batch_tasks[idx]] = errors[1]
if errors[2] is not None:
code_errors[batch_tasks[idx]] = errors[2]
scores = {
"tool selection score": 100 * (tool_selection_score / len(eval_tasks)),
"tool used score": 100 * (tool_used_score / len(eval_tasks)),
"code score": 100 * (code_score / len(eval_tasks)),
}
if return_errors:
return scores, tool_selection_errors, tool_used_errors, code_errors
else:
return scores
|