#!/usr/bin/env python
# coding=utf-8

# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .agents import BASE_PYTHON_TOOLS
from .python_interpreter import InterpreterError, evaluate


### Fake tools for test
def classifier(text, labels):
    return f"This is the classification of {text} along {labels}."


def translator(text, src_lang, tgt_lang):
    return f"This is the translation of {text} from {src_lang} to {tgt_lang}."


def speaker(text):
    return f"This is actually a sound reading {text}."


def transcriber(audio):
    if "sound" not in audio:
        raise ValueError(f"`audio` ({audio}) is not a sound.")
    return f"This is the transcribed text from {audio}."


def image_generator(prompt):
    return f"This is actually an image representing {prompt}."


def image_captioner(image):
    if "image" not in image:
        raise ValueError(f"`image` ({image}) is not an image.")
    return f"This is a description of {image}."


def image_transformer(image, prompt):
    if "image" not in image:
        raise ValueError(f"`image` ({image}) is not an image.")
    return f"This is a transformation of {image} according to {prompt}."


def question_answerer(text, question):
    return f"This is the answer to {question} from {text}."


def image_qa(image, question):
    if "image" not in image:
        raise ValueError(f"`image` ({image}) is not an image.")
    return f"This is the answer to {question} from {image}."


def text_downloader(url):
    return f"This is the content of {url}."


def summarizer(text):
    return f"This is a summary of {text}."


def video_generator(prompt, seconds=2):
    return f"A video of {prompt}"


def document_qa(image, question):
    return f"This is the answer to {question} from the document {image}."


def image_segmenter(image, prompt):
    return f"This is the mask of {prompt} in {image}"


TEST_TOOLS = {
    "text_classifier": classifier,
    "translator": translator,
    "text_reader": speaker,
    "summarizer": summarizer,
    "transcriber": transcriber,
    "image_generator": image_generator,
    "image_captioner": image_captioner,
    "image_transformer": image_transformer,
    "text_qa": question_answerer,
    "text_downloader": text_downloader,
    "image_qa": image_qa,
    "video_generator": video_generator,
    "document_qa": document_qa,
    "image_segmenter": image_segmenter,
}
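
# These fake tools return deterministic strings, so a composition such as
# `text_reader(summarizer(text_downloader(url)))` yields a predictable nested string. This is what lets
# `score_code` below compare an agent's answer to the theoretical answer by plain string equality.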


class Problem:
    """
    A class regrouping all the information to solve a problem on which we will evaluate agents.

    Args:
        task (`str` or `list[str]`):
            One or several descriptions of the task to perform. If a list, it should contain variations of the
            phrasing of the same task.
        inputs (`list[str]` or `dict[str, str]`):
            The inputs that will be fed to the tools. For this testing environment, only strings are accepted as
            values. Pass along a dictionary to specify the value of each input, or just the list of expected
            inputs (in which case the placeholder value `<<input_name>>` will be used).
        answer (`str` or `list[str]`):
            The theoretical answer (or list of possible valid answers) to the problem, as code.
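
    Example (an illustrative problem, not one of the actual `EVALUATION_TASKS`):

    ```py
    problem = Problem(
        task=["Translate the `text` to French."],
        inputs=["text"],
        answer="translator(text, src_lang='English', tgt_lang='French')",
    )
    ```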
    """

    def __init__(self, task, inputs, answer):
        self.task = task
        self.inputs = inputs
        self.answer = answer


### The list of problems the agent will be evaluated on.
EVALUATION_TASKS = [
    Problem(
        task=[
            "Is the following `text` (in Spanish) positive or negative?",
            "Is the text in the variable `text` (in Spanish) positive or negative?",
            "Translate the following `text` from Spanish to English then tell me if its positive or negative.",
        ],
        inputs=["text"],
        answer="""text_classifier(translator(text, src_lang="Spanish", tgt_lang="English"), labels=["positive", "negative"])""",
    ),
    Problem(
        task=[
            "Tell me out loud what the `image` contains.",
            "Describe the following `image` out loud.",
            "Find what is in the picture stored in `image` then read it out loud.",
        ],
        inputs=["image"],
        answer=[
            "text_reader(image_captioner(image))",
            "text_reader(image_qa(image, question='What is in the image?'))",
        ],
    ),
    Problem(
        task=[
            "Generate an image from the text given in `text_input`. Then transform it according to the text in `prompt`.",
            "Use the following `text_input` to generate an image, then transform it by using the text in `prompt`.",
        ],
        inputs=["text_input", "prompt"],
        answer="image_transformer(image_generator(text_input), prompt)",
    ),
    Problem(
        task=[
            "Download the content of `url`, summarize it then generate an image from its content.",
            "Use a summary of the web page at `url` to generate an image.",
            "Summarize the content of the web page at `url`, and use the result to generate an image.",
        ],
        inputs=["url"],
        answer="image_generator(summarizer(text_downloader(url)))",
    ),
    Problem(
        task=[
            "Transform the following `image` using the prompt in `text`. The prompt is in Spanish.",
            "Use the text prompt in `text` (in Spanish) to transform the following `image`.",
            "Translate the `text` from Spanish to English then use it to transform the picture in `image`.",
        ],
        inputs=["text", "image"],
        answer="image_transformer(image, translator(text, src_lang='Spanish', tgt_lang='English'))",
    ),
    Problem(
        task=[
            "Download the content of `url`, summarize it then read it out loud to me.",
            "Read me a summary of the web page at `url`.",
        ],
        inputs=["url"],
        answer="text_reader(summarizer(text_downloader(url)))",
    ),
    Problem(
        task=[
            "Generate an image from the text given in `text_input`.",
        ],
        inputs=["text_input"],
        answer="image_generator(text_input)",
    ),
    Problem(
        task=[
            "Replace the beaver in the `image` by the `prompt`.",
            "Transform the `image` so that it contains the `prompt`.",
            "Use `prompt` to transform this `image`.",
        ],
        inputs=["image", "prompt"],
        answer="image_transformer(image, prompt)",
    ),
    Problem(
        task=[
            "Provide me the summary of the `text`, then read it to me before transcribing it and translating it in French.",
            "Summarize `text`, read it out loud then transcribe the audio and translate it in French.",
            "Read me a summary of the `text` out loud. Transcribe this and translate it in French.",
        ],
        inputs=["text"],
        answer="translator(transcriber(text_reader(summarizer(text))), src_lang='English', tgt_lang='French')",
    ),
    Problem(
        task=["Generate a video of the `prompt`", "Animate a `prompt`", "Make me a short video using `prompt`."],
        inputs={"prompt": "A lobster swimming"},
        answer="video_generator('A lobster swimming')",
    ),
    Problem(
        task=[
            "Download the following file `url`, summarize it in a few words and generate a video from it."
            "Fetch the file at this `url`, summarize it, and create an animation out of it."
        ],
        inputs=["url"],
        answer="video_generator(summarizer(text_downloader(url)))",
    ),
]


def get_theoretical_tools(agent_answer, theoretical_answer, code_answer):
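    """
    Returns the set of test tools that the matching theoretical answer relies on.

    When `theoretical_answer` is a list of possible answers, the tools are read from the code variant in
    `code_answer` corresponding to the answer the agent produced (or found among the values of its state
    dict); when no variant matches, the first code variant is used as a fallback.
    """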
    if not isinstance(theoretical_answer, list):
        return {name for name in TEST_TOOLS if name in code_answer}

    if isinstance(agent_answer, dict):
        for one_answer, one_code in zip(theoretical_answer, code_answer):
            if one_answer in agent_answer.values():
                return {name for name in TEST_TOOLS if name in one_code}

    for one_answer, one_code in zip(theoretical_answer, code_answer):
        if agent_answer == one_answer:
            return {name for name in TEST_TOOLS if name in one_code}

    return {name for name in TEST_TOOLS if name in code_answer[0]}


def evaluate_code(code, inputs=None, state=None, verbose=False, return_interpreter_error=False):
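    """
    Evaluates `code` with `BASE_PYTHON_TOOLS` plus the fake test tools the code actually mentions.

    `inputs` may be a dict mapping input names to values, or a list of input names, in which case each
    input takes the placeholder value `<<input_name>>`. Returns the result of the evaluation, the error
    message of an `InterpreterError`, or `None` when any other exception is raised.
    """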
    tools = BASE_PYTHON_TOOLS.copy()
    for name, tool in TEST_TOOLS.items():
        if name not in code:
            continue
        tools[name] = tool

    if isinstance(inputs, dict):
        inputs = inputs.copy()
    elif inputs is not None:
        inputs = {inp: f"<<{inp}>>" for inp in inputs}

    if state is None:
        state = inputs
    elif inputs is not None:
        state.update(inputs)

    try:
        return evaluate(code, tools, state)
    except InterpreterError as e:
        return str(e)
    except Exception as e:
        if verbose:
            print(e)
        return None


def score_code(agent_answer, theoretical_answer, verbose: bool = False):
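    """
    Scores an agent answer against the theoretical answer(s): 1 for an exact match, 0.75 when a matching
    result is found among the values of a state dict, and 0.3 when the code executed but returned the
    wrong result.
    """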
    if verbose:
        print(agent_answer, theoretical_answer)
    theoretical_answer = theoretical_answer if isinstance(theoretical_answer, list) else [theoretical_answer]

    if agent_answer in theoretical_answer:
        if verbose:
            print("Perfect!")
        return 1
    elif isinstance(agent_answer, dict) and any(v in theoretical_answer for v in agent_answer.values()):
        if verbose:
            print("Almsot perfect, result in state!")
        return 0.75
    else:
        if verbose:
            print("Result is not the right one but code executed.")
        return 0.3


def evaluate_one_result(code, agent_answer, theoretical_answer, answer, verbose=False):
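    """
    Computes the three scores for one result: tool selection (tools mentioned in backticks in the agent
    output), tool usage (tools actually appearing in the code), and correctness of the final answer via
    `score_code`. Each tool score loses 0.25 per missing or unexpected tool. Returns the three scores
    along with the matching error reports (`None` where the score is perfect).
    """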
    tools_in_code = {name for name in TEST_TOOLS if f"`{name}`" in code}
    theoretical_tools = get_theoretical_tools(agent_answer, theoretical_answer, answer)
    if tools_in_code == theoretical_tools:
        tool_selection_score = 1.0
        tool_selection_errors = None
    else:
        missing_tools = len(theoretical_tools - tools_in_code)
        unexpected_tools = len(tools_in_code - theoretical_tools)
        tool_selection_score = max(0, 1.0 - 0.25 * missing_tools - 0.25 * unexpected_tools)

        tool_selection_errors = {
            "selected_tools": tools_in_code,
            "theoretical_tools": theoretical_tools,
        }

    tools_in_code = {name for name in TEST_TOOLS if name in code}
    if tools_in_code == theoretical_tools:
        tool_used_score = 1.0
        tool_used_errors = None
    else:
        missing_tools = len(theoretical_tools - tools_in_code)
        unexpected_tools = len(tools_in_code - theoretical_tools)
        tool_used_score = max(0, 1.0 - 0.25 * missing_tools - 0.25 * unexpected_tools)

        tool_used_errors = {
            "selected_tools": tools_in_code,
            "theoretical_tools": theoretical_tools,
        }

    score = score_code(agent_answer, theoretical_answer, verbose=verbose)
    if score < 1.0:
        code_errors = {
            "code_produced": code,
            "evaluation": agent_answer,
            "theoretical_answer": theoretical_answer,
        }
    else:
        code_errors = None

    return (tool_selection_score, tool_used_score, score), (tool_selection_errors, tool_used_errors, code_errors)


def evaluate_agent(agent, batch_size=8, verbose=False, return_errors=False):
    """
    Evaluates an agent on all `EVALUATION_TASKS`.

    Example:

    ```py
    agent = OpenAiAgent(model="text-davinci-003", api_key=your_api_key)
    scores = evaluate_agent(agent)
    print(scores)
    ```
    """
    # Sanity check
    agent_tools = set(agent.toolbox.keys())
    if agent_tools != set(TEST_TOOLS):
        missing_tools = set(TEST_TOOLS) - agent_tools
        unexpected_tools = agent_tools - set(TEST_TOOLS)
        raise ValueError(
            f"Fix the test tools in the evaluate_agent module. Tools missing: {missing_tools}. Extra tools: {unexpected_tools}."
        )

    eval_tasks = []
    eval_idx = []
    for idx, pb in enumerate(EVALUATION_TASKS):
        if isinstance(pb.task, list):
            eval_tasks.extend(pb.task)
            eval_idx.extend([idx] * len(pb.task))
        else:
            eval_tasks.append(pb.task)
            eval_idx.append(idx)

    tool_selection_score = 0
    tool_used_score = 0
    code_score = 0

    if return_errors:
        tool_selection_errors = {}
        tool_used_errors = {}
        code_errors = {}

    for start_idx in range(0, len(eval_tasks), batch_size):
        end_idx = min(start_idx + batch_size, len(eval_tasks))
        batch_tasks = eval_tasks[start_idx:end_idx]

        results = [agent.run(task, return_generated_code=True) for task in batch_tasks]

        for idx, result in enumerate(results):
            problem = EVALUATION_TASKS[eval_idx[start_idx + idx]]
            if verbose:
                print(f"====Task {start_idx + idx}====\n{batch_tasks[idx]}\n")
            code = agent.extract_action(result, split_token="Answer:")

            # Evaluate agent answer and code answer
            agent_answer = evaluate_code(code, problem.inputs, verbose=verbose)
            if isinstance(problem.answer, list):
                theoretical_answer = [evaluate_code(answer, problem.inputs) for answer in problem.answer]
            else:
                theoretical_answer = evaluate_code(problem.answer, problem.inputs)

            scores, errors = evaluate_one_result(
                code, agent_answer, theoretical_answer, problem.answer, verbose=verbose
            )

            tool_selection_score += scores[0]
            tool_used_score += scores[1]
            code_score += scores[2]

            if return_errors:
                if errors[0] is not None:
                    tool_selection_errors[batch_tasks[idx]] = errors[0]
                if errors[1] is not None:
                    tool_used_errors[batch_tasks[idx]] = errors[1]
                if errors[2] is not None:
                    code_errors[batch_tasks[idx]] = errors[2]

    scores = {
        "tool selection score": 100 * (tool_selection_score / len(eval_tasks)),
        "tool used score": 100 * (tool_used_score / len(eval_tasks)),
        "code score": 100 * (code_score / len(eval_tasks)),
    }

    if return_errors:
        return scores, tool_selection_errors, tool_used_errors, code_errors
    else:
        return scores