orby-agent-v0 (#8)
Browse files
- Add results from Orby Agent (e3f1cee9a629f47df2ced6010851da27592c34dd)
- Update README with eval details (a17c3c86b7692201cb86b06bd0dbf9fce5b20b13)
- Update links to research blog (0d9ae7c0eb6d920a8cd892beaefbe3021354ff76)
Co-authored-by: Gang Li <[email protected]>
- results/OrbyAgent-ActIO-72b/README.md +7 -0
- results/OrbyAgent-ActIO-72b/miniwob.json +16 -0
- results/OrbyAgent-ActIO-72b/webarena.json +16 -0
- results/OrbyAgent-Claude-3.5-Sonnet/README.md +7 -0
- results/OrbyAgent-Claude-3.5-Sonnet/miniwob.json +16 -0
- results/OrbyAgent-Claude-3.5-Sonnet/webarena.json +16 -0
results/OrbyAgent-ActIO-72b/README.md
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### OrbyAgent-ActIO-72b
|
2 |
+
|
3 |
+
This agent is developed by [Orby AI](https://www.orby.ai/).
|
4 |
+
|
5 |
+
The agent does not use any benchmark-specific information in its prompts. For the WebArena benchmark, we use the original evaluator and task definitions for a fair comparison.
|
6 |
+
|
7 |
+
It uses the 72B-parameter ActIO model as its backend, with both screenshots and HTML as inputs. More details can be found in our [research blog](https://www.orby.ai/resources/elevating-automation-orby-ais-generic-agent-framework-and-self-adaptive-interface-learning-technique).
|
results/OrbyAgent-ActIO-72b/miniwob.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "OrbyAgent-ActIO-72b",
|
4 |
+
"study_id": "orby-agent-v0-actio-v0-miniwob",
|
5 |
+
"benchmark": "MiniWoB",
|
6 |
+
"score": 64.2,
|
7 |
+
"std_err": 1.4,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-02-21 15:03:35"
|
15 |
+
}
|
16 |
+
]
|
results/OrbyAgent-ActIO-72b/webarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "OrbyAgent-ActIO-72b",
|
4 |
+
"study_id": "b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae",
|
5 |
+
"benchmark": "WebArena",
|
6 |
+
"score": 34.7,
|
7 |
+
"std_err": 0.25,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "Use original WebArena eval protocol and task definitions",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-02-21 15:05:12"
|
15 |
+
}
|
16 |
+
]
|
results/OrbyAgent-Claude-3.5-Sonnet/README.md
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### OrbyAgent-Claude-3.5-Sonnet
|
2 |
+
|
3 |
+
This agent is developed by [Orby AI](https://www.orby.ai/).
|
4 |
+
|
5 |
+
The agent does not use any benchmark-specific information in its prompts. For the WebArena benchmark, we use the original evaluator and task definitions for a fair comparison.
|
6 |
+
|
7 |
+
It uses Claude-3.5-sonnet-20241022 as its backend, with both screenshots and HTML as inputs. More details can be found in our [research blog](https://www.orby.ai/resources/elevating-automation-orby-ais-generic-agent-framework-and-self-adaptive-interface-learning-technique).
|
results/OrbyAgent-Claude-3.5-Sonnet/miniwob.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "OrbyAgent-Claude-3.5-Sonnet",
|
4 |
+
"study_id": "orby-agent-v0-claude-3.5-miniwob",
|
5 |
+
"benchmark": "MiniWoB",
|
6 |
+
"score": 74.9,
|
7 |
+
"std_err": 1.2,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-02-21 14:54:16"
|
15 |
+
}
|
16 |
+
]
|
results/OrbyAgent-Claude-3.5-Sonnet/webarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "OrbyAgent-Claude-3.5-Sonnet",
|
4 |
+
"study_id": "orby-agent-v0-claude-3.5-webarena",
|
5 |
+
"benchmark": "WebArena",
|
6 |
+
"score": 36.5,
|
7 |
+
"std_err": 0,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "Use original WebArena eval protocol and task definitions",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2025-02-21 15:00:22"
|
15 |
+
}
|
16 |
+
]
|