File size: 5,663 Bytes
51ff9e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/env bash

##################################################################################################
# Adapted from https://github.com/TheAgentCompany/TheAgentCompany/blob/main/evaluation/run_eval.sh
##################################################################################################

# Opt-in strict mode: when DEBUG is set to a non-empty value, abort the script
# on the first command that fails.
if [[ -n "${DEBUG-}" ]]; then
    set -e
fi

# Default settings. Each of these can be overridden from the command line.

# Config name of the agent LLM: config.toml is expected to contain a section
# named [llm.<AGENT_LLM_CONFIG>], e.g. [llm.agent].
AGENT_LLM_CONFIG='agent'

# Config name of the environment LLM (used by the NPCs and the LLM-based
# evaluators): config.toml is expected to contain a section named
# [llm.<ENV_LLM_CONFIG>], e.g. [llm.env].
ENV_LLM_CONFIG='env'

# Where trajectories and evaluation results are saved.
OUTPUTS_PATH='outputs'

# Hostname of the server hosting all the web services
# (RocketChat, ownCloud, GitLab and Plane).
SERVER_HOSTNAME='localhost'

# Version of the task images to use. A task without a published image for this
# version is skipped.
# 12/15/2024: kept for forward compatibility, in case new tasks are added
# after the 1.0.0 release.
VERSION='1.0.0'

# Parse command line arguments.
#
# Every supported option takes exactly one value and stores it in the global
# variable of the matching name:
#   --agent-llm-config  -> AGENT_LLM_CONFIG
#   --env-llm-config    -> ENV_LLM_CONFIG
#   --agent-config      -> AGENT_CONFIG
#   --outputs-path      -> OUTPUTS_PATH
#   --server-hostname   -> SERVER_HOSTNAME
#   --version           -> VERSION
#   --start-percentile  -> START_PERCENTILE
#   --end-percentile    -> END_PERCENTILE
#
# Exits with status 1 on an unknown option or on an option whose value is
# missing.
parse_args() {
    local target

    while [[ $# -gt 0 ]]; do
        case "$1" in
            --agent-llm-config) target=AGENT_LLM_CONFIG ;;
            --env-llm-config)   target=ENV_LLM_CONFIG ;;
            --agent-config)     target=AGENT_CONFIG ;;
            --outputs-path)     target=OUTPUTS_PATH ;;
            --server-hostname)  target=SERVER_HOSTNAME ;;
            --version)          target=VERSION ;;
            --start-percentile) target=START_PERCENTILE ;;
            --end-percentile)   target=END_PERCENTILE ;;
            *)
                echo "Unknown argument: $1"
                exit 1
                ;;
        esac

        # "shift 2" fails WITHOUT shifting when only one argument is left, which
        # would make this loop spin forever; reject a dangling option instead.
        if [[ $# -lt 2 ]]; then
            echo "Error: $1 requires a value"
            exit 1
        fi

        # Assign to the global variable named by $target without using eval.
        printf -v "$target" '%s' "$2"
        shift 2
    done
}

parse_args "$@"

# Print an absolute form of the path given as $1.
#
# A path that already starts with "/" is printed unchanged. Otherwise the
# parent directory is resolved with cd + pwd and the last path component is
# appended. When the parent directory cannot be entered (for example because
# it does not exist yet) the path is anchored at the current working directory
# instead of collapsing to "/<basename>".
to_absolute_path() {
    local path=$1 parent

    if [[ "$path" == /* ]]; then
        printf '%s\n' "$path"
        return 0
    fi

    # CDPATH is cleared so that cd neither jumps to an unrelated directory nor
    # prints the directory name into the captured output.
    if parent=$(CDPATH='' cd -- "$(dirname -- "$path")" 2>/dev/null && pwd); then
        printf '%s/%s\n' "$parent" "$(basename -- "$path")"
    else
        printf '%s/%s\n' "$PWD" "$path"
    fi
}

# Convert outputs_path to absolute path
OUTPUTS_PATH="$(to_absolute_path "$OUTPUTS_PATH")"

# Without an explicit range, cover the whole task list.
START_PERCENTILE="${START_PERCENTILE:-0}"   # 0 percentile (first line)
END_PERCENTILE="${END_PERCENTILE:-100}"     # 100 percentile (last line)

# Both bounds must consist of decimal digits only
for percentile in "$START_PERCENTILE" "$END_PERCENTILE"; do
    if [[ ! "$percentile" =~ ^[0-9]+$ ]]; then
        echo "Error: Percentiles must be integers"
        exit 1
    fi
done

# The range must not be empty
if [ "$END_PERCENTILE" -le "$START_PERCENTILE" ]; then
    echo "Error: Start percentile must be less than end percentile"
    exit 1
fi

# The range must stay within 0..100
if [ "$END_PERCENTILE" -gt 100 ] || [ "$START_PERCENTILE" -lt 0 ]; then
    echo "Error: Percentiles must be between 0 and 100"
    exit 1
fi

# Report the effective settings, one per line
printf '%s\n' \
    "Using agent LLM config: $AGENT_LLM_CONFIG" \
    "Using environment LLM config: $ENV_LLM_CONFIG" \
    "Outputs path: $OUTPUTS_PATH" \
    "Server hostname: $SERVER_HOSTNAME" \
    "Version: $VERSION" \
    "Start Percentile: $START_PERCENTILE" \
    "End Percentile: $END_PERCENTILE"

# Fetch the task list published for the selected release. Any stale copy is
# removed first so wget writes to tasks.md rather than to a numbered sibling.
echo "Downloading tasks.md..."
rm -f tasks.md
if ! wget "https://github.com/TheAgentCompany/TheAgentCompany/releases/download/${VERSION}/tasks.md"; then
    # Fail here with the real cause instead of a misleading task-count error
    echo "Error: Failed to download tasks.md for version ${VERSION}"
    exit 1
fi

# Count the task image lines. grep -c exits with status 1 when nothing matches
# (it still prints 0), so the status is ignored to keep "set -e" from aborting
# before the explicit check below can report the problem.
total_lines=$(grep -c "ghcr.io/theagentcompany" tasks.md || true)
if [ "$total_lines" -ne 175 ]; then
    echo "Error: Expected 175 tasks in tasks.md but found $total_lines lines"
    exit 1
fi

# Print floor(total * percentile / 100) using integer shell arithmetic.
#   $1 - total number of items
#   $2 - percentile, a non-negative decimal integer
# The 10# prefix forces base 10 so that a zero-padded percentile such as "08"
# is not rejected as an invalid octal literal.
percentile_offset() {
    local total=${1:-0} percentile=${2:-0}
    echo $(( total * 10#$percentile / 100 ))
}

# Calculate line numbers based on percentiles
start_line=$(( $(percentile_offset "$total_lines" "$START_PERCENTILE") + 1 ))
end_line=$(percentile_offset "$total_lines" "$END_PERCENTILE")

echo "Using tasks No. $start_line to $end_line (inclusive) out of 1-175 tasks"

# Create a temporary file with just the desired range. The range is cut from
# the task image lines only (the same lines that were counted), so the line
# numbers computed above always refer to task images.
temp_file="tasks_${START_PERCENTILE}_${END_PERCENTILE}.md"
grep "ghcr.io/theagentcompany" tasks.md \
    | sed -n "${start_line},${end_line}p" > "$temp_file"

# Fill the global array INFER_COMMAND with the run_infer invocation for the
# task image given as $1.
# Reads AGENT_LLM_CONFIG, ENV_LLM_CONFIG, OUTPUTS_PATH, SERVER_HOSTNAME and,
# when non-empty, AGENT_CONFIG. Every value becomes exactly one argument, so
# spaces or shell metacharacters in a value are passed through literally.
build_infer_command() {
    local image=$1

    INFER_COMMAND=(
        poetry run python -m evaluation.benchmarks.the_agent_company.run_infer
        --agent-llm-config "$AGENT_LLM_CONFIG"
        --env-llm-config "$ENV_LLM_CONFIG"
        --outputs-path "$OUTPUTS_PATH"
        --server-hostname "$SERVER_HOSTNAME"
        --task-image-name "$image"
    )

    # Add agent-config if it's defined
    if [[ -n "${AGENT_CONFIG:-}" ]]; then
        INFER_COMMAND+=(--agent-config "$AGENT_CONFIG")
    fi
}

# Run every task image listed in $temp_file. The list is read on file
# descriptor 3 so that commands inside the loop cannot consume the remaining
# task lines through stdin.
while IFS= read -r -u 3 task_image; do
    # Nothing to run for a blank line
    [[ -n "$task_image" ]] || continue

    # Remove prefix using ## to remove longest matching pattern from start
    task_name=${task_image##ghcr.io/theagentcompany/}

    # Remove suffix using % to remove shortest matching pattern from end
    task_name=${task_name%-image:*}
    echo "Use task image $task_image, task name $task_name..."

    # Check if evaluation file exists
    if [ -f "$OUTPUTS_PATH/eval_${task_name}-image.json" ]; then
        echo "Skipping $task_name - evaluation file already exists"
        continue
    fi

    docker pull "$task_image"

    # PYTHONPATH is set for this one command only; exporting it inside the
    # loop would prepend the same entry again on every iteration.
    build_infer_command "$task_image"
    PYTHONPATH="evaluation/benchmarks/the_agent_company${PYTHONPATH:+:${PYTHONPATH}}" \
        "${INFER_COMMAND[@]}"

    # Prune unused images and volumes
    docker image rm "$task_image"
    docker images "ghcr.io/all-hands-ai/runtime" -q | xargs -r docker rmi -f
    docker volume prune -f
    docker system prune -f
done 3< "$temp_file"

# Remove the downloaded task list and the per-range working copy
rm -- tasks.md "$temp_file"

printf '%s\n' "All evaluation completed successfully!"