File size: 5,570 Bytes
246d201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/bin/bash

PROCESS_FILEPATH=$1
if [ -z "$PROCESS_FILEPATH" ]; then
    echo "Error: PROCESS_FILEPATH is empty. Usage: ./eval_infer.sh <output_file> [instance_id] [dataset_name] [split]"
    exit 1
fi

if [ ! -f $PROCESS_FILEPATH ]; then
    echo "Error: $PROCESS_FILEPATH is not a file"
    exit 1
fi

# If instance_id is empty, it means we want to eval on the whole $PROCESS_FILEPATH
# otherwise, we want to eval on the instance_id
INSTANCE_ID=$2
DATASET_NAME=${3:-"princeton-nlp/SWE-bench_Lite"}
SPLIT=${4:-"test"}

echo "INSTANCE_ID: $INSTANCE_ID"
echo "DATASET_NAME: $DATASET_NAME"
echo "SPLIT: $SPLIT"

PROCESS_FILEPATH=$(realpath $PROCESS_FILEPATH)
FILE_DIR=$(dirname $PROCESS_FILEPATH)
FILE_NAME=$(basename $PROCESS_FILEPATH)

echo "Evaluating $FILE_NAME @ $FILE_DIR"

# ================================================
# detect whether PROCESS_FILEPATH is in OH format or in SWE-bench format
echo "=============================================================="
echo "Detecting whether PROCESS_FILEPATH is in OH format or in SWE-bench format"
echo "=============================================================="
# SWE-bench format is a JSONL where every line has three fields: model_name_or_path, instance_id, and model_patch
function is_swebench_format() {
    # Read the first line of the file
    read -r first_line < "$PROCESS_FILEPATH"

    # Use jq to check if the first line has the required fields
    echo "$first_line" | jq -e '. | has("model_name_or_path") and has("instance_id") and has("model_patch")' > /dev/null

    if [ $? -ne 0 ]; then
        return 1 # Return 1 if the first line does not have the required fields
    fi

    return 0 # Return 0 if the first line has the required fields
}
# Call the function with the file path
is_swebench_format "$PROCESS_FILEPATH"
IS_SWEBENCH_FORMAT=$?
# Use the result in an if-else statement
if [ $IS_SWEBENCH_FORMAT -eq 0 ]; then
    echo "The file IS in SWE-bench format."
    SWEBENCH_FORMAT_JSONL=$PROCESS_FILEPATH
else
    echo "The file IS NOT in SWE-bench format."

    # ==== Convert OH format to SWE-bench format ====
    echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
    poetry run python3 evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH
    # replace .jsonl with .swebench.jsonl in filename
    SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
    echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
    # assert that the file exists
    if [ ! -f $SWEBENCH_FORMAT_JSONL ]; then
        echo "Error: $SWEBENCH_FORMAT_JSONL does not exist. There is probably an error in the conversion process."
        exit 1
    fi
    SWEBENCH_FORMAT_JSONL=$(realpath $SWEBENCH_FORMAT_JSONL)
fi
# ================================================

echo "=============================================================="
echo "Running SWE-bench evaluation"
echo "=============================================================="

RUN_ID=$(date +"%Y%m%d_%H%M%S")
N_PROCESS=16

if [ -z "$INSTANCE_ID" ]; then
    echo "Running SWE-bench evaluation on the whole input file..."
    # Default to SWE-Bench-lite
    # change `--dataset_name` and `--split` to alter dataset

    poetry run python -m swebench.harness.run_evaluation \
        --dataset_name "$DATASET_NAME" \
        --split "$SPLIT" \
        --predictions_path $SWEBENCH_FORMAT_JSONL \
        --timeout 1800 \
        --cache_level instance \
        --max_workers $N_PROCESS \
        --run_id $RUN_ID

    # get the "model_name_or_path" from the first line of the SWEBENCH_FORMAT_JSONL
    MODEL_NAME_OR_PATH=$(jq -r '.model_name_or_path' $SWEBENCH_FORMAT_JSONL | head -n 1)
    echo "MODEL_NAME_OR_PATH: $MODEL_NAME_OR_PATH"

    RESULT_OUTPUT_DIR=$(dirname $SWEBENCH_FORMAT_JSONL)
    echo "RESULT_OUTPUT_DIR: $RESULT_OUTPUT_DIR"

    # move the eval results to the target directory
    mkdir -p $RESULT_OUTPUT_DIR
    # rm eval_outputs directory if it exists
    if [ -d $RESULT_OUTPUT_DIR/eval_outputs ]; then
        rm -rf $RESULT_OUTPUT_DIR/eval_outputs
    fi

    mv logs/run_evaluation/$RUN_ID/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR
    mv $RESULT_OUTPUT_DIR/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR/eval_outputs
    echo "RUN_ID: $RUN_ID" > $RESULT_OUTPUT_DIR/run_id.txt

    # move report file
    REPORT_PATH=$MODEL_NAME_OR_PATH.$RUN_ID.json
    if [ -f $REPORT_PATH ]; then
        # check if $RESULT_OUTPUT_DIR/report.json exists
        if [ -f $RESULT_OUTPUT_DIR/report.json ]; then
            echo "Report file $RESULT_OUTPUT_DIR/report.json already exists. Overwriting..."
            if [ -f $RESULT_OUTPUT_DIR/report.json.bak ]; then
                rm $RESULT_OUTPUT_DIR/report.json.bak
            fi
            mv $RESULT_OUTPUT_DIR/report.json $RESULT_OUTPUT_DIR/report.json.bak
        fi

        mv $REPORT_PATH $RESULT_OUTPUT_DIR/report.json
    fi

    poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py $PROCESS_FILEPATH

else
    echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID"
    poetry run python -m swebench.harness.run_evaluation \
        --dataset_name "$DATASET_NAME" \
        --split "$SPLIT" \
        --predictions_path $SWEBENCH_FORMAT_JSONL \
        --timeout 1800 \
        --instance_ids $INSTANCE_ID \
        --cache_level instance \
        --max_workers $N_PROCESS \
        --run_id $RUN_ID
fi