File size: 2,024 Bytes
94011a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/bin/bash

# Parameter grid for the sweep: every perturbation is evaluated at every
# checkpoint with every random seed.
test_perturbations=("reverse_full")

# Regularly spaced checkpoints: checkpoint-500 up to checkpoint-10000 in
# increments of 500.
checkpoints=()
for ((_step = 500; _step <= 10000; _step += 500)); do
    checkpoints+=("checkpoint-${_step}")
done
unset _step
# NOTE(review): the last entry jumps from 10000 straight to 11500 (there is no
# checkpoint-10500 or checkpoint-11000) — confirm this is intended.
checkpoints+=("checkpoint-11500")

random_seeds=(1 2 3 4 5)
gpus=(1 2 3 4 5 6 7) # GPU indices the scheduler is allowed to use

# Index of the next task to launch, and the total size of the parameter grid.
task_index=0
total_combinations=$(( ${#test_perturbations[@]} * ${#checkpoints[@]} * ${#random_seeds[@]} ))

#######################################
# Check whether the given GPU currently looks idle.
# Arguments: $1 - GPU index, passed to `nvidia-smi -i`
# Returns:   0 if the reported utilization.gpu value is below 10,
#            1 otherwise — including when nvidia-smi fails or prints
#            something that is not a plain non-negative integer.
#######################################
is_gpu_free() {
    local gpu_id=$1
    local utilization
    local -r numeric_re='^[[:blank:]]*([0-9]+)[[:blank:]]*$'

    # Declared separately from the assignment so that a failing nvidia-smi is
    # not masked by the exit status of `local`. A failed query counts as busy.
    utilization=$(nvidia-smi -i "$gpu_id" --query-gpu=utilization.gpu --format=csv,noheader,nounits) || return 1

    # Empty or non-numeric output (e.g. "[N/A]") cannot be compared; treat the
    # GPU as busy instead of letting the numeric test emit an error.
    [[ $utilization =~ $numeric_re ]] || return 1

    # Force base 10 so a value with a leading zero is not parsed as octal.
    if (( 10#${BASH_REMATCH[1]} < 10 )); then
        return 0 # GPU is free
    fi
    return 1 # GPU is busy
}

#######################################
# Decode the current task index into one parameter combination.
# Globals read:    task_index, test_perturbations, checkpoints, random_seeds
# Globals written: perturbation, checkpoint, seed
# The index is treated as a mixed-radix number: the perturbation varies
# fastest, then the checkpoint, then the seed.
#######################################
get_next_task() {
    local n_perturbations=${#test_perturbations[@]}
    local n_checkpoints=${#checkpoints[@]}
    local n_seeds=${#random_seeds[@]}
    local remaining=$((task_index))

    # Peel off one "digit" at a time, least significant first.
    perturbation=${test_perturbations[remaining % n_perturbations]}
    remaining=$((remaining / n_perturbations))

    checkpoint=${checkpoints[remaining % n_checkpoints]}
    remaining=$((remaining / n_checkpoints))

    seed=${random_seeds[remaining % n_seeds]}
}

# Main scheduling loop.
# Each round walks the configured GPUs, launches the next pending task in the
# background on every GPU that is_gpu_free reports as idle, then waits for all
# jobs of that round to finish before checking the GPUs again.
failed_jobs=0
while (( task_index < total_combinations )); do
    # PIDs and human-readable labels of the jobs started in this round.
    round_pids=()
    round_labels=()

    for gpu in "${gpus[@]}"; do
        if is_gpu_free "$gpu"; then
            get_next_task

            echo "Running experiment for $perturbation, $checkpoint, seed $seed on GPU $gpu"
            CUDA_VISIBLE_DEVICES=$gpu python perplexities_qwen.py "$perturbation" "$checkpoint" "$seed" &
            round_pids+=("$!")
            round_labels+=("$perturbation, $checkpoint, seed $seed on GPU $gpu")

            # Advance to the next parameter combination.
            task_index=$((task_index + 1))

            # Stop handing out work once every combination has been launched.
            if (( task_index >= total_combinations )); then
                break
            fi
        fi
    done

    # Wait on each job individually: a bare `wait` discards the children's
    # exit statuses, which would hide failed experiments.
    for job in "${!round_pids[@]}"; do
        if ! wait "${round_pids[job]}"; then
            echo "Experiment failed: ${round_labels[job]}" >&2
            failed_jobs=$((failed_jobs + 1))
        fi
    done

    sleep 5 # brief pause before polling the GPUs again
done

if (( failed_jobs > 0 )); then
    echo "$failed_jobs experiment(s) exited with a non-zero status" >&2
fi