Yixin Liu
upload
564624d
# Shell statement that invokes the update_device_idx function whose source is
# held in update_device_func below; the trailing newline terminates the command.
update_device_command = "update_device_idx;\n"
update_device_func = """
#######################################
# Wait until the host has spare capacity, then move on to the next free GPU.
#
# NOTE: this function is stored inside a Python string literal, so it must
# not contain backslashes (Python would interpret them as escapes).
#
# Globals read:
#   constrain_total      "true" to wait on system-wide CPU and memory usage
#   cpu_mean_max         system-wide CPU usage limit, in percent
#   memory_rate_max      system-wide memory usage limit, in percent
#   constrain_mine       "true" to also wait on one user's own usage
#   username_mine        user whose processes are summed up via ps -u
#   constrain_rate       divisor applied to both limits for that user
#   available_devices    array of GPU ids passed to nvidia-smi -i
#   total_gpu_memory     GPU memory size, same unit as nvidia-smi memory.used
#   max_gpu_memory_gap   a GPU counts as full once used memory reaches
#                        total_gpu_memory - max_gpu_memory_gap
#   max_gpu_utilization  a GPU counts as full at or above this utilization
# Globals written:
#   current_device_idx   index into available_devices of the chosen GPU
#   device               id of the chosen GPU
# Outputs:
#   progress messages on stdout, diagnostics on stderr
# Returns:
#   0 once a device is selected, 1 if available_devices is empty
#######################################
function update_device_idx {
  local -a samples
  local round cpu_mean mem_used mem_total mem_rate
  local user_cpu_sum cpu_count cpu_capacity user_cpu_ratio
  local user_mem_mb user_mem_ratio cpu_limit_mine mem_limit_mine
  local device_count busy_streak gpu_id gpu_mem_used gpu_util
  local gpu_mem_limit gpu_busy

  if [[ "${constrain_total:-}" == true ]]; then
    # check total cpu usage
    while true; do
      # Mean of three one-second mpstat samples taken one second apart.
      # Per-CPU usage is 100 minus the last mpstat column (%idle); printf
      # keeps the value in fixed-point form so bc can always parse it.
      samples=()
      for round in 1 2 3; do
        samples+=("$(mpstat -P ALL 1 1 |
          awk '/Average:/ && $2 ~ /[0-9]/ { total += 100 - $NF; count++ } END { if (count) printf "%.2f", total / count }')")
        if [[ "$round" -lt 3 ]]; then
          sleep 1
        fi
      done
      cpu_mean=$(bc <<< "scale=2; (${samples[0]}+${samples[1]}+${samples[2]})/3")
      # if currently cpu usage is less than the threshold, then break
      # (an unreadable sample makes bc print nothing, which counts as "wait")
      if [[ "$(bc <<< "$cpu_mean < $cpu_mean_max")" == 1 ]]; then
        echo "total cpu mean: $cpu_mean is less than $cpu_mean_max, continue to check total memory usage"
        break
      else
        echo "total cpu mean: $cpu_mean is greater than $cpu_mean_max, sleep 10 seconds"
        sleep 10
      fi
    done

    # check total memory usage
    while true; do
      # Mean of three readings of the used column of free -m.
      samples=()
      for round in 1 2 3; do
        samples+=("$(free -m | awk '/Mem:/ {print $3}')")
        if [[ "$round" -lt 3 ]]; then
          sleep 1
        fi
      done
      mem_used=$(bc <<< "scale=2; (${samples[0]}+${samples[1]}+${samples[2]})/3")
      mem_total=$(free -m | awk '/Mem:/ {print $2}')
      # Multiply before dividing: with scale=2 the quotient is truncated to
      # two decimals, so dividing first would round the rate to whole percents.
      mem_rate=$(bc <<< "scale=2; $mem_used*100/$mem_total")
      if [[ "$(bc <<< "$mem_rate < $memory_rate_max")" == 1 ]]; then
        echo "total memory rate: $mem_rate is less than $memory_rate_max, continue to check my own cpu and memory usage"
        break
      else
        echo "total memory rate: $mem_rate is greater than $memory_rate_max, sleep 10 seconds"
        sleep 10
      fi
    done
  fi

  if [[ "${constrain_mine:-}" == true ]]; then
    # My own share must stay below 1/$constrain_rate of the global limits.
    # scale=2 keeps the fractional part (plain bc division is integer-only).
    cpu_limit_mine=$(bc <<< "scale=2; $cpu_mean_max/$constrain_rate")
    mem_limit_mine=$(bc <<< "scale=2; $memory_rate_max/$constrain_rate")
    while true; do
      # Sum of the %cpu column over the user's processes (the header line
      # contributes 0), relative to 100 percent per available core.
      user_cpu_sum=$(ps -u "$username_mine" -o %cpu |
        awk '{ sum += $1 } END { printf "%.2f", sum }')
      cpu_count=$(nproc)
      cpu_capacity=$((cpu_count * 100))
      user_cpu_ratio=$(bc <<< "scale=2; $user_cpu_sum*100/$cpu_capacity")

      # Sum of the rss column divided by 1024, relative to the total
      # column of free -m.
      user_mem_mb=$(ps -u "$username_mine" -o rss |
        awk '{ sum += $1 } END { printf "%.2f", sum / 1024 }')
      mem_total=$(free -m | awk '/Mem:/ {print $2}')
      user_mem_ratio=$(bc <<< "scale=2; $user_mem_mb*100/$mem_total")

      if [[ "$(bc <<< "$user_cpu_ratio < $cpu_limit_mine")" == 1 &&
        "$(bc <<< "$user_mem_ratio < $mem_limit_mine")" == 1 ]]; then
        echo "my cpu usage: $user_cpu_ratio, memory usage: $user_mem_ratio is less than 1/$constrain_rate of the given threshold for cpu: $cpu_limit_mine and memory: $mem_limit_mine, ready to take off"
        break
      else
        echo "my cpu usage: $user_cpu_ratio, memory usage: $user_mem_ratio is greater than 1/$constrain_rate of the given threshold, sleep 10 seconds"
        sleep 10
      fi
    done
  fi

  # so all the conditions are satisfied, we can update the device idx and run the next experiment
  device_count=${#available_devices[@]}
  if [[ "$device_count" -eq 0 ]]; then
    echo "update_device_idx: available_devices is empty, cannot select a device" >&2
    return 1
  fi
  gpu_mem_limit=$((total_gpu_memory - max_gpu_memory_gap))
  busy_streak=0
  while true; do
    current_device_idx=$((current_device_idx + 1))
    if [[ "$current_device_idx" -ge "$device_count" ]]; then
      # reset
      current_device_idx=0
    fi
    gpu_id=${available_devices[$current_device_idx]}

    # check whether this device is fully booked using nvidia-smi
    gpu_mem_used=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i "$gpu_id") || gpu_mem_used=""
    gpu_util=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i "$gpu_id") || gpu_util=""
    gpu_mem_used=${gpu_mem_used//[[:space:]]/}
    gpu_util=${gpu_util//[[:space:]]/}

    gpu_busy=false
    if ! [[ "$gpu_mem_used" =~ ^[0-9]+$ ]]; then
      # Never hand out a device whose state could not be read.
      echo "device $gpu_id: could not read memory usage from nvidia-smi, treating it as unavailable" >&2
      gpu_busy=true
    elif [ "$gpu_mem_used" -ge "$gpu_mem_limit" ]; then
      gpu_busy=true
    elif [[ "$gpu_util" =~ ^[0-9]+$ ]] && [ "$gpu_util" -ge "$max_gpu_utilization" ]; then
      # A non-numeric utilization reading is ignored; memory alone decides.
      gpu_busy=true
    fi

    if [[ "$gpu_busy" == false ]]; then
      break
    fi

    echo "device ${available_devices[$current_device_idx]} is fully booked, try next one"
    sleep 3
    # After a full round of busy devices, back off for longer.
    busy_streak=$(((busy_streak + 1) % device_count))
    if [[ "$busy_streak" -eq 0 ]]; then
      echo "sleep 60 seconds since all devices are fully booked"
      sleep 60
    fi
  done
  echo "current device: ${available_devices[$current_device_idx]}"
  device=${available_devices[$current_device_idx]}
}
"""