Spaces:
Runtime error
Runtime error
# Shell statement that invokes the update_device_idx function defined by
# update_device_func; the trailing newline ends the statement in the script.
update_device_command = "update_device_idx;\n"
# Bash source for the `update_device_idx` function (plus one private helper).
# The text is emitted verbatim into a shell script, so it must stay free of
# backslashes: this is a non-raw Python string and escapes would be rewritten.
update_device_func = """
# Succeed when numeric $1 is strictly less than numeric $2.
# A missing or non-numeric operand is never "less", so a failed measurement
# keeps the caller waiting instead of slipping past the limit.
_update_device_idx_lt() {
  local number_re='^[+-]?([0-9]+[.]?[0-9]*|[.][0-9]+)$'
  [[ "$1" =~ $number_re && "$2" =~ $number_re ]] || return 1
  awk -v lhs="$1" -v rhs="$2" 'BEGIN { exit !(lhs + 0 < rhs + 0) }'
}

# Wait until the machine is quiet enough, then advance to the next free GPU.
#
# Globals read:
#   constrain_total, cpu_mean_max, memory_rate_max
#   constrain_mine, username_mine, constrain_rate
#   available_devices (array), total_gpu_memory, max_gpu_memory_gap,
#   max_gpu_utilization
# Globals written:
#   current_device_idx, device
# Returns:
#   0 once a device has been selected;
#   1 when available_devices is empty, or when constrain_mine is true and
#   constrain_rate is not a positive number.
update_device_idx() {
  local sample cpu_mean mem_rate
  local username cpu_usage_user_ratio memory_usage_total memory_usage_user_ratio
  local cpu_mean_max_mine memory_rate_max_mine
  local device_count gpu_id gpu_mem_limit useage utilization
  local device_busy cnt_longer_sleep

  if [[ "$constrain_total" == true ]]; then
    # check total cpu usage
    while true; do
      # Three one-second mpstat samples taken one second apart. Per-core busy
      # percentage is 100 minus the last column (taken to be %idle) of each
      # per-core "Average:" row; the result is the mean over all rows.
      cpu_mean=$(
        for sample in 1 2 3; do
          mpstat -P ALL 1 1
          if [[ "$sample" -lt 3 ]]; then sleep 1; fi
        done | awk '
          /Average:/ && $2 ~ /[0-9]/ { total += 100 - $NF; count++ }
          END { if (count > 0) printf "%.2f", total / count }
        '
      )
      # if currently cpu usage is less than the threshold, then break
      if _update_device_idx_lt "$cpu_mean" "$cpu_mean_max"; then
        echo "total cpu mean: $cpu_mean is less than $cpu_mean_max, continue to check total memory usage"
        break
      fi
      if [[ -z "$cpu_mean" ]]; then
        echo "could not measure total cpu usage with mpstat, sleep 10 seconds" >&2
      else
        echo "total cpu mean: $cpu_mean is greater than $cpu_mean_max, sleep 10 seconds"
      fi
      sleep 10
    done

    # check total memory usage
    while true; do
      # Used/total memory (MiB) summed over three samples one second apart;
      # multiplying before dividing keeps the percentage's precision.
      mem_rate=$(
        for sample in 1 2 3; do
          free -m
          if [[ "$sample" -lt 3 ]]; then sleep 1; fi
        done | awk '
          /Mem:/ { used += $3; total += $2 }
          END { if (total > 0) printf "%.2f", used * 100 / total }
        '
      )
      if _update_device_idx_lt "$mem_rate" "$memory_rate_max"; then
        echo "total memory rate: $mem_rate is less than $memory_rate_max, continue to check my own cpu and memory usage"
        break
      fi
      if [[ -z "$mem_rate" ]]; then
        echo "could not measure total memory usage with free, sleep 10 seconds" >&2
      else
        echo "total memory rate: $mem_rate is greater than $memory_rate_max, sleep 10 seconds"
      fi
      sleep 10
    done
  fi

  if [[ "$constrain_mine" == true ]]; then
    # My own usage must stay below 1/$constrain_rate of the global thresholds.
    # An empty username_mine falls back to the user running the script.
    username=${username_mine:-$(id -un)}
    cpu_mean_max_mine=$(awk -v limit="$cpu_mean_max" -v rate="$constrain_rate" 'BEGIN { if (rate + 0 > 0) printf "%.2f", limit / rate }')
    memory_rate_max_mine=$(awk -v limit="$memory_rate_max" -v rate="$constrain_rate" 'BEGIN { if (rate + 0 > 0) printf "%.2f", limit / rate }')
    if [[ -z "$cpu_mean_max_mine" || -z "$memory_rate_max_mine" ]]; then
      echo "update_device_idx: constrain_rate must be a positive number (got '$constrain_rate')" >&2
      return 1
    fi

    while true; do
      # Sum of the user's per-process %cpu divided by the core count gives the
      # share of the whole machine; the ps header line adds 0 to the sum.
      cpu_usage_user_ratio=$(ps -u "$username" -o %cpu | awk -v cores="$(nproc)" '{ sum += $1 } END { if (cores + 0 > 0) printf "%.2f", sum / cores }')
      memory_usage_total=$(free -m | awk '/Mem:/ { print $2 }')
      # rss is summed in KiB, converted to MiB, then expressed as a percentage
      # of total memory.
      memory_usage_user_ratio=$(ps -u "$username" -o rss | awk -v total_mb="$memory_usage_total" '{ sum += $1 } END { if (total_mb + 0 > 0) printf "%.2f", sum / 1024 * 100 / total_mb }')

      if _update_device_idx_lt "$cpu_usage_user_ratio" "$cpu_mean_max_mine" &&
        _update_device_idx_lt "$memory_usage_user_ratio" "$memory_rate_max_mine"; then
        echo "my cpu usage: $cpu_usage_user_ratio, memory usage: $memory_usage_user_ratio is less than 1/$constrain_rate of the given threshold for cpu: $cpu_mean_max_mine and memory: $memory_rate_max_mine, ready to take off"
        break
      fi
      echo "my cpu usage: $cpu_usage_user_ratio, memory usage: $memory_usage_user_ratio is greater than 1/$constrain_rate of the given threshold, sleep 10 seconds"
      sleep 10
    done
  fi

  # so all the conditions are satisfied, we can update the device idx and run the next experiment
  device_count=${#available_devices[@]}
  if (( device_count == 0 )); then
    echo "update_device_idx: available_devices is empty, no device to pick" >&2
    return 1
  fi
  gpu_mem_limit=$(( total_gpu_memory - max_gpu_memory_gap ))
  cnt_longer_sleep=0

  while true; do
    # Round-robin over the device list, wrapping back to the first entry.
    current_device_idx=$(( current_device_idx + 1 ))
    if (( current_device_idx < 0 || current_device_idx >= device_count )); then
      current_device_idx=0
    fi
    gpu_id=${available_devices[$current_device_idx]}

    # A failed query leaves the reading empty instead of aborting the script.
    useage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i "$gpu_id") || useage=
    utilization=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits -i "$gpu_id") || utilization=

    if [[ ! "$useage" =~ ^[0-9]+$ ]]; then
      # Unknown state: never hand out a device whose memory cannot be read.
      echo "could not read memory usage of device $gpu_id from nvidia-smi, try next one" >&2
    else
      device_busy=false
      if [ "$useage" -ge "$gpu_mem_limit" ]; then
        device_busy=true
      fi
      # The utilization limit only applies when both numbers are readable;
      # a non-numeric utilization reading does not block the device.
      if [[ "$utilization" =~ ^[0-9]+$ && "$max_gpu_utilization" =~ ^[0-9]+$ ]] &&
        [ "$utilization" -ge "$max_gpu_utilization" ]; then
        device_busy=true
      fi
      if [[ "$device_busy" == false ]]; then
        break
      fi
      echo "device $gpu_id is fully booked, try next one"
    fi

    sleep 3
    # After a full pass over every device without success, back off longer.
    cnt_longer_sleep=$(( (cnt_longer_sleep + 1) % device_count ))
    if (( cnt_longer_sleep == 0 )); then
      echo "sleep 60 seconds since all devices are fully booked"
      sleep 60
    fi
  done

  device=${available_devices[$current_device_idx]}
  echo "current device: $device"
}
"""