Spaces:
Runtime error
Runtime error
File size: 5,661 Bytes
7017395 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
# import imp
from email.policy import default
import streamlit as st
import pandas as pd
import numpy as np
import time
# import matplotlib.pyplot as plt
# import seaborn as sns
# import plotly.figure_factory as ff
# import altair as alt
# from PIL import Image
# import base64
# import tarfile
# import os
# import requests
# Page title.
st.title("Exp Command Generator")

# Experiment mode selector; the chosen value is stored in `exp_mode`.
_EXP_MODES = ["MultipleExpOnecard", "OneExpOnecard"]
exp_mode = st.sidebar.selectbox(
    "Select Experiment Mode",
    _EXP_MODES,
    key="MultipleExpOnecard",
)

# Checkbox (label, in Chinese: "if selected, commands are executed serially").
# Ticked by default.
debug = st.sidebar.checkbox(
    "Debug:选择则会串行地执行命令",
    value=True,
)

# Default shell preamble offered in the sidebar: change directory, activate
# the conda environment and extend PYTHONPATH.
_DEFAULT_SETUP = """cd $(dirname $(dirname $0))
source activate xai
export PYTHONPATH=${PYTHONPATH}:/Users/apple/Desktop/workspace/research_project/attention:/mnt/yixin/:/home/yila22/prj"""
setup = st.sidebar.text_area(
    "Hyperparameters or some setup of env at beginning.",
    _DEFAULT_SETUP,
)
# exp_hyper = st.sidebar.text_area("Hyperparameters", """exp_name="debug-adv-training-emotion"
# dataset=emotion
# n_epoch=3
# K=3
# encoder=bert
# lambda_1=1
# lambda_2=1
# x_pgd_radius=0.01
# pgd_radius=0.001
# seed=2
# bsize=8
# lr=5e-5""")
# GPU-related parameters.
gpu_list = st.sidebar.multiselect("multi select", range(10), [5, 6, 7, 8, 9])

# Default thresholds per experiment mode, as
# (allow_gpu_memory_threshold_default, gpu_threshold_default).
_MODE_THRESHOLD_DEFAULTS = {
    "OneExpOnecard": (20000, 1),
    "MultipleExpOnecard": (3000, 70),
}
allow_gpu_memory_threshold_default, gpu_threshold_default = (
    _MODE_THRESHOLD_DEFAULTS[exp_mode]
)

# Label (Chinese): "minimum free capacity per card".
allow_gpu_memory_threshold = st.sidebar.number_input(
    "最小单卡剩余容量",
    value=allow_gpu_memory_threshold_default,
    min_value=0,
    max_value=30000,
    step=1000,
)
# Label (Chinese): "maximum utilisation per card".
gpu_threshold = st.sidebar.number_input(
    "最大单卡利用率",
    value=gpu_threshold_default,
    min_value=0,
    max_value=100,
    step=10,
)
# Label (Chinese): "seconds to wait after loading a task".
sleep_time_after_loading_task = st.sidebar.number_input(
    "加载任务后等待秒数",
    value=20,
    min_value=0,
    step=5,
)
# Label (Chinese): "seconds to wait once everything is full".
all_full_sleep_time = st.sidebar.number_input(
    "全满之后等待秒数",
    value=20,
    min_value=0,
    step=5,
)

# Shell variable assignments that carry the sidebar choices into the
# generated script; every assignment sits on its own line.
gpu_list_str = " ".join(map(str, gpu_list))
_gpu_hyper_lines = [
    f"gpu=({gpu_list_str})",
    f"allow_gpu_memory_threshold={allow_gpu_memory_threshold}",
    f"gpu_threshold={gpu_threshold}",
    f"sleep_time_after_loading_task={sleep_time_after_loading_task}s",
    f"all_full_sleep_time={all_full_sleep_time}s",
    "gpunum=${#gpu[@]}",
    "i=0",
]
gpu_hyper = "".join(line + "\n" for line in _gpu_hyper_lines)
# Default content of the "Main loop" editor: nested shell `for` loops over the
# swept hyperparameters, wrapped around a single `python train.py` command.
# NOTE: this is a regular (non-raw) string literal, so as written each
# trailing backslash + newline pair is consumed by Python itself.
_DEFAULT_MAIN_LOOP = """for lambda_1 in 1 3;do
for lambda_2 in 1 10;do
for n_epoch in 3;do
for x_pgd_radius in 0.005 0.01;do
for pgd_radius in 0.0005 0.001 0.002;do
python train.py --dataset $dataset --data_dir . --output_dir ./outputs/ --attention tanh \
--encoder $encoder \
--exp_name $exp_name --lambda_1 $lambda_1 --lambda_2 $lambda_2 --pgd_radius $pgd_radius --x_pgd_radius $x_pgd_radius \
--K $K --seed $seed --train_mode adv_train --bsize $bsize --n_epoch $n_epoch --lr $lr \
--eval_baseline
done;done;done;done;done;"""

# Editable text area in the main page; its current text is what gets parsed.
main_loop = st.text_area("Main loop", _DEFAULT_MAIN_LOOP)
import re

# Launcher keywords that mark the start of the per-experiment command,
# checked in this order (`python` wins over `bash` when both are present).
_LAUNCHERS = ("python", "bash")


def _split_main_loop(text):
    """Split the main-loop text into its loop header and its command.

    The header is everything before the first occurrence of a launcher
    keyword; the command runs from that keyword up to (not including) the
    first ``done;``.

    Args:
        text: Raw content of the "Main loop" text area.

    Returns:
        A ``(hyper_loop, python_cmd)`` tuple, or ``None`` when the text
        contains none of the launcher keywords.
    """
    for launcher in _LAUNCHERS:
        start = text.find(launcher)
        if start != -1:
            return text[:start], text[start:].split("done;")[0]
    return None


_main_loop_parts = _split_main_loop(main_loop)
if _main_loop_parts is None:
    # Previously this case fell through and crashed with a NameError on the
    # first use of `hyper_loop`; report it in the UI and halt this run instead.
    st.error("The main loop must contain a `python` or `bash` command.")
    st.stop()
hyper_loop, python_cmd = _main_loop_parts
print(hyper_loop)
print(python_cmd)

# One closing `done;` per `for` keyword in the header. Match whole words only
# so that names such as `format` are not counted as extra loops.
end_loop = "done;" * len(re.findall(r"\bfor\b", hyper_loop))
print(end_loop)
# Shell fragment placed inside the innermost user loop. It cycles through the
# `gpu` array until it finds a card whose free memory and utilisation (as
# reported by nvidia-smi) satisfy the thresholds, exports it through
# CUDA_VISIBLE_DEVICES and advances `i` so the next search starts from the
# following card.
# The log lines print `$gpu_id` (the card that was actually inspected or
# chosen) rather than `${gpu[$i]}`, because `i` has already been advanced by
# the time they run. The "all full" message reports the configured sleep
# duration and is printed before sleeping.
_GPU_PICKER_SNIPPET = """
while true; do
gpu_id=${gpu[$i]}
# nvidia-smi --query-gpu=utilization.gpu --format=csv -i 2 | grep -Eo "[0-9]+"
gpu_u=$(nvidia-smi --query-gpu=utilization.gpu --format=csv -i $gpu_id | grep -Eo "[0-9]+")
free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $gpu_id | grep -Eo "[0-9]+")
if [[ $free_mem -lt $allow_gpu_memory_threshold || $gpu_u -ge ${gpu_threshold} ]]; then
i=`expr $i + 1`
i=`expr $i % $gpunum`
echo "gpu id $gpu_id is full loaded, skip"
if [ "$i" == "0" ]; then
echo "all the gpus are full, sleep ${all_full_sleep_time}"
sleep ${all_full_sleep_time}
fi
else
break
fi
done
gpu_id=${gpu[$i]}
# search from the next gpu
i=`expr $i + 1`
i=`expr $i % $gpunum`
free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $gpu_id | grep -Eo "[0-9]+")
gpu_u=$(nvidia-smi --query-gpu=utilization.gpu --format=csv -i $gpu_id | grep -Eo "[0-9]+")
export CUDA_VISIBLE_DEVICES=$gpu_id
echo "use gpu id is $gpu_id, free memory is $free_mem, it utilization is ${gpu_u}%"
"""

# Assemble and display the launcher script when "Generate" is pressed.
g = st.button("Generate")
if g and not gpu_list:
    # With no GPU selected the script would contain `gpu=()`, so `gpunum` is 0
    # and `expr $i % $gpunum` divides by zero; refuse to emit such a script.
    st.error("Select at least one GPU before generating the script.")
elif g:
    from datetime import datetime, timezone

    pieces = [
        setup + "\n\n",
        gpu_hyper + "\n\n",
        hyper_loop + "\n\n",
        _GPU_PICKER_SNIPPET,
        f"""com="{python_cmd}"\n""",
        "echo $com\n",
        "echo ==========================================================================================\n",
    ]
    if debug:
        # Debug: run the command in the foreground; the background variant is
        # left in the script as a comment.
        pieces += [
            "$com\n",
            "# mkdir -p ./logs/\n",
            "# nohup $com > ./logs/$exp_name-$RANDOM.log 2>&1 &\n",
        ]
    else:
        # Background mode: logs go under ./logs/<UTC date and hour>/.
        # A timezone-aware "now" replaces datetime.utcfromtimestamp, which is
        # deprecated since Python 3.12; the formatted value is the same.
        human_readable_timestamp = datetime.now(timezone.utc).strftime('%Y-%m-%d-%H')
        pieces += [
            "# $com\n",
            "mkdir -p ./logs/\n",
            f"date_time={human_readable_timestamp}\n",
            "mkdir -p ./logs/$date_time\n",
            "nohup $com > ./logs/$date_time/$exp_name-$RANDOM.log 2>&1 &\n",
        ]
    # NOTE(review): the original indentation of this snippet was not
    # recoverable; it is emitted in both modes here - confirm it was not meant
    # to apply to the background branch only.
    pieces.append("""echo "sleep for $sleep_time_after_loading_task to wait the task loaded"
sleep $sleep_time_after_loading_task\n""")
    # Close every `for` loop opened by the user's loop header.
    pieces.append(end_loop)
    s = "".join(pieces)
    st.success("Finished")
    st.code(s, language="shell")
|