Yixin Liu committed on
Commit
5f30d29
1 Parent(s): 1782b42
Files changed (1) hide show
  1. main.py +22 -5
main.py CHANGED
@@ -1,4 +1,5 @@
1
  # import imp
 
2
  import streamlit as st
3
  import pandas as pd
4
  import numpy as np
@@ -18,6 +19,9 @@ import time
18
  # title
19
  st.title("Exp Command Generator")
20
 
 
 
 
21
  ## 检查框
22
  debug = st.checkbox("Debug:选择则会串行地执行命令", value=True)
23
  # st.write(f"checkbox的值是{res}")
@@ -41,19 +45,26 @@ lr=5e-5""")
41
 
42
  ## gpu 相关参数
43
  gpu_list = st.multiselect("multi select", range(10), [1, 2, 3, 4, 5, 6, 7, 8, 9])
44
- print(gpu_list)
45
- allow_gpu_memory_threshold = st.number_input("最小单卡剩余容量", value=5000, min_value=0, max_value=30000, step=1000)
46
- gpu_threshold = st.number_input("最大单卡利用率", value=70, min_value=0, max_value=100, step=10)
 
 
 
 
 
 
47
  sleep_time_after_loading_task= st.number_input("加载任务后等待秒数", value=20, min_value=0,step=5)
48
  all_full_sleep_time = st.number_input("全满之后等待秒数", value=20, min_value=0,step=5)
49
 
50
  gpu_list_str = ' '.join([str(i) for i in gpu_list])
51
- gpu_hyper = f"gpu=({gpu_list_str})\n"
52
  gpu_hyper+=f"allow_gpu_memory_threshold={allow_gpu_memory_threshold}\n"
53
  gpu_hyper+=f"gpu_threshold={gpu_threshold}\n"
54
  gpu_hyper+=f"sleep_time_after_loading_task={sleep_time_after_loading_task}s\n"
55
  gpu_hyper+=f"all_full_sleep_time={all_full_sleep_time}s\n"
56
  gpu_hyper+=f"gpunum={len(gpu_list)}\n"
 
57
 
58
  main_loop = st.text_area("Main loop", """for lambda_1 in 1 3;do
59
  for lambda_2 in 1 10;do
@@ -83,7 +94,6 @@ if g:
83
  s += gpu_hyper + "\n\n"
84
  s += hyper_loop + "\n\n"
85
  s += """
86
- i=0 # we search from the first gpu
87
  while true; do
88
  gpu_id=${gpu[$i]}
89
  # nvidia-smi --query-gpu=utilization.gpu --format=csv -i 2 | grep -Eo "[0-9]+"
@@ -103,6 +113,10 @@ while true; do
103
  done
104
 
105
  gpu_id=${gpu[$i]}
 
 
 
 
106
  free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $gpu_id | grep -Eo "[0-9]+")
107
  gpu_u=$(nvidia-smi --query-gpu=utilization.gpu --format=csv -i $gpu_id | grep -Eo "[0-9]+")
108
  export CUDA_VISIBLE_DEVICES=$gpu_id
@@ -113,7 +127,10 @@ echo "use gpu id is ${gpu[$i]}, free memory is $free_mem, it utilization is ${gp
113
  s += "echo ==========================================================================================\n"
114
  if debug:
115
  s += "$com\n"
 
 
116
  else:
 
117
  s += "mkdir -p ./logs/\n"
118
  s += "nohup $com > ./logs/$exp_name-$RANDOM.log 2>&1 &\n"
119
  s += """echo "sleep for $sleep_time_after_loading_task to wait the task loaded"
 
1
  # import imp
2
+ from email.policy import default
3
  import streamlit as st
4
  import pandas as pd
5
  import numpy as np
 
19
  # title
20
  st.title("Exp Command Generator")
21
 
22
+ # experiment mode
23
+ exp_mode = st.selectbox("Select Experiment Mode", ["OneExpOnecard", "MultipleExpOnecard"],key="MultipleExpOnecard")
24
+
25
  ## 检查框
26
  debug = st.checkbox("Debug:选择则会串行地执行命令", value=True)
27
  # st.write(f"checkbox的值是{res}")
 
45
 
46
  ## gpu 相关参数
47
  gpu_list = st.multiselect("multi select", range(10), [1, 2, 3, 4, 5, 6, 7, 8, 9])
48
+ # print(gpu_list)
49
+ if exp_mode == "OneExpOnecard":
50
+ allow_gpu_memory_threshold_default = 20000
51
+ gpu_threshold_default = 1
52
+ elif exp_mode == "MultipleExpOnecard":
53
+ allow_gpu_memory_threshold_default = 5000
54
+ gpu_threshold_default = 70
55
+ allow_gpu_memory_threshold = st.number_input("最小单卡剩余容量", value=allow_gpu_memory_threshold_default, min_value=0, max_value=30000, step=1000)
56
+ gpu_threshold = st.number_input("最大单卡利用率", value=gpu_threshold_default, min_value=0, max_value=100, step=10)
57
  sleep_time_after_loading_task= st.number_input("加载任务后等待秒数", value=20, min_value=0,step=5)
58
  all_full_sleep_time = st.number_input("全满之后等待秒数", value=20, min_value=0,step=5)
59
 
60
  gpu_list_str = ' '.join([str(i) for i in gpu_list])
61
+ gpu_hyper = "gpu=$\{#gpu[@]}\n"
62
  gpu_hyper+=f"allow_gpu_memory_threshold={allow_gpu_memory_threshold}\n"
63
  gpu_hyper+=f"gpu_threshold={gpu_threshold}\n"
64
  gpu_hyper+=f"sleep_time_after_loading_task={sleep_time_after_loading_task}s\n"
65
  gpu_hyper+=f"all_full_sleep_time={all_full_sleep_time}s\n"
66
  gpu_hyper+=f"gpunum={len(gpu_list)}\n"
67
+ gpu_hyper+="i=0\n"
68
 
69
  main_loop = st.text_area("Main loop", """for lambda_1 in 1 3;do
70
  for lambda_2 in 1 10;do
 
94
  s += gpu_hyper + "\n\n"
95
  s += hyper_loop + "\n\n"
96
  s += """
 
97
  while true; do
98
  gpu_id=${gpu[$i]}
99
  # nvidia-smi --query-gpu=utilization.gpu --format=csv -i 2 | grep -Eo "[0-9]+"
 
113
  done
114
 
115
  gpu_id=${gpu[$i]}
116
+ # search from the next gpu
117
+ i=`expr $i + 1`
118
+ i=`expr $i % $gpunum`
119
+
120
  free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $gpu_id | grep -Eo "[0-9]+")
121
  gpu_u=$(nvidia-smi --query-gpu=utilization.gpu --format=csv -i $gpu_id | grep -Eo "[0-9]+")
122
  export CUDA_VISIBLE_DEVICES=$gpu_id
 
127
  s += "echo ==========================================================================================\n"
128
  if debug:
129
  s += "$com\n"
130
+ s += "# mkdir -p ./logs/\n"
131
+ s += "# nohup $com > ./logs/$exp_name-$RANDOM.log 2>&1 &\n"
132
  else:
133
+ s += "# $com\n"
134
  s += "mkdir -p ./logs/\n"
135
  s += "nohup $com > ./logs/$exp_name-$RANDOM.log 2>&1 &\n"
136
  s += """echo "sleep for $sleep_time_after_loading_task to wait the task loaded"