Spaces:

yixin
/

Experiment-Command-Generator

Runtime error

Yixin Liu

upload

7017395 over 1 year ago

5.66 kB

	# import imp
	from email.policy import default
	import streamlit as st
	import pandas as pd
	import numpy as np
	import time
	# import matplotlib.pyplot as plt
	# import seaborn as sns
	# import plotly.figure_factory as ff
	# import altair as alt
	# from PIL import Image
	# import base64
	# import tarfile
	# import os
	# import requests



	# Page heading.
	st.title("Exp Command Generator")

	# Experiment mode selector; the chosen mode decides which default GPU
	# thresholds are offered in the sidebar.
	_EXP_MODES = ["MultipleExpOnecard", "OneExpOnecard"]
	exp_mode = st.sidebar.selectbox(
	    "Select Experiment Mode",
	    _EXP_MODES,
	    key="MultipleExpOnecard",
	)

	# Checkbox: when ticked, the generated script runs each command in the
	# foreground instead of launching it in the background with nohup.
	debug = st.sidebar.checkbox("Debug:选择则会串行地执行命令", value=True)

	# Shell preamble that is written at the very top of the generated script.
	_DEFAULT_SETUP = """cd $(dirname $(dirname $0))
	source activate xai
	export PYTHONPATH=${PYTHONPATH}:/Users/apple/Desktop/workspace/research_project/attention:/mnt/yixin/:/home/yila22/prj"""
	setup = st.sidebar.text_area(
	    "Hyperparameters or some setup of env at beginning.",
	    _DEFAULT_SETUP,
	)

	# exp_hyper = st.sidebar.text_area("Hyperparameters", """exp_name="debug-adv-training-emotion"
	# dataset=emotion
	# n_epoch=3
	# K=3
	# encoder=bert
	# lambda_1=1
	# lambda_2=1
	# x_pgd_radius=0.01
	# pgd_radius=0.001
	# seed=2
	# bsize=8
	# lr=5e-5""")

	# GPU-related parameters.
	# Indices of the GPUs the generated script is allowed to schedule on.
	gpu_list = st.sidebar.multiselect("multi select", range(10), [5, 6, 7, 8, 9])

	# Default thresholds offered per experiment mode, as
	# (minimum free memory per card, maximum utilization per card).
	_GPU_DEFAULTS = {
	    "OneExpOnecard": (20000, 1),
	    "MultipleExpOnecard": (3000, 70),
	}
	if exp_mode in _GPU_DEFAULTS:
	    allow_gpu_memory_threshold_default, gpu_threshold_default = _GPU_DEFAULTS[exp_mode]

	# Minimum free memory a card must have to be picked.
	allow_gpu_memory_threshold = st.sidebar.number_input(
	    "最小单卡剩余容量",
	    value=allow_gpu_memory_threshold_default,
	    min_value=0,
	    max_value=30000,
	    step=1000,
	)
	# Maximum utilization a card may have to be picked.
	gpu_threshold = st.sidebar.number_input(
	    "最大单卡利用率",
	    value=gpu_threshold_default,
	    min_value=0,
	    max_value=100,
	    step=10,
	)
	# Seconds to wait after a task has been launched.
	sleep_time_after_loading_task = st.sidebar.number_input(
	    "加载任务后等待秒数",
	    value=20,
	    min_value=0,
	    step=5,
	)
	# Seconds to wait once every listed card has been found busy.
	all_full_sleep_time = st.sidebar.number_input(
	    "全满之后等待秒数",
	    value=20,
	    min_value=0,
	    step=5,
	)

	# Shell variable block consumed by the GPU scheduling loop of the
	# generated script: the bash array of card ids, the thresholds, the two
	# sleep durations (with an `s` suffix), the array length and the cursor.
	gpu_list_str = " ".join(map(str, gpu_list))
	gpu_hyper = "".join(
	    [
	        f"gpu=({gpu_list_str})\n",
	        f"allow_gpu_memory_threshold={allow_gpu_memory_threshold}\n",
	        f"gpu_threshold={gpu_threshold}\n",
	        f"sleep_time_after_loading_task={sleep_time_after_loading_task}s\n",
	        f"all_full_sleep_time={all_full_sleep_time}s\n",
	        "gpunum=${#gpu[@]}\n",
	        "i=0\n",
	    ]
	)

	# Bash template typed by the user: the nested `for` headers that sweep the
	# hyper-parameters, then the command to launch, then the closing `done;`s.
	main_loop = st.text_area("Main loop", """for lambda_1 in 1 3;do
	for lambda_2 in 1 10;do
	for n_epoch in 3;do
	for x_pgd_radius in 0.005 0.01;do
	for pgd_radius in 0.0005 0.001 0.002;do
	python train.py --dataset $dataset --data_dir . --output_dir ./outputs/ --attention tanh \
	--encoder $encoder \
	--exp_name $exp_name --lambda_1 $lambda_1 --lambda_2 $lambda_2 --pgd_radius $pgd_radius --x_pgd_radius $x_pgd_radius \
	--K $K --seed $seed --train_mode adv_train --bsize $bsize --n_epoch $n_epoch --lr $lr \
	--eval_baseline
	done;done;done;done;done;""")

	import re  # local import: only the loop counting below needs it

	# Split the template at the launcher keyword. `python` keeps precedence
	# over `bash`, so a loop header such as `for sh in bash zsh` still works
	# when the launched command itself is a python one.
	_launcher = next((kw for kw in ("python", "bash") if kw in main_loop), None)
	if _launcher is None:
	    # Without a launcher keyword there is nothing to split on; report it
	    # in the page instead of failing later on undefined names.
	    st.error("Main loop must contain a `python` or `bash` command to launch.")
	    st.stop()
	_cmd_start = main_loop.index(_launcher)
	# Everything before the launcher: the `for ...; do` headers.
	hyper_loop = main_loop[:_cmd_start]
	# The command itself, up to (not including) the first `done;`.
	python_cmd = main_loop[_cmd_start:].split("done;")[0]
	# Console output of the parsed pieces (goes to the server's stdout).
	print(hyper_loop)
	print(python_cmd)
	# One `done;` per loop header. Only whole-word `for` is counted so that
	# identifiers such as `format` or `before` do not add spurious `done;`s.
	end_loop = "done;" * len(re.findall(r"\bfor\b", hyper_loop))
	print(end_loop)

	# Assemble and display the final shell script when "Generate" is clicked.
	g = st.button("Generate")
	if g and not gpu_list:
	    # With no GPU selected the script's `gpu` array is empty, `gpunum` is 0
	    # and `expr $i % $gpunum` would divide by zero on every pass.
	    st.error("Select at least one GPU before generating the script.")
	elif g:
	    from datetime import datetime, timezone

	    # Scheduler fragment run before every launch: walk the `gpu` array
	    # round-robin until a card has enough free memory and low enough
	    # utilization, pin it through CUDA_VISIBLE_DEVICES, and advance the
	    # cursor so the next search starts from the following card.
	    # The pipes are plain `|` / `||`; the status messages use `$gpu_id`
	    # because `$i` has already been advanced when they are printed.
	    gpu_scheduler = """
	while true; do
	gpu_id=${gpu[$i]}
	# nvidia-smi --query-gpu=utilization.gpu --format=csv -i 2 | grep -Eo "[0-9]+"
	gpu_u=$(nvidia-smi --query-gpu=utilization.gpu --format=csv -i $gpu_id | grep -Eo "[0-9]+")
	free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $gpu_id | grep -Eo "[0-9]+")
	if [[ $free_mem -lt $allow_gpu_memory_threshold || $gpu_u -ge ${gpu_threshold} ]]; then
	i=`expr $i + 1`
	i=`expr $i % $gpunum`
	echo "gpu id $gpu_id is full loaded, skip"
	if [ "$i" == "0" ]; then
	sleep ${all_full_sleep_time}
	echo "all the gpus are full, sleep ${all_full_sleep_time}"
	fi
	else
	break
	fi
	done

	gpu_id=${gpu[$i]}
	# search from the next gpu
	i=`expr $i + 1`
	i=`expr $i % $gpunum`

	free_mem=$(nvidia-smi --query-gpu=memory.free --format=csv -i $gpu_id | grep -Eo "[0-9]+")
	gpu_u=$(nvidia-smi --query-gpu=utilization.gpu --format=csv -i $gpu_id | grep -Eo "[0-9]+")
	export CUDA_VISIBLE_DEVICES=$gpu_id
	echo "use gpu id is $gpu_id, free memory is $free_mem, it utilization is ${gpu_u}%"
	"""

	    # Script sections in emission order: env setup, GPU variables, the
	    # loop headers, the scheduler, then the command definition and echo.
	    parts = [
	        setup + "\n\n",
	        gpu_hyper + "\n\n",
	        hyper_loop + "\n\n",
	        gpu_scheduler,
	        f"""com="{python_cmd}"\n""",
	        "echo $com\n",
	        "echo ==========================================================================================\n",
	    ]

	    if debug:
	        # Debug: run in the foreground; the background launch is left in
	        # the script as a commented-out hint.
	        parts += [
	            "$com\n",
	            "# mkdir -p ./logs/\n",
	            "# nohup $com > ./logs/$exp_name-$RANDOM.log 2>&1 &\n",
	        ]
	    else:
	        # Background: log under ./logs/<UTC date-hour>/. The stamp is
	        # fixed at generation time, not when the script runs.
	        date_dir = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H")
	        parts += [
	            "# $com\n",
	            "mkdir -p ./logs/\n",
	            f"date_time={date_dir}\n",
	            "mkdir -p ./logs/$date_time\n",
	            "nohup $com > ./logs/$date_time/$exp_name-$RANDOM.log 2>&1 &\n",
	        ]

	    # NOTE(review): the source's indentation was lost, so it is unclear
	    # whether this pause belonged only to the background branch; it is
	    # emitted for both modes here - confirm.
	    parts.append("""echo "sleep for $sleep_time_after_loading_task to wait the task loaded"
	sleep $sleep_time_after_loading_task\n""")
	    # Close every `for` opened by the loop headers.
	    parts.append(end_loop)

	    s = "".join(parts)
	    st.success("Finished")
	    st.code(s, language="shell")