#!/bin/bash
#SBATCH -p g48
#SBATCH --job-name=myjob_shareGPT
#SBATCH --qos=high
#SBATCH --nodes=1 # Number of nodes
#SBATCH --ntasks=1 # Number of tasks (one for each script)
#SBATCH --cpus-per-task=60
#SBATCH --gres=gpu:6
#SBATCH --array=1-2 # Array range
# #SBATCH --output=./slurm_outputs/run_clm_job_%A_task_%a.out # Standard output
#SBATCH --output=/dev/null # Discard standard output because the run is logged to log.txt in the output directory
# Timestamp (day-month_hour-minute) that keeps the default output directory
# distinct between submissions.
current_time=$(date +"%d-%m_%H-%M")

# Default output directory. The SLURM_ARRAY_* variables are read from the
# environment and expand to empty strings when they are not set.
OUTPUT_DIR="./training_outputs_job_${SLURM_ARRAY_JOB_ID:-}_${SLURM_ARRAY_TASK_ID:-}_${current_time}"

# ARGS is written to cmd.txt later in this script but was never assigned.
# Keep any value inherited from the environment; otherwise record the
# arguments this script was invoked with.
ARGS="${ARGS:-$*}"

#######################################
# Parse the script's command-line options.
# Globals:   OUTPUT_DIR (overwritten when --output_dir VALUE is given)
# Arguments: the script's arguments ("$@")
# Outputs:   echoes each option as it is examined (the value following
#            --output_dir is consumed without being echoed); errors go
#            to stderr
# Returns:   0 on success, 2 when --output_dir is given without a value
#######################################
parse_args() {
  while (( $# > 0 )); do
    printf '%s\n' "$1"
    case "$1" in
      --output_dir)
        # An empty OUTPUT_DIR would make later redirections target "/".
        if (( $# < 2 )) || [[ -z "$2" ]]; then
          printf 'error: --output_dir requires a non-empty value\n' >&2
          return 2
        fi
        OUTPUT_DIR=$2
        shift 2
        ;;
      *)
        # Skip unrecognised arguments; without this shift the loop would
        # never make progress and would spin forever.
        shift
        ;;
    esac
  done
}

parse_args "$@" || exit $?
#######################################
# Ensure a directory exists, reporting whether it had to be created.
# Arguments: $1 - path of the directory
# Outputs:   one status line on stdout; an error line on stderr when the
#            directory cannot be created
# Returns:   0 if the directory already existed or was created,
#            1 if mkdir failed
#######################################
mkdir_is_exists() {
  local dir=${1-}

  if [[ -d "$dir" ]]; then
    echo "Directory '$dir' already exists."
    return 0
  fi

  # Only claim success when mkdir actually succeeded.
  if ! mkdir -p -- "$dir"; then
    printf "error: could not create directory '%s'\n" "$dir" >&2
    return 1
  fi
  echo "Directory '$dir' created."
}
# Snapshot the run's provenance (commit, Python environment, command line and
# a copy of the experiment code) into the output directory.
snapshot_dir="$OUTPUT_DIR/experiment_code"
mkdir_is_exists "$OUTPUT_DIR"
mkdir_is_exists "$snapshot_dir"

# Every write below targets this directory tree; stop here rather than
# scatter files elsewhere (or lose the log) when it could not be created.
if [[ ! -d "$snapshot_dir" ]]; then
  printf "error: snapshot directory '%s' is not available\n" "$snapshot_dir" >&2
  exit 1
fi

git log -n 1 > "$OUTPUT_DIR/commit.txt"
pip freeze > "$OUTPUT_DIR/pip_freeze.txt"
# ${ARGS:+...} drops the field entirely when ARGS is unset or empty.
echo "$0" ${ARGS:+"$ARGS"} "$current_time" > "$OUTPUT_DIR/cmd.txt"

# Copy the code and configuration used for this run next to its outputs.
# A failed copy is reported by cp but does not stop the job.
for snapshot_item in \
  ./run_clm.py \
  ./prepare_sharegpt.py \
  config \
  ./submit_job.sh \
  ./requirements.txt; do
  cp -r -- "$snapshot_item" "$snapshot_dir"
done
# Map each SLURM array task ID to the config file that task trains with.
# The keys should cover the range given to `#SBATCH --array`; uncomment or add
# entries (and widen the array range) to run more configurations.
declare -A scripts_and_inputs=(
  ["1"]="./config/config1.yaml"
  ["2"]="./config/config_redpajama.yaml"
  # ["3"]="./config/config1.yaml"
  # ["4"]="./config/config1.yaml"
  # ["5"]="./config/config1.yaml"
  # ["6"]="./config/config1.yaml"
  # ["7"]="./config/config1.yaml"
  # ["8"]="./config/config1.yaml"
  # ["9"]="./config/config1.yaml"
  # ["10"]="./config/config1.yaml"
  # ["11"]="./config/config1.yaml"
  # ["12"]="./config/config1.yaml"
  # ["13"]="./config/config1.yaml"
  # ["14"]="./config/config1.yaml"
  # ["15"]="./config/config1.yaml"
  # ["16"]="./config/config1.yaml"
  # ["17"]="./config/config1.yaml"
  # ["18"]="./config/config1.yaml"
  # ["19"]="./config/config1.yaml"
  # ["20"]="./config/config1.yaml"
)
# Launch the training run for this array task using its mapped config file.
echo "Starting job array task: $SLURM_ARRAY_TASK_ID"

# Fail with a clear message instead of passing an empty --config_file when the
# script is not running as an array task or the task has no mapping.
task_id=${SLURM_ARRAY_TASK_ID:-}
if [[ -z "$task_id" ]]; then
  printf 'error: SLURM_ARRAY_TASK_ID is not set; submit this script as a job array\n' >&2
  exit 1
fi
INPUT_DIR=${scripts_and_inputs[$task_id]:-}
if [[ -z "$INPUT_DIR" ]]; then
  printf 'error: no config file is mapped to array task %s\n' "$task_id" >&2
  exit 1
fi

export DEFAULT_CONFIG_FILE="./config/config1.yaml"

# stdout and stderr of the run are merged and mirrored into log.txt.
srun --exclusive python run_clm.py \
  --output_dir "$OUTPUT_DIR" \
  --logging_dir "$OUTPUT_DIR" \
  --config_file "$INPUT_DIR" 2>&1 \
  | tee "$OUTPUT_DIR/log.txt"
# The pipeline's own status is tee's; keep srun's so failures are not hidden.
srun_status=${PIPESTATUS[0]}

# Nothing above is started in the background, so this returns immediately;
# it only matters if launches are backgrounded in the future.
wait

# Print a message indicating completion
echo "All Python scripts have been executed."

# Propagate the training exit status so the scheduler records failed runs.
exit "$srun_status"
# mv ./slurm_outputs/run_clm_job_$SLURM_ARRAY_JOB_ID*$SLURM_ARRAY_TASK_ID* "$output_dir/"
# python -m torch.distributed.launch ~/target_draft_coupling_code/target_draft_training/run_clm.py --multirun task=1,2