#!/bin/bash
#
# Slurm job-array launcher for run_clm.py.
#
# Usage: sbatch <this script> [--output_dir DIR]
#
# Each array task looks up its config file in `scripts_and_inputs` (keyed by
# SLURM_ARRAY_TASK_ID), snapshots the experiment code and environment into the
# output directory, then runs run_clm.py through srun, mirroring its combined
# stdout/stderr into "$OUTPUT_DIR/log.txt".
#
# Plain comment lines between #SBATCH directives are fine: sbatch only stops
# reading directives at the first non-comment, non-blank line.
#
#SBATCH -p g48
#SBATCH --job-name=myjob_shareGPT
#SBATCH --qos=high
# Number of nodes
#SBATCH --nodes=1
# Number of tasks (one for each script)
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=60
#SBATCH --gres=gpu:6
# Array range
#SBATCH --array=1-2
# #SBATCH --output=./slurm_outputs/run_clm_job_%A_task_%a.out  # Standard output
# Discard standard output, because we write to the log.txt file
#SBATCH --output=/dev/null

# Print a non-fatal diagnostic to stderr.
warn() {
  printf 'warning: %s\n' "$*" >&2
}

# Print a fatal diagnostic to stderr and abort the job with a failure status.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Get the current date and time
current_time=$(date +"%d-%m_%H-%M")
OUTPUT_DIR="./training_outputs_job_${SLURM_ARRAY_JOB_ID}_${SLURM_ARRAY_TASK_ID}_${current_time}"

# ARGS is never assigned by this script. Honour it if the environment provides
# it; otherwise record the real command-line arguments, captured here because
# the parsing loop below consumes them.
original_args="${ARGS:-$*}"

# Parse command-line arguments. Only --output_dir is recognised.
while (($# > 0)); do
  printf '%s\n' "$1"
  case "$1" in
    --output_dir)
      if (($# < 2)) || [[ -z "$2" ]]; then
        die "--output_dir requires a non-empty directory argument"
      fi
      OUTPUT_DIR=$2
      shift 2
      ;;
    *)
      # Always consume unknown arguments; without this arm the loop would
      # never advance and would spin forever.
      warn "ignoring unrecognized argument: $1"
      shift
      ;;
  esac
done

#######################################
# Ensure a directory exists, creating it (and its parents) if needed.
# Arguments: $1 - directory path
# Outputs:   a status message on stdout; an error message on stderr
# Returns:   0 if the directory exists afterwards, 1 if it could not be created
#######################################
mkdir_is_exists() {
  local dir=$1
  if [[ -d "$dir" ]]; then
    echo "Directory '$dir' already exists."
  elif mkdir -p -- "$dir"; then
    echo "Directory '$dir' created."
  else
    printf "error: could not create directory '%s'\n" "$dir" >&2
    return 1
  fi
}

mkdir_is_exists "$OUTPUT_DIR" || die "cannot prepare output directory"
mkdir_is_exists "$OUTPUT_DIR/experiment_code" || die "cannot prepare code snapshot directory"

# Snapshot the code and environment for reproducibility. These steps are
# best-effort: a failure is reported but must not prevent the training run.
git log -n 1 > "$OUTPUT_DIR/commit.txt" || warn "could not record git commit"
pip freeze > "$OUTPUT_DIR/pip_freeze.txt" || warn "could not record pip freeze"
printf '%s\n' "$0${original_args:+ $original_args} $current_time" > "$OUTPUT_DIR/cmd.txt"

for snapshot_item in \
  ./run_clm.py \
  ./prepare_sharegpt.py \
  config \
  ./submit_job.sh \
  ./requirements.txt; do
  cp -r -- "$snapshot_item" "$OUTPUT_DIR/experiment_code" \
    || warn "could not snapshot $snapshot_item"
done

# Map each array task ID to the config file that task should train with.
declare -A scripts_and_inputs=(
  ["1"]="./config/config1.yaml"
  ["2"]="./config/config_redpajama.yaml"
  # ["3"]="./config/config1.yaml"
  # ["4"]="./config/config1.yaml"
  # ["5"]="./config/config1.yaml"
  # ["6"]="./config/config1.yaml"
  # ["7"]="./config/config1.yaml"
  # ["8"]="./config/config1.yaml"
  # ["9"]="./config/config1.yaml"
  # ["10"]="./config/config1.yaml"
  # ["11"]="./config/config1.yaml"
  # ["12"]="./config/config1.yaml"
  # ["13"]="./config/config1.yaml"
  # ["14"]="./config/config1.yaml"
  # ["15"]="./config/config1.yaml"
  # ["16"]="./config/config1.yaml"
  # ["17"]="./config/config1.yaml"
  # ["18"]="./config/config1.yaml"
  # ["19"]="./config/config1.yaml"
  # ["20"]="./config/config1.yaml"
)

# Launch the script with the input file that corresponds to this array task
echo "Starting job array task: $SLURM_ARRAY_TASK_ID"

# Resolve the config for this task. An empty subscript is a bash error, so the
# lookup is only attempted when a task ID is actually set.
INPUT_DIR=""
if [[ -n "${SLURM_ARRAY_TASK_ID:-}" ]]; then
  INPUT_DIR="${scripts_and_inputs[$SLURM_ARRAY_TASK_ID]:-}"
fi
if [[ -z "$INPUT_DIR" ]]; then
  # Fail fast instead of handing run_clm.py a --config_file flag with no value.
  die "no config file mapped for array task '${SLURM_ARRAY_TASK_ID:-<unset>}'"
fi

# NOTE(review): presumably read by run_clm.py as its fallback config - confirm.
export DEFAULT_CONFIG_FILE="./config/config1.yaml"

srun --exclusive python run_clm.py \
  --output_dir "$OUTPUT_DIR" \
  --logging_dir "$OUTPUT_DIR" \
  --config_file "$INPUT_DIR" 2>&1 | tee "$OUTPUT_DIR/log.txt"
# A pipeline's exit status is that of its last command (tee); read srun's own
# status from PIPESTATUS so a failed training run is not reported as success.
train_status=${PIPESTATUS[0]}

# Wait for all background jobs to complete (none are started above, so this
# returns immediately)
wait

# Print a message indicating completion
echo "All Python scripts have been executed."

# mv ./slurm_outputs/run_clm_job_$SLURM_ARRAY_JOB_ID*$SLURM_ARRAY_TASK_ID* "$output_dir/"
# python -m torch.distributed.launch ~/target_draft_coupling_code/target_draft_training/run_clm.py --multirun task=1,2

# Propagate the training exit code so Slurm marks the task FAILED on error.
exit "$train_status"