Upload llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/bench.slurm
CHANGED
@@ -31,6 +31,12 @@ update_status() {
     done
 }
 
+dump_stack_trace() {
+    local pid=$1
+    local output_file=$2
+    py-spy dump --pid $pid > $output_file
+}
+
 # Misc initializations.
 echo "========================"
 echo "START TIME: $(date)"
@@ -75,9 +81,21 @@ job_id=${SLURM_JOB_ID}
 update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt &
 
 # Run the main command
-srun -u $LAUNCHER $CMD
+srun -u $LAUNCHER $CMD &
+
+main_pid=$!
+
+# Wait for the main process to finish
+wait $main_pid
 exit_status=$?
 
+# If the exit status is non-zero, dump the stack trace
+if [ $exit_status -ne 0 ]; then
+    dump_file="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/crash_dump_${SLURM_JOB_ID}_${SLURM_PROCID}.txt"
+    echo "Job crashed. Dumping stack trace to $dump_file"
+    dump_stack_trace $main_pid $dump_file
+fi
+
 # Update status based on the exit status of `srun`
 if [ $exit_status -eq 0 ]; then
     printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt
@@ -99,7 +117,6 @@ if [ $exit_status -eq 0 ]; then
     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 --is_profiler
 fi
 
-
 # Push to hub the folder using huggingface_cli
 huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16"
 
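The change above launches the training command in the background, waits for it, and on a non-zero exit status uses py-spy to dump a stack trace of the launcher process. A minimal standalone sketch of that pattern follows; the workload command and output directory are placeholders, and it assumes py-spy is installed and that the target process is still alive when the dump is attempted.

#!/bin/bash
# Sketch of the crash-dump pattern from bench.slurm (placeholder paths and commands).

OUT_DIR=${OUT_DIR:-/tmp/crash_dump_demo}   # hypothetical output directory
mkdir -p "$OUT_DIR"

dump_stack_trace() {
    # Attach py-spy to a live Python process and write its current stack to a file.
    local pid=$1
    local output_file=$2
    py-spy dump --pid "$pid" > "$output_file" 2>&1
}

# Launch the workload in the background and remember its PID
# (stands in for `srun -u $LAUNCHER $CMD &` in the real script).
python train.py &
main_pid=$!

# Wait for the main process to finish and capture its exit status.
wait $main_pid
exit_status=$?

# If the exit status is non-zero, attempt a stack-trace dump.
# Note: py-spy can only attach to a live process, so this is best-effort
# once the backgrounded process has already exited.
if [ $exit_status -ne 0 ]; then
    dump_file="$OUT_DIR/crash_dump_$$.txt"
    echo "Job crashed (exit $exit_status). Dumping stack trace to $dump_file"
    dump_stack_trace "$main_pid" "$dump_file"
fi

exit $exit_status

In the actual script the same logic wraps the srun launch, and the dump file name includes ${SLURM_JOB_ID} and ${SLURM_PROCID} so traces from different jobs and ranks do not collide.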
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/crash_dump_7398594_0.txt
ADDED
File without changes
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/log.out
CHANGED
The diff for this file is too large to render; see the raw diff.