Upload llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/bench.slurm
CHANGED
@@ -31,6 +31,12 @@ update_status() {
     done
 }
 
+dump_stack_trace() {
+    local pid=$1
+    local output_file=$2
+    py-spy dump --pid $pid > $output_file
+}
+
 # Misc initializations.
 echo "========================"
 echo "START TIME: $(date)"
@@ -75,9 +81,21 @@ job_id=${SLURM_JOB_ID}
 update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt &
 
 # Run the main command
-srun -u $LAUNCHER $CMD
+srun -u $LAUNCHER $CMD &
+
+main_pid=$!
+
+# Wait for the main process to finish
+wait $main_pid
 exit_status=$?
 
+# If the exit status is non-zero, dump the stack trace
+if [ $exit_status -ne 0 ]; then
+    dump_file="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/crash_dump_${SLURM_JOB_ID}_${SLURM_PROCID}.txt"
+    echo "Job crashed. Dumping stack trace to $dump_file"
+    dump_stack_trace $main_pid $dump_file
+fi
+
 # Update status based on the exit status of `srun`
 if [ $exit_status -eq 0 ]; then
     printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt
@@ -99,7 +117,6 @@ if [ $exit_status -eq 0 ]; then
     python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 --is_profiler
 fi
 
-
 # Push to hub the folder using huggingface_cli
 huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16"
 
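The change above launches the training command in the background, waits for it, and on a non-zero exit status uses py-spy to dump a stack trace of the launcher process. A minimal standalone sketch of that pattern follows; the workload command and output directory are placeholders, and it assumes py-spy is installed and that the target process is still alive when the dump is attempted.

#!/bin/bash
# Sketch of the crash-dump pattern from bench.slurm (placeholder paths and commands).

OUT_DIR=${OUT_DIR:-/tmp/crash_dump_demo}   # hypothetical output directory
mkdir -p "$OUT_DIR"

dump_stack_trace() {
    # Attach py-spy to a live Python process and write its current stack to a file.
    local pid=$1
    local output_file=$2
    py-spy dump --pid "$pid" > "$output_file" 2>&1
}

# Launch the workload in the background and remember its PID
# (stands in for `srun -u $LAUNCHER $CMD &` in the real script).
python train.py &
main_pid=$!

# Wait for the main process to finish and capture its exit status.
wait $main_pid
exit_status=$?

# If the exit status is non-zero, attempt a stack-trace dump.
# Note: py-spy can only attach to a live process, so this is best-effort
# once the backgrounded process has already exited.
if [ $exit_status -ne 0 ]; then
    dump_file="$OUT_DIR/crash_dump_$$.txt"
    echo "Job crashed (exit $exit_status). Dumping stack trace to $dump_file"
    dump_stack_trace "$main_pid" "$dump_file"
fi

exit $exit_status

In the actual script the same logic wraps the srun launch, and the dump file name includes ${SLURM_JOB_ID} and ${SLURM_PROCID} so traces from different jobs and ranks do not collide.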
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/crash_dump_7398594_0.txt
ADDED
File without changes
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/log.out
CHANGED
The diff for this file is too large to render; see the raw diff.