jrahn committed
Commit f227cda
1 Parent(s): b1e1d29

Upload run_gpt2_350M_edu_hermes.sh with huggingface_hub

Files changed (1)
  1. run_gpt2_350M_edu_hermes.sh +44 -0
run_gpt2_350M_edu_hermes.sh ADDED
@@ -0,0 +1,44 @@
+ # GPT-2 (350M) training on the FineWeb-Edu "hermes" 10B dataset, using llm.c
+ # 350M parameter model (-e "d24": 24 layers), context length 1024
+ # 19,622 steps of 524,288 tokens/step ~= 10.3B tokens total
+ # => compute estimate: 6 * 350e6 * 10.3e9 ~= 2.2e19 FLOPs
+ # launched on 2 GPUs (mpirun -np 2)
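+ # quick sanity check of the arithmetic above (illustrative only, not part
+ # of the training run; plain awk):
+ #   awk 'BEGIN { t = 19622 * 524288; printf "%.3g tokens, %.3g FLOPs\n", t, 6 * 350e6 * t }'
+ #   # prints: 1.03e+10 tokens, 2.16e+19 FLOPs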
+
+ # build the CUDA trainer; USE_CUDNN=1 enables the cuDNN attention path
+ make train_gpt2cu USE_CUDNN=1
+ out_dir="log_gpt2_350M_edu_hermes"
+ # the trainer drops a DONE_<final_step> marker file in the log dir once
+ # the full 19,622-step run has completed
+ done_file="$out_dir/DONE_00019622"
+
+ # keep relaunching training until it completes; with -y 1 below, each
+ # relaunch resumes from the most recent checkpoint
+ while true; do
+
+     # exit condition is that optimization has finished
+     if [ -f "$done_file" ]; then
+         echo "File $done_file exists. Exiting the loop."
+         break
+     fi
+
+     # run python dev/data/fineweb_edu_hermes.py --version 10B to prepro data
+     # run python dev/data/hellaswag.py to prepro hellaswag eval
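+     # flag meanings below follow the llm.c train_gpt2cu conventions as I
+     # read them (double-check against ./train_gpt2cu -h in your checkout):
+     #   -v 250           compute validation loss every 250 steps
+     #   -s 5000 -g 144   sample 144 tokens from the model every 5000 steps
+     #   -h 1             run the HellaSwag eval
+     #   -b 16 -t 1024    micro-batch size 16, sequence length 1024
+     #   -d 524288        total batch size in tokens (via grad accumulation)
+     #   -r 0 -z 1        disable activation recompute; ZeRO-1 sharding
+     #   -c 0.1 -l 0.0003 -q 0.0 -u 700   weight decay, peak LR, final LR
+     #                    fraction (decay to 0), and warmup steps
+     #   -n 5000          write a checkpoint every 5000 steps
+     #   -sl 7.0 -sg 7.0  skip a step if loss/grad z-score exceeds 7
+     #   -y 1             resume from the latest checkpoint if one exists
+     #   -x 19622         total training steps
+     #   -e "d24"         initialize a 24-layer (GPT-2 350M) model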
+     mpirun -np 2 ./train_gpt2cu \
+         -i "dev/data/edu_fineweb10B_hermes/edu_fineweb_hermes_train_*.bin" \
+         -j "dev/data/edu_fineweb10B_hermes/edu_fineweb_hermes_val_*.bin" \
+         -o "$out_dir" \
+         -v 250 -s 5000 -g 144 \
+         -h 1 \
+         -b 16 -t 1024 \
+         -d 524288 \
+         -r 0 \
+         -z 1 \
+         -c 0.1 \
+         -l 0.0003 \
+         -q 0.0 \
+         -u 700 \
+         -n 5000 \
+         -sl 7.0 -sg 7.0 \
+         -y 1 \
+         -x 19622 \
+         -e "d24"
+
+     sleep 1
+ done
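+
+ # usage sketch (assumption, mirroring the llm.c run scripts): run from the
+ # llm.c repo root after the two prepro steps noted above, e.g.
+ #   bash run_gpt2_350M_edu_hermes.sh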