SamPIngram committed (verified)
Commit 1e156b2 · 1 Parent(s): e5520ce

Upload 2 files


gpt2_config.py is the training configuration used with SimpleLLM, run for 200,000 iterations on the openwebtext dataset.
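gpt2_config.py is a flat, module-level hyperparameter file. SimpleLLM's own config loader is not part of this commit, so the snippet below is only a minimal sketch of how such a file is commonly consumed (collecting every top-level assignment into a dict of overrides, in the spirit of a nanoGPT-style configurator); the exec-based mechanism and variable names here are illustrative assumptions, not SimpleLLM's actual API.

```python
# Sketch: read gpt2_config.py and collect its top-level assignments.
# (Illustrative only; SimpleLLM's real loader is not shown in this commit.)
overrides = {}
with open("gpt2_config.py") as f:
    exec(f.read(), {}, overrides)  # each `name = value` line becomes a dict entry

print(overrides["dataset"])    # 'openwebtext'
print(overrides["max_iters"])  # 600000
```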

Files changed (2)
  1. ckpt.pt +3 -0
  2. gpt2_config.py +60 -0
ckpt.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eccbc24897667135755aad4694f899a7ed0e62f29a8a00fddb8cf8e2d566d6dc
+ size 1492570501
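ckpt.pt is stored through Git LFS, so the diff only records the pointer (the oid and a size of 1,492,570,501 bytes, roughly 1.5 GB). That size is consistent with a ~124M-parameter GPT-2 checkpoint that also carries AdamW optimizer state (about 124M × 4 bytes of fp32 weights, roughly tripled by the two moment buffers). After fetching the real file with `git lfs pull`, the sketch below is one way to see what it contains; the expectation of a dict with model and optimizer entries is an assumption based on nanoGPT-style trainers, not something this commit shows.

```python
# Sketch: inspect the checkpoint once the LFS blob has been fetched
# (e.g. via `git lfs pull`). Key names are printed rather than assumed;
# nanoGPT-style checkpoints typically hold model weights, optimizer state,
# model_args and iter_num, but that layout is an assumption here.
import torch

ckpt = torch.load("ckpt.pt", map_location="cpu")  # newer PyTorch may need weights_only=False
if isinstance(ckpt, dict):
    for key, value in ckpt.items():
        print(key, type(value))
else:
    print(type(ckpt))
```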
gpt2_config.py ADDED
@@ -0,0 +1,60 @@
+ ##################################################
+ # Data config
+ ##################################################
+ test_size = 0.1
+ seed = 110892
+ shuffle = True
+ dataset_key = 'train'
+ num_proc = -1 # -1 for all, 1 for single process, 2 for two processes, etc.
+ tokenizer = 'gpt2' # 'gpt2' or 'cl100k_base' or 'gpt-4'
+
+ ##################################################
+ # Training config
+ ##################################################
+ out_dir = 'gpt2'
+ eval_interval = 2000
+ log_interval = 1
+ eval_iters = 200
+ eval_only = False # if True, script exits right after the first eval
+ always_save_checkpoint = True # if True, always save a checkpoint after each eval
+ init_from = 'resume' # 'scratch' or 'resume' or 'gpt2*'
+ # wandb logging
+ wandb_log = False # disabled by default
+ wandb_project = 'SimpleLLM'
+ wandb_run_name = 'gpt2' # 'run' + str(time.time())
+ # data
+ dataset = 'openwebtext'
+ gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
+ batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
+ block_size = 1024
+ # model
+ n_layer = 12
+ n_head = 12
+ n_embd = 768
+ dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
+ bias = False # do we use bias inside LayerNorm and Linear layers?
+ # adamw optimizer
+ learning_rate = 6e-4 # max learning rate
+ max_iters = 600000 # total number of training iterations
+ weight_decay = 1e-1
+ beta1 = 0.9
+ beta2 = 0.95
+ grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
+ # learning rate decay settings
+ decay_lr = True # whether to decay the learning rate
+ warmup_iters = 2000 # how many steps to warm up for
+ lr_decay_iters = 600000 # should be ~= max_iters per Chinchilla
+ min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
+ # DDP settings
+ backend = 'nccl' # 'nccl', 'gloo', etc.
+
+ ##################################################
+ # Generator config
+ ##################################################
+ # init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
+ start = "\n" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
+ num_samples = 10 # number of samples to draw
+ max_new_tokens = 500 # number of tokens generated in each sample
+ temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
+ top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
+ seed = 1337
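For scale, each optimizer step above processes gradient_accumulation_steps × batch_size × block_size = 40 × 12 × 1024 = 491,520 tokens, so the 200,000 iterations mentioned in the commit message correspond to roughly 98B tokens (about 295B if run to max_iters = 600,000). The decay settings (warmup_iters, lr_decay_iters, min_lr) describe the usual linear-warmup-then-cosine schedule of nanoGPT-style trainers; SimpleLLM's exact implementation is not in this diff, so the following is a sketch of that assumed schedule with values mirrored from the config.

```python
import math

# Values mirrored from gpt2_config.py above.
learning_rate = 6e-4   # max learning rate
min_lr = 6e-5          # floor after decay
warmup_iters = 2000
lr_decay_iters = 600000

def get_lr(it):
    """Assumed schedule: linear warmup to learning_rate, then cosine decay to min_lr."""
    if it < warmup_iters:                      # 1) linear warmup
        return learning_rate * it / warmup_iters
    if it > lr_decay_iters:                    # 2) past the decay horizon
        return min_lr
    # 3) cosine decay between warmup_iters and lr_decay_iters
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (learning_rate - min_lr)

for it in (0, 2_000, 200_000, 600_000):
    print(it, f"{get_lr(it):.2e}")
# 0 -> 0.00e+00, 2000 -> 6.00e-04, 200000 -> ~4.7e-04, 600000 -> 6.00e-05
```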