import time

#out_dir = 'out-owt-gpt2mini'
out_dir = 'out-funcom_raw_scratch'
eval_interval = 1000
eval_iters = 40
wandb_log = True # log to Weights & Biases
wandb_project = 'fundats_srcml'
wandb_run_name = 'ft-gpt2-srcml-1' #+ str(time.time())

dataset = 'fundats_srcml'
init_from = 'scratch'
#init_from = 'gpt2-large'

# save a checkpoint after every eval, not only when the validation loss improves
always_save_checkpoint = True

#n_layer = 6
#n_head = 6
#n_embd = 384
#dropout = 0.2

block_size = 1024

# gpt2-large
#n_layer = 36
#n_head = 20
#n_embd = 1280
#dropout = 0.2

# gpt2-medium
n_layer = 24
n_head = 16
n_embd = 1024
dropout = 0.2

# tokens processed per iteration (per GPU):
# 4 batch_size * 4 grad_accum * 1024 block_size = 16,384 tokens/iter
# shakespeare has 301,966 tokens, so 1 epoch ~= 18.4 iters
# stackoverflow has 10,495,518,108 tokens
# openwebtext has 9,035,582,489 tokens
# funcom_raw has 8,752,695,577 tokens
# fundats_srcml has 48,774,749,459 tokens
batch_size = 4
gradient_accumulation_steps = 4
max_iters = 372122 * 10 # see the sanity-check note at the end of this file

# constant learning rate, no decay
learning_rate = 3e-5
decay_lr = False
#weight_decay = 1e-1
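
# ---------------------------------------------------------------------------
# Sanity-check arithmetic for max_iters (a sketch, not part of the config;
# the 8-GPU DDP world size below is an assumption, not stated in this file):
#   tokens per iter per GPU  = batch_size * gradient_accumulation_steps * block_size
#                            = 4 * 4 * 1024 = 16,384
#   tokens per iter (8 GPUs) = 16,384 * 8 = 131,072
#   iters per fundats_srcml epoch ~= 48,774,749,459 / 131,072 ~= 372,122
# which matches max_iters = 372122 * 10, i.e. roughly 10 epochs.
# ---------------------------------------------------------------------------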