|
NEW RUN 2023-02-16-13-05-33 |
|
{'data_order': '', 'load_model': 'out/rwkv-init.pth', 'wandb': '', 'proj_dir': 'out', 'random_seed': -1, 'data_file': '../data/tho.xyz', 'data_type': 'tho', 'vocab_size': 2944, 'ctx_len': 256, 'epoch_steps': 2000, 'epoch_count': 50, 'epoch_begin': 0, 'epoch_save': 2, 'micro_bsz': 128, 'n_layer': 6, 'n_embd': 640, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0008, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'ddp_find_unused_parameters_false', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-02-16-13-05-33', 'betas': (0.9, 0.99), 'real_bsz': 128, 'run_name': '0 ctx256 L6 D640'} |
|
NEW RUN 2023-02-16-13-08-39 |
|
{'data_order': '', 'load_model': 'out/rwkv-init.pth', 'wandb': '', 'proj_dir': 'out', 'random_seed': -1, 'data_file': '../data/tho.xyz', 'data_type': 'tho', 'vocab_size': 2944, 'ctx_len': 256, 'epoch_steps': 2000, 'epoch_count': 50, 'epoch_begin': 0, 'epoch_save': 2, 'micro_bsz': 96, 'n_layer': 6, 'n_embd': 640, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0008, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'ddp_find_unused_parameters_false', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-02-16-13-08-39', 'betas': (0.9, 0.99), 'real_bsz': 96, 'run_name': '0 ctx256 L6 D640'} |
|
0 1.093594 2.9850 0.00073287 2023-02-16 13:25:17.745074 0 |
|
1 0.941147 2.5629 0.00067138 2023-02-16 13:40:53.883233 1 |
|
2 0.892939 2.4423 0.00061504 2023-02-16 13:56:31.146043 2 |
|
3 0.859685 2.3624 0.00056343 2023-02-16 14:12:06.850190 3 |
|
4 0.832955 2.3001 0.00051616 2023-02-16 14:27:42.928238 4 |
|
5 0.810063 2.2481 0.00047285 2023-02-16 14:43:18.798455 5 |
|
6 0.786324 2.1953 0.00043317 2023-02-16 14:58:54.592263 6 |
|
7 0.765332 2.1497 0.00039682 2023-02-16 15:14:30.481214 7 |
|
8 0.743950 2.1042 0.00036352 2023-02-16 15:30:06.261186 8 |
|
9 0.727221 2.0693 0.00033302 2023-02-16 15:45:42.025803 9 |
|
10 0.709156 2.0323 0.00030508 2023-02-16 16:01:18.205278 10 |
|
11 0.693011 1.9997 0.00027948 2023-02-16 16:16:54.525035 11 |
|
12 0.675655 1.9653 0.00025603 2023-02-16 16:32:30.596184 12 |
|
13 0.661223 1.9372 0.00023454 2023-02-16 16:48:06.249496 13 |
|
14 0.646881 1.9096 0.00021486 2023-02-16 17:03:42.588712 14 |
|
15 0.634456 1.8860 0.00019683 2023-02-16 17:19:18.364507 15 |
|
16 0.622414 1.8634 0.00018032 2023-02-16 17:34:54.026329 16 |
|
17 0.610098 1.8406 0.00016519 2023-02-16 17:50:29.915238 17 |
|
18 0.598507 1.8194 0.00015133 2023-02-16 18:06:05.689649 18 |
|
19 0.587040 1.7987 0.00013863 2023-02-16 18:21:41.369287 19 |
|
20 0.576851 1.7804 0.00012700 2023-02-16 18:37:17.057368 20 |
|
NEW RUN 2023-02-16-18-47-31 |
|
{'data_order': '', 'load_model': 'out/rwkv-20.pth', 'wandb': '', 'proj_dir': 'out', 'random_seed': -1, 'data_file': '../data/tho.xyz', 'data_type': 'tho', 'vocab_size': 2944, 'ctx_len': 256, 'epoch_steps': 2000, 'epoch_count': 50, 'epoch_begin': 0, 'epoch_save': 2, 'micro_bsz': 96, 'n_layer': 6, 'n_embd': 640, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0003, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'ddp_find_unused_parameters_false', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-02-16-18-47-31', 'betas': (0.9, 0.99), 'real_bsz': 96, 'run_name': '0 ctx256 L6 D640'} |
|
NEW RUN 2023-02-16-18-52-26 |
|
{'data_order': '', 'load_model': 'out/rwkv-20.pth', 'wandb': '', 'proj_dir': 'out', 'random_seed': -1, 'data_file': '../data/tho.xyz', 'data_type': 'tho', 'vocab_size': 2944, 'ctx_len': 256, 'epoch_steps': 2000, 'epoch_count': 50, 'epoch_begin': 0, 'epoch_save': 2, 'micro_bsz': 96, 'n_layer': 6, 'n_embd': 640, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0003, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'ddp_find_unused_parameters_false', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-02-16-18-52-26', 'betas': (0.9, 0.99), 'real_bsz': 96, 'run_name': '0 ctx256 L6 D640'} |
|
NEW RUN 2023-02-16-19-25-39 |
|
{'data_order': '', 'load_model': 'out/rwkv-20.pth', 'wandb': '', 'proj_dir': 'out', 'random_seed': -1, 'data_file': '../data/tho.xyz', 'data_type': 'tho', 'vocab_size': 2944, 'ctx_len': 256, 'epoch_steps': 2000, 'epoch_count': 50, 'epoch_begin': 0, 'epoch_save': 2, 'micro_bsz': 92, 'n_layer': 6, 'n_embd': 640, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0003, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'ddp_find_unused_parameters_false', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-02-16-19-25-39', 'betas': (0.9, 0.99), 'real_bsz': 92, 'run_name': '0 ctx256 L6 D640'} |
|
0 0.754426 2.1264 0.00028027 2023-02-16 19:41:46.934997 0 |
|
1 0.720264 2.0550 0.00026184 2023-02-16 19:56:54.597725 1 |
|
2 0.707874 2.0297 0.00024462 2023-02-16 20:12:02.334430 2 |
|
3 0.695020 2.0037 0.00022853 2023-02-16 20:27:09.673654 3 |
|
4 0.681938 1.9777 0.00021351 2023-02-16 20:42:16.989615 4 |
|
5 0.668078 1.9505 0.00019947 2023-02-16 20:57:24.504061 5 |
|
6 0.658107 1.9311 0.00018635 2023-02-16 21:12:31.772208 6 |
|
7 0.645134 1.9062 0.00017409 2023-02-16 21:27:38.954502 7 |
|
8 0.635747 1.8884 0.00016264 2023-02-16 21:42:46.203324 8 |
|
9 0.625085 1.8684 0.00015195 2023-02-16 21:57:53.333285 9 |
|
10 0.612870 1.8457 0.00014196 2023-02-16 22:13:00.742840 10 |
|
11 0.604478 1.8303 0.00013262 2023-02-16 22:28:07.912322 11 |
|
12 0.595785 1.8145 0.00012390 2023-02-16 22:43:15.222474 12 |
|
13 0.586514 1.7977 0.00011575 2023-02-16 22:58:22.868310 13 |
|
14 0.578608 1.7836 0.00010814 2023-02-16 23:13:30.149384 14 |
|
15 0.569837 1.7680 0.00010103 2023-02-16 23:28:37.174876 15 |
|
16 0.561558 1.7534 0.00009438 2023-02-16 23:43:44.518777 16 |
|
17 0.555774 1.7433 0.00008818 2023-02-16 23:58:51.274291 17 |
|
18 0.547252 1.7285 0.00008238 2023-02-17 00:13:58.978149 18 |
|
19 0.541129 1.7179 0.00007696 2023-02-17 00:29:05.785775 19 |
|
20 0.534734 1.7070 0.00007190 2023-02-17 00:44:13.309628 20 |
|
21 0.529624 1.6983 0.00006717 2023-02-17 00:59:20.049124 21 |
|
22 0.523605 1.6881 0.00006275 2023-02-17 01:14:27.665985 22 |
|
23 0.518558 1.6796 0.00005863 2023-02-17 01:29:34.954975 23 |
|
24 0.514036 1.6720 0.00005477 2023-02-17 01:44:42.485682 24 |
|
25 0.508343 1.6625 0.00005117 2023-02-17 01:59:49.942508 25 |
|
26 0.504371 1.6559 0.00004781 2023-02-17 02:14:57.506359 26 |
|
27 0.499683 1.6482 0.00004466 2023-02-17 02:30:04.960042 27 |
|
28 0.495946 1.6421 0.00004172 2023-02-17 02:45:12.622386 28 |
|
29 0.493503 1.6380 0.00003898 2023-02-17 03:00:20.222849 29 |
|
30 0.488671 1.6301 0.00003642 2023-02-17 03:15:27.522774 30 |
|
31 0.484929 1.6241 0.00003402 2023-02-17 03:30:34.836139 31 |
|
32 0.482365 1.6199 0.00003178 2023-02-17 03:45:42.164712 32 |
|
33 0.479846 1.6158 0.00002969 2023-02-17 04:00:49.572241 33 |
|
34 0.477990 1.6128 0.00002774 2023-02-17 04:15:56.959704 34 |
|
35 0.473870 1.6062 0.00002592 2023-02-17 04:31:04.509452 35 |
|
36 0.470400 1.6006 0.00002421 2023-02-17 04:46:11.953945 36 |
|
37 0.468071 1.5969 0.00002262 2023-02-17 05:01:19.613997 37 |
|
38 0.465944 1.5935 0.00002113 2023-02-17 05:16:26.828705 38 |
|
39 0.465499 1.5928 0.00001974 2023-02-17 05:31:34.384875 39 |
|
40 0.462647 1.5883 0.00001845 2023-02-17 05:46:41.646224 40 |
|
41 0.460343 1.5846 0.00001723 2023-02-17 06:01:49.749365 41 |
|
42 0.459188 1.5828 0.00001610 2023-02-17 06:16:56.897575 42 |
|
43 0.457163 1.5796 0.00001504 2023-02-17 06:32:06.085611 43 |
|
44 0.456941 1.5792 0.00001405 2023-02-17 06:47:13.988648 44 |
|
NEW RUN 2023-02-17-06-49-04 |
|
{'data_order': '', 'load_model': 'out/rwkv-20.pth', 'wandb': '', 'proj_dir': 'out', 'random_seed': -1, 'data_file': '../data/tho.xyz', 'data_type': 'tho', 'vocab_size': 2944, 'ctx_len': 256, 'epoch_steps': 2000, 'epoch_count': 50, 'epoch_begin': 0, 'epoch_save': 2, 'micro_bsz': 92, 'n_layer': 6, 'n_embd': 640, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0003, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'ddp_find_unused_parameters_false', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-02-17-06-49-04', 'betas': (0.9, 0.99), 'real_bsz': 92, 'run_name': '0 ctx256 L6 D640'} |
|
0 0.504356 1.6559 0.00028027 2023-02-17 07:05:17.510254 0 |
|
NEW RUN 2023-02-17-07-18-02 |
|
{'data_order': '', 'load_model': 'out/rwkv-44.pth', 'wandb': '', 'proj_dir': 'out', 'random_seed': -1, 'data_file': '../data/tho.xyz', 'data_type': 'tho', 'vocab_size': 2944, 'ctx_len': 256, 'epoch_steps': 2000, 'epoch_count': 50, 'epoch_begin': 0, 'epoch_save': 2, 'micro_bsz': 92, 'n_layer': 6, 'n_embd': 640, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 4e-05, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.99, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'ddp_find_unused_parameters_false', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-02-17-07-18-02', 'betas': (0.9, 0.99), 'real_bsz': 92, 'run_name': '0 ctx256 L6 D640'} |
|
0 0.374237 1.4539 0.00003891 2023-02-17 07:34:15.065775 0 |
|
1 0.374224 1.4539 0.00003784 2023-02-17 07:49:24.630882 1 |
|
2 0.373372 1.4526 0.00003681 2023-02-17 08:04:33.745072 2 |
|
3 0.372406 1.4512 0.00003580 2023-02-17 08:19:43.048645 3 |
|
4 0.370914 1.4491 0.00003482 2023-02-17 08:34:51.912647 4 |
|
5 0.369673 1.4473 0.00003387 2023-02-17 08:50:01.437347 5 |
|
6 0.368872 1.4461 0.00003294 2023-02-17 09:05:10.602278 6 |
|
7 0.365740 1.4416 0.00003204 2023-02-17 09:20:24.196625 7 |
|
8 0.365551 1.4413 0.00003117 2023-02-17 09:35:35.185035 8 |
|
9 0.364295 1.4395 0.00003031 2023-02-17 09:50:44.481306 9 |
|
10 0.363085 1.4378 0.00002949 2023-02-17 10:05:54.004254 10 |
|
11 0.362373 1.4367 0.00002868 2023-02-17 10:21:03.700808 11 |
|
12 0.360404 1.4339 0.00002789 2023-02-17 10:36:12.829567 12 |
|
13 0.358833 1.4317 0.00002713 2023-02-17 10:51:22.632951 13 |
|
14 0.357407 1.4296 0.00002639 2023-02-17 11:06:31.992598 14 |
|
15 0.356000 1.4276 0.00002567 2023-02-17 11:21:43.433903 15 |
|
16 0.354691 1.4257 0.00002497 2023-02-17 11:36:57.306886 16 |
|
|