htlou committed
Commit 24f69f0 · verified · 1 Parent(s): ca154a7

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20241025_181518-qbvp2oju/run-qbvp2oju.wandb filter=lfs diff=lfs merge=lfs -text
arguments.yaml ADDED
@@ -0,0 +1,94 @@
+ bnb_cfgs:
+   bnb_4bit_compute_dtype: float16
+   bnb_4bit_quant_type: nf4
+   bnb_4bit_use_double_quant: true
+   load_in_4bit: true
+   load_in_8bit: false
+   use_bnb: false
+ data_cfgs:
+   eval_data_files: null
+   eval_datasets: null
+   eval_optional_args: []
+   eval_size: null
+   eval_split: null
+   eval_subset: null
+   eval_template: null
+   ptx_data_files: ti2ti_ptx_27k.pt
+   ptx_datasets: null
+   ptx_optional_args: []
+   ptx_size: null
+   ptx_split: null
+   ptx_subset: null
+   ptx_template: spavl_ti2ti
+   train_data_files: ti2ti_llf_prompt_only_tokenize.pt
+   train_datasets: /data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs
+   train_optional_args: []
+   train_size: 5000
+   train_split: null
+   train_subset: null
+   train_template: spavl_ti2ti
+ logger_cfgs:
+   cache_dir: null
+   log_project: align-anything
+   log_run_name: ppo
+   log_type: wandb
+   output_dir: ../outputs/ppo_ti2ti_baseline_1025_with_eval
+   save_interval: 30.0
+ lora_cfgs:
+   inference_mode: false
+   lora_alpha: 16
+   lora_dropout: 0.1
+   r: 16
+   save_full_model: true
+   target_modules:
+   - q_proj
+   - v_proj
+   task_type: TaskType.CAUSAL_LM
+   use_lora: false
+ model_cfgs:
+   actor_model_name_or_path: /data/align-anything/hantao/models/0916_ti_to_ti_sft
+   model_max_length: 2048
+   repetition_penalty: 1.0
+   reward_critic_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400
+   reward_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400
+   temperature: 1.0
+   top_p: 1.0
+   trust_remote_code: true
+ special_tokens: null
+ train_cfgs:
+   actor_gradient_checkpointing: true
+   actor_lr: 1.0e-05
+   actor_lr_scheduler_type: cosine
+   actor_lr_warmup_ratio: 0.03
+   actor_weight_decay: 0.01
+   adam_betas:
+   - 0.9
+   - 0.95
+   bf16: true
+   clip_range_ratio: 0.2
+   clip_range_score: 50.0
+   clip_range_value: 5.0
+   critic_gradient_checkpointing: true
+   critic_lr: 5.0e-06
+   critic_lr_scheduler_type: constant
+   critic_lr_warmup_ratio: 0.03
+   critic_weight_decay: 0.0
+   ds_cfgs: ds_z3_config.json
+   epochs: 3
+   eval_interval: 10
+   eval_strategy: epoch
+   fp16: false
+   freeze_language_model: true
+   freeze_mm_proj: true
+   freeze_vision_tower: false
+   gae_lambda: 0.95
+   gamma: 1.0
+   gradient_accumulation_steps: 2
+   kl_coeff: 0.02
+   normalize_reward: false
+   per_device_eval_batch_size: 8
+   per_device_prompt_batch_size: 8
+   per_device_train_batch_size: 8
+   ptx_coeff: 16.0
+   seed: 42
+   update_iters: 1
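
arguments.yaml above is the full configuration snapshot that the PPO trainer recorded for this run. As a minimal, editor-added sketch (assuming the file is plain YAML and PyYAML is installed; nothing below ships with the upload), the settings can be loaded and inspected like this:

    # illustrative example, not part of this repository
    import yaml  # PyYAML

    with open("arguments.yaml", "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)

    # top-level groups mirror the trainer's config sections
    print(sorted(cfg))                                    # ['bnb_cfgs', 'data_cfgs', 'logger_cfgs', ...]
    print(cfg["train_cfgs"]["actor_lr"])                  # 1e-05
    print(cfg["model_cfgs"]["actor_model_name_or_path"])
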
config.json ADDED
The diff for this file is too large to render. See raw diff
 
environ.txt ADDED
@@ -0,0 +1,56 @@
1
+ CONDA_DEFAULT_ENV=hantao_cham
2
+ CONDA_EXE=/home/align-anything/miniconda3/bin/conda
3
+ CONDA_PREFIX=/home/align-anything/miniconda3/envs/hantao_cham
4
+ CONDA_PREFIX_1=/home/align-anything/miniconda3
5
+ CONDA_PROMPT_MODIFIER=(hantao_cham)
6
+ CONDA_PYTHON_EXE=/home/align-anything/miniconda3/bin/python
7
+ CONDA_SHLVL=2
8
+ CRASHDIR=/etc/ShellCrash
9
+ CROSS_RANK=0
10
+ CROSS_SIZE=1
11
+ CUDA_MODULE_LOADING=LAZY
12
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
13
+ DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/2000/bus
14
+ HOME=/home/align-anything
15
+ LANG=en_US.UTF-8
16
+ LD_LIBRARY_PATH=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/../../lib64:
17
+ LESSCLOSE=/usr/bin/lesspipe %s %s
18
+ LESSOPEN=| /usr/bin/lesspipe %s
19
+ LOCAL_RANK=0
20
+ LOCAL_SIZE=8
21
+ LOGLEVEL=WARNING
22
+ LOGNAME=align-anything
23
+ LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
24
+ MASTER_ADDR=127.0.0.1
25
+ MASTER_PORT=63303
26
+ MOTD_SHOWN=pam
27
+ OLDPWD=/data/align-anything/hantao/align-anything/projects/text_image_to_text_image
28
+ PATH=/home/align-anything/miniconda3/envs/hantao_cham/bin:/home/align-anything/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
29
+ PWD=/data/align-anything/hantao/align-anything/scripts
30
+ PYGAME_HIDE_SUPPORT_PROMPT=1
31
+ PYTHONHASHSEED=42
32
+ PYTHONPATH=/data/align-anything/hantao/align-anything
33
+ QT_QPA_FONTDIR=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/qt/fonts
34
+ QT_QPA_PLATFORM_PLUGIN_PATH=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/qt/plugins
35
+ RANK=0
36
+ SHELL=/bin/bash
37
+ SHLVL=3
38
+ SSH_CLIENT=111.205.230.212 28724 30500
39
+ SSH_CONNECTION=111.205.230.212 62683 10.10.212.195 30500
40
+ SSH_TTY=/dev/pts/2
41
+ TERM=screen
42
+ TMUX=/tmp/tmux-2000/default,90929,6
43
+ TMUX_PANE=%6
44
+ USER=align-anything
45
+ WANDB_API_KEY=7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33
46
+ WANDB_MODE=online
47
+ WANDB_SERVICE=2-937440-tcp-localhost-44607
48
+ WORLD_SIZE=8
49
+ XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
50
+ XDG_RUNTIME_DIR=/run/user/2000
51
+ XDG_SESSION_CLASS=user
52
+ XDG_SESSION_ID=4
53
+ XDG_SESSION_TYPE=tty
54
+ _=/home/align-anything/miniconda3/envs/hantao_cham/bin/deepspeed
55
+ _CE_CONDA=
56
+ _CE_M=
preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "crop_size": {
+     "height": 512,
+     "width": 512
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     1.0,
+     1.0,
+     1.0
+   ],
+   "image_processor_type": "ChameleonImageProcessor",
+   "image_std": [
+     1.0,
+     1.0,
+     1.0
+   ],
+   "processor_class": "ChameleonProcessor",
+   "resample": 1,
+   "rescale_factor": 0.0078,
+   "size": {
+     "shortest_edge": 512
+   }
+ }
processor_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "image_seq_length": 1024,
+   "image_token": "<image>",
+   "processor_class": "ChameleonProcessor"
+ }
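
Together, preprocessor_config.json and processor_config.json describe the ChameleonProcessor bundled with this checkpoint: images are center-cropped and resized to 512x512, and each image occupies an image_seq_length of 1024 positions marked by the <image> token. A rough, editor-added sketch of how they might be consumed (assuming a local copy of this folder and a transformers build with Chameleon support; the run itself used transformers==4.44.0.dev0, and the paths below are placeholders):

    # illustrative only, not part of the upload
    from PIL import Image
    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("path/to/this/checkpoint")  # hypothetical local path
    inputs = processor(
        text="<image> Describe the image.",
        images=Image.open("example.jpg"),  # hypothetical input image
        return_tensors="pt",
    )
    print(inputs["pixel_values"].shape)  # 512x512 crops per preprocessor_config.json
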
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c201d7f9317f729765675c0940a46cd4b1675dd9ba6d5b9b5da3cdafb564faa
+ size 14165009930
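
The three lines above are a Git LFS pointer rather than the weights themselves; the actual ~14.2 GB pytorch_model.bin is stored through LFS. As an optional, editor-added integrity check (assuming the real file has already been fetched, for example with git lfs pull or huggingface_hub), the download can be verified against the oid in the pointer:

    # illustrative only: recompute the SHA-256 and compare with the LFS pointer
    import hashlib

    EXPECTED = "4c201d7f9317f729765675c0940a46cd4b1675dd9ba6d5b9b5da3cdafb564faa"

    sha = hashlib.sha256()
    with open("pytorch_model.bin", "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
            sha.update(chunk)

    assert sha.hexdigest() == EXPECTED, "mismatch: corrupt download or still an LFS pointer stub"
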
script.sh ADDED
@@ -0,0 +1,50 @@
+ #!/usr/bin/env bash
+ #
+ # Copyright 2024 PKU-Alignment Team. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+
+ # Initialize variables
+ # For wandb online logging
+ export WANDB_API_KEY="7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33"
+ # Source the setup script
+ # source ./setup.sh
+
+ export WANDB_MODE=online
+
+ ACTOR_MODEL_NAME_OR_PATH="/data/align-anything/hantao/models/0916_ti_to_ti_sft"
+ CRITIC_MODEL_NAME_OR_PATH="/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400"
+ REWARD_MODEL_NAME_OR_PATH="/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400"
+ TRAIN_DATASETS="/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs"
+ PTX_DATASETS="/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs"
+ OUTPUT_DIR="../outputs/ppo_ti2ti_baseline_1025_with_eval"
+
+ # Source the setup script
+ source ./setup.sh
+
+ # Execute deepspeed command
+ deepspeed \
+   --master_port ${MASTER_PORT} \
+   --module align_anything.trainers.text_image_to_text_image.ppo \
+   --actor_model_name_or_path ${ACTOR_MODEL_NAME_OR_PATH} \
+   --reward_model_name_or_path ${REWARD_MODEL_NAME_OR_PATH} \
+   --reward_critic_model_name_or_path ${CRITIC_MODEL_NAME_OR_PATH} \
+   --train_datasets ${TRAIN_DATASETS} \
+   --train_template spavl_ti2ti \
+   --train_data_files ti2ti_llf_prompt_only_tokenize.pt \
+   --ptx_template spavl_ti2ti \
+   --ptx_data_files ti2ti_ptx_27k.pt \
+   --output_dir ${OUTPUT_DIR} \
+   --save_interval 30
+
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "<reserved08706>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
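
special_tokens_map.json pins <s>, </s>, <pad>, <reserved08706>, and <unk> as the special tokens of the bundled tokenizer (tokenizer.json and tokenizer_config.json below). A quick, editor-added sanity check, assuming a local copy of this folder and with the checkpoint path as a placeholder:

    # illustrative only: confirm the special tokens resolve to ids
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("path/to/this/checkpoint")  # hypothetical local path
    for name in ("bos_token", "eos_token", "pad_token", "sep_token", "unk_token"):
        token = getattr(tok, name)
        print(name, repr(token), tok.convert_tokens_to_ids(token))
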
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
wandb/debug-internal.log ADDED
@@ -0,0 +1,20 @@
1
+ {"time":"2024-10-25T18:15:18.987617848Z","level":"INFO","msg":"using version","core version":"0.18.3"}
2
+ {"time":"2024-10-25T18:15:18.987649473Z","level":"INFO","msg":"created symlink","path":"../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug-core.log"}
3
+ {"time":"2024-10-25T18:15:18.991400712Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
4
+ {"time":"2024-10-25T18:15:19.015335603Z","level":"INFO","msg":"created new stream","id":"qbvp2oju"}
5
+ {"time":"2024-10-25T18:15:19.015397376Z","level":"INFO","msg":"stream: started","id":"qbvp2oju"}
6
+ {"time":"2024-10-25T18:15:19.015408377Z","level":"INFO","msg":"handler: started","stream_id":{"value":"qbvp2oju"}}
7
+ {"time":"2024-10-25T18:15:19.015432033Z","level":"INFO","msg":"sender: started","stream_id":{"value":"qbvp2oju"}}
8
+ {"time":"2024-10-25T18:15:19.015437112Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"qbvp2oju"}}
9
+ {"time":"2024-10-25T18:15:20.634593869Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
10
+ {"time":"2024-10-25T18:15:20.637814914Z","level":"INFO","msg":"Starting system monitor"}
11
+ {"time":"2024-10-25T19:38:13.593466266Z","level":"INFO","msg":"Stopping system monitor"}
12
+ {"time":"2024-10-25T19:38:13.627014655Z","level":"INFO","msg":"Stopped system monitor"}
13
+ {"time":"2024-10-25T19:38:14.559855674Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
14
+ {"time":"2024-10-25T19:38:14.559906183Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
15
+ {"time":"2024-10-25T19:38:15.545457735Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
16
+ {"time":"2024-10-25T19:38:17.12240115Z","level":"INFO","msg":"stream: closing","id":"qbvp2oju"}
17
+ {"time":"2024-10-25T19:38:17.12243525Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"qbvp2oju"}}
18
+ {"time":"2024-10-25T19:38:17.122460489Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"qbvp2oju"}}
19
+ {"time":"2024-10-25T19:38:17.122575437Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"qbvp2oju"}}
20
+ {"time":"2024-10-25T19:38:17.124870943Z","level":"INFO","msg":"stream: closed","id":"qbvp2oju"}
wandb/debug.log ADDED
@@ -0,0 +1,33 @@
1
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Configure stats pid to 937440
3
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
4
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
5
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
7
+ 2024-10-25 18:15:18,977 WARNING MainThread:937440 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.ppo
8
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.ppo'}
9
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_init.py:_log_setup():532] Logging user logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug.log
11
+ 2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:_log_setup():533] Logging internal logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug-internal.log
12
+ 2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():617] calling init triggers
13
+ 2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
14
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_prompt_batch_size': 8, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 2, 'actor_gradient_checkpointing': True, 'critic_gradient_checkpointing': True, 'actor_lr': 1e-05, 'actor_lr_scheduler_type': 'cosine', 'actor_lr_warmup_ratio': 0.03, 'actor_weight_decay': 0.01, 'critic_lr': 5e-06, 'critic_lr_scheduler_type': 'constant', 'critic_lr_warmup_ratio': 0.03, 'critic_weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'kl_coeff': 0.02, 'clip_range_ratio': 0.2, 'clip_range_score': 50.0, 'clip_range_value': 5.0, 'ptx_coeff': 16.0, 'gamma': 1.0, 'gae_lambda': 0.95, 'normalize_reward': False, 'update_iters': 1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs', 'train_template': 'spavl_ti2ti', 'train_size': 5000, 'train_split': None, 'train_subset': None, 'train_data_files': 'ti2ti_llf_prompt_only_tokenize.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': [], 'ptx_datasets': None, 'ptx_template': 'spavl_ti2ti', 'ptx_size': None, 'ptx_subset': None, 'ptx_split': None, 'ptx_data_files': 'ti2ti_ptx_27k.pt', 'ptx_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'ppo', 'output_dir': '../outputs/ppo_ti2ti_baseline_1025_with_eval', 'cache_dir': None, 'save_interval': 30.0}, 'model_cfgs': {'actor_model_name_or_path': '/data/align-anything/hantao/models/0916_ti_to_ti_sft', 'reward_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'reward_critic_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'trust_remote_code': True, 'model_max_length': 2048, 'temperature': 1.0, 'top_p': 1.0, 'repetition_penalty': 1.0}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': None}
15
+ 2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():667] starting backend
16
+ 2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():671] sending inform_init request
17
+ 2024-10-25 18:15:18,982 INFO MainThread:937440 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
18
+ 2024-10-25 18:15:18,983 INFO MainThread:937440 [wandb_init.py:init():684] backend started and connected
19
+ 2024-10-25 18:15:18,986 INFO MainThread:937440 [wandb_init.py:init():779] updated telemetry
20
+ 2024-10-25 18:15:18,996 INFO MainThread:937440 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
21
+ 2024-10-25 18:15:20,628 INFO MainThread:937440 [wandb_init.py:init():863] starting run threads in backend
22
+ 2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_console_start():2465] atexit reg
23
+ 2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2313] redirect: wrap_raw
24
+ 2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2378] Wrapping output streams.
25
+ 2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2403] Redirects installed.
26
+ 2024-10-25 18:15:20,776 INFO MainThread:937440 [wandb_init.py:init():907] run started, returning control to user process
27
+ 2024-10-25 19:38:13,587 INFO MainThread:937440 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/qbvp2oju
28
+ 2024-10-25 19:38:13,590 INFO MainThread:937440 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
29
+ 2024-10-25 19:38:13,591 INFO MainThread:937440 [wandb_run.py:_restore():2410] restore
30
+ 2024-10-25 19:38:13,592 INFO MainThread:937440 [wandb_run.py:_restore():2416] restore done
31
+ 2024-10-25 19:38:17,104 INFO MainThread:937440 [wandb_run.py:_footer_history_summary_info():4049] rendering history
32
+ 2024-10-25 19:38:17,107 INFO MainThread:937440 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
33
+ 2024-10-25 19:38:17,119 INFO MainThread:937440 [wandb_run.py:_footer_sync_info():4008] logging synced files
wandb/run-20241025_180620-eoegk43l/files/output.log ADDED
File without changes
wandb/run-20241025_180620-eoegk43l/files/requirements.txt ADDED
@@ -0,0 +1,233 @@
1
+ align-anything==0.0.1.dev0
2
+ torch==2.4.0
3
+ pycparser==2.22
4
+ torchvision==0.19.0
5
+ multiprocess==0.70.16
6
+ braceexpand==0.1.7
7
+ lm-format-enforcer==0.10.6
8
+ Jinja2==3.1.4
9
+ scikit-learn==1.5.2
10
+ interegular==0.3.3
11
+ starlette==0.38.6
12
+ huggingface-hub==0.25.2
13
+ pyairports==2.1.1
14
+ protobuf==3.20.3
15
+ term-image==0.7.2
16
+ python-dateutil==2.9.0.post0
17
+ identify==2.6.1
18
+ tokenizers==0.19.1
19
+ tensorboard-data-server==0.7.2
20
+ numba==0.60.0
21
+ ninja==1.11.1.1
22
+ nvidia-cuda-cupti-cu12==12.1.105
23
+ diskcache==5.6.3
24
+ pycountry==24.6.1
25
+ py-cpuinfo==9.0.0
26
+ scipy==1.14.1
27
+ soxr==0.5.0.post1
28
+ prometheus-fastapi-instrumentator==7.0.0
29
+ align-anything==0.0.1.dev0
30
+ virtualenv==20.26.6
31
+ hjson==3.1.0
32
+ nvidia-cudnn-cu12==9.1.0.70
33
+ termcolor==2.5.0
34
+ grpcio==1.66.2
35
+ wheel==0.44.0
36
+ torchlibrosa==0.1.0
37
+ numpy==1.26.4
38
+ msgpack==1.1.0
39
+ rpds-py==0.20.0
40
+ annotated-types==0.7.0
41
+ pre_commit==4.0.1
42
+ aiohttp==3.10.10
43
+ audioread==3.0.1
44
+ lazy_loader==0.4
45
+ nvidia-cuda-runtime-cu12==12.1.105
46
+ filelock==3.16.1
47
+ timm==0.6.13
48
+ anyio==4.6.0
49
+ pydantic_core==2.23.4
50
+ idna==3.10
51
+ fastapi==0.115.0
52
+ wandb==0.18.3
53
+ packaging==24.1
54
+ yt-dlp==2024.8.6
55
+ matplotlib==3.9.2
56
+ websockets==12.0
57
+ triton==3.0.0
58
+ zipp==3.20.2
59
+ requests==2.32.3
60
+ xxhash==3.5.0
61
+ image-reward==1.5
62
+ pytorch-fid==0.3.0
63
+ imageio-ffmpeg==0.5.1
64
+ args==0.1.0
65
+ llvmlite==0.43.0
66
+ peft==0.13.2
67
+ openai==1.51.2
68
+ httpx==0.27.2
69
+ nvidia-cublas-cu12==12.1.3.1
70
+ pytest-split==0.8.0
71
+ ruff==0.6.9
72
+ sniffio==1.3.1
73
+ yarl==1.15.0
74
+ pandas==2.2.3
75
+ fsspec==2024.6.1
76
+ gguf==0.10.0
77
+ diffusers==0.30.3
78
+ platformdirs==4.3.6
79
+ nvidia-cuda-nvrtc-cu12==12.1.105
80
+ imageio==2.35.1
81
+ Brotli==1.1.0
82
+ bitsandbytes==0.44.1
83
+ hpsv2==1.2.0
84
+ lark==1.2.2
85
+ gradio==5.0.2
86
+ pydantic==2.9.2
87
+ pytz==2024.2
88
+ jsonschema-specifications==2024.10.1
89
+ deepspeed==0.15.2
90
+ cloudpickle==3.1.0
91
+ distro==1.9.0
92
+ aiohappyeyeballs==2.4.3
93
+ Markdown==3.7
94
+ docker-pycreds==0.4.0
95
+ semantic-version==2.10.0
96
+ resampy==0.4.3
97
+ urllib3==2.2.3
98
+ nodeenv==1.9.1
99
+ click==8.1.7
100
+ accelerate==1.0.1
101
+ dill==0.3.8
102
+ setproctitle==1.3.3
103
+ httpcore==1.0.6
104
+ pooch==1.8.2
105
+ importlib_metadata==8.5.0
106
+ cfgv==3.4.0
107
+ einops==0.8.0
108
+ shellingham==1.5.4
109
+ pytest==7.2.0
110
+ python-dotenv==1.0.1
111
+ pydub==0.25.1
112
+ kiwisolver==1.4.7
113
+ aiofiles==23.2.1
114
+ vllm==0.6.2
115
+ Werkzeug==3.0.4
116
+ tensorboard==2.18.0
117
+ joblib==1.4.2
118
+ pycryptodomex==3.21.0
119
+ moviepy==1.0.3
120
+ typing_extensions==4.12.2
121
+ mdurl==0.1.2
122
+ mistral_common==1.4.4
123
+ rich==13.9.2
124
+ aiosignal==1.3.1
125
+ mmsg==0.1.dev20+g585c63a.d20241012
126
+ pillow==10.4.0
127
+ prometheus_client==0.21.0
128
+ nvidia-cusolver-cu12==11.4.5.107
129
+ typer==0.12.5
130
+ pyzmq==26.2.0
131
+ h11==0.14.0
132
+ gitdb==4.0.11
133
+ transformers==4.44.0.dev0
134
+ nvidia-nccl-cu12==2.20.5
135
+ jsonschema==4.23.0
136
+ soundfile==0.12.1
137
+ contourpy==1.3.0
138
+ mutagen==1.47.0
139
+ regex==2024.9.11
140
+ orjson==3.10.7
141
+ fairscale==0.4.13
142
+ partial-json-parser==0.2.1.1.post4
143
+ outlines==0.1.1.dev4+ga2fd35c
144
+ nvidia-curand-cu12==10.3.2.106
145
+ pluggy==1.5.0
146
+ GitPython==3.1.43
147
+ tzdata==2024.2
148
+ uvicorn==0.31.1
149
+ sentencepiece==0.2.0
150
+ decorator==4.4.2
151
+ nvidia-nvjitlink-cu12==12.6.77
152
+ distlib==0.3.9
153
+ uvloop==0.20.0
154
+ networkx==3.4.1
155
+ wcwidth==0.2.13
156
+ opencv-python==4.6.0.66
157
+ six==1.16.0
158
+ httptools==0.6.1
159
+ safetensors==0.4.5
160
+ nvidia-nvtx-cu12==12.1.105
161
+ markdown-it-py==3.0.0
162
+ certifi==2024.8.30
163
+ sentry-sdk==2.16.0
164
+ outlines_core==0.1.0
165
+ threadpoolctl==3.5.0
166
+ nvidia-cufft-cu12==11.0.2.54
167
+ datasets==3.0.1
168
+ cycler==0.12.1
169
+ psutil==6.0.0
170
+ nvidia-cusparse-cu12==12.1.0.106
171
+ shortuuid==1.0.13
172
+ ffmpy==0.4.0
173
+ xformers==0.0.27.post2
174
+ MarkupSafe==2.1.5
175
+ tqdm==4.66.5
176
+ gradio_client==1.4.0
177
+ attrs==24.2.0
178
+ optree==0.13.0
179
+ PyYAML==6.0.2
180
+ clint==0.5.1
181
+ torchaudio==2.4.0
182
+ frechet-audio-distance==0.1.2
183
+ frozenlist==1.4.1
184
+ clip==0.2.0
185
+ multidict==6.1.0
186
+ propcache==0.2.0
187
+ librosa==0.10.2.post1
188
+ webdataset==0.2.100
189
+ ray==2.37.0
190
+ pyparsing==3.1.4
191
+ pyarrow==17.0.0
192
+ tiktoken==0.7.0
193
+ watchfiles==0.24.0
194
+ proglog==0.1.10
195
+ cachetools==5.5.0
196
+ fonttools==4.54.1
197
+ charset-normalizer==3.4.0
198
+ ftfy==6.3.0
199
+ referencing==0.35.1
200
+ mpmath==1.3.0
201
+ msgspec==0.18.6
202
+ nvidia-ml-py==12.535.161
203
+ smmap==5.0.1
204
+ absl-py==2.1.0
205
+ python-multipart==0.0.12
206
+ Pygments==2.18.0
207
+ iniconfig==2.0.0
208
+ sympy==1.13.3
209
+ pip==24.2
210
+ airportsdata==20241001
211
+ tomlkit==0.12.0
212
+ nest-asyncio==1.6.0
213
+ setuptools==75.1.0
214
+ jiter==0.6.1
215
+ cffi==1.17.1
216
+ nvitop==1.3.2
217
+ backports.tarfile==1.2.0
218
+ zipp==3.19.2
219
+ inflect==7.3.1
220
+ autocommand==2.2.2
221
+ importlib_resources==6.4.0
222
+ packaging==24.1
223
+ jaraco.context==5.3.0
224
+ typeguard==4.3.0
225
+ more-itertools==10.3.0
226
+ jaraco.text==3.12.1
227
+ platformdirs==4.2.2
228
+ wheel==0.43.0
229
+ typing_extensions==4.12.2
230
+ importlib_metadata==8.0.0
231
+ tomli==2.0.1
232
+ jaraco.collections==5.1.0
233
+ jaraco.functools==4.0.1
wandb/run-20241025_180620-eoegk43l/files/wandb-metadata.json ADDED
@@ -0,0 +1,106 @@
1
+ {
2
+ "os": "Linux-5.4.0-198-generic-x86_64-with-glibc2.31",
3
+ "python": "3.11.10",
4
+ "startedAt": "2024-10-25T18:06:20.375892Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--actor_model_name_or_path",
8
+ "/data/align-anything/hantao/models/0916_ti_to_ti_sft",
9
+ "--reward_model_name_or_path",
10
+ "/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400",
11
+ "--reward_critic_model_name_or_path",
12
+ "/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400",
13
+ "--train_datasets",
14
+ "/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs",
15
+ "--train_template",
16
+ "spavl_ti2ti",
17
+ "--train_data_files",
18
+ "ti2ti_preference_filtered_tokenize_full.pt",
19
+ "--ptx_template",
20
+ "spavl_ti2ti",
21
+ "--ptx_data_files",
22
+ "ti2ti_ptx_27k.pt",
23
+ "--output_dir",
24
+ "../outputs/ppo_ti2ti_baseline_1025_with_eval",
25
+ "--save_interval",
26
+ "30"
27
+ ],
28
+ "program": "-m align_anything.trainers.text_image_to_text_image.ppo",
29
+ "git": {
30
+ "remote": "https://github.com/PKU-Alignment/align-anything.git",
31
+ "commit": "6fde660afc9985323f147930eedf188a5699adc7"
32
+ },
33
+ "email": "[email protected]",
34
+ "root": "../outputs/ppo_ti2ti_baseline_1025_with_eval",
35
+ "host": "lyg0195",
36
+ "username": "align-anything",
37
+ "executable": "/home/align-anything/miniconda3/envs/hantao_cham/bin/python",
38
+ "cpu_count": 64,
39
+ "cpu_count_logical": 128,
40
+ "gpu": "[NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB]",
41
+ "gpu_count": 8,
42
+ "disk": {
43
+ "/": {
44
+ "total": "938421047296",
45
+ "used": "363102785536"
46
+ }
47
+ },
48
+ "memory": {
49
+ "total": "540647575552"
50
+ },
51
+ "cpu": {
52
+ "count": 64,
53
+ "countLogical": 128
54
+ },
55
+ "gpu_nvidia": [
56
+ {
57
+ "name": "NVIDIA A100-SXM4-80GB",
58
+ "memoryTotal": "85899345920",
59
+ "cudaCores": 6912,
60
+ "architecture": "Ampere"
61
+ },
62
+ {
63
+ "name": "NVIDIA A100-SXM4-80GB",
64
+ "memoryTotal": "85899345920",
65
+ "cudaCores": 6912,
66
+ "architecture": "Ampere"
67
+ },
68
+ {
69
+ "name": "NVIDIA A100-SXM4-80GB",
70
+ "memoryTotal": "85899345920",
71
+ "cudaCores": 6912,
72
+ "architecture": "Ampere"
73
+ },
74
+ {
75
+ "name": "NVIDIA A100-SXM4-80GB",
76
+ "memoryTotal": "85899345920",
77
+ "cudaCores": 6912,
78
+ "architecture": "Ampere"
79
+ },
80
+ {
81
+ "name": "NVIDIA A100-SXM4-80GB",
82
+ "memoryTotal": "85899345920",
83
+ "cudaCores": 6912,
84
+ "architecture": "Ampere"
85
+ },
86
+ {
87
+ "name": "NVIDIA A100-SXM4-80GB",
88
+ "memoryTotal": "85899345920",
89
+ "cudaCores": 6912,
90
+ "architecture": "Ampere"
91
+ },
92
+ {
93
+ "name": "NVIDIA A100-SXM4-80GB",
94
+ "memoryTotal": "85899345920",
95
+ "cudaCores": 6912,
96
+ "architecture": "Ampere"
97
+ },
98
+ {
99
+ "name": "NVIDIA A100-SXM4-80GB",
100
+ "memoryTotal": "85899345920",
101
+ "cudaCores": 6912,
102
+ "architecture": "Ampere"
103
+ }
104
+ ],
105
+ "cudaVersion": "12.4"
106
+ }
wandb/run-20241025_180620-eoegk43l/logs/debug-internal.log ADDED
@@ -0,0 +1,10 @@
1
+ {"time":"2024-10-25T18:06:20.381500476Z","level":"INFO","msg":"using version","core version":"0.18.3"}
2
+ {"time":"2024-10-25T18:06:20.381537039Z","level":"INFO","msg":"created symlink","path":"../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_180620-eoegk43l/logs/debug-core.log"}
3
+ {"time":"2024-10-25T18:06:20.386238205Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
4
+ {"time":"2024-10-25T18:06:20.408793678Z","level":"INFO","msg":"created new stream","id":"eoegk43l"}
5
+ {"time":"2024-10-25T18:06:20.408868821Z","level":"INFO","msg":"stream: started","id":"eoegk43l"}
6
+ {"time":"2024-10-25T18:06:20.408926558Z","level":"INFO","msg":"sender: started","stream_id":{"value":"eoegk43l"}}
7
+ {"time":"2024-10-25T18:06:20.408909461Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"eoegk43l"}}
8
+ {"time":"2024-10-25T18:06:20.408921169Z","level":"INFO","msg":"handler: started","stream_id":{"value":"eoegk43l"}}
9
+ {"time":"2024-10-25T18:06:21.029852323Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
10
+ {"time":"2024-10-25T18:06:21.033290741Z","level":"INFO","msg":"Starting system monitor"}
wandb/run-20241025_180620-eoegk43l/logs/debug.log ADDED
@@ -0,0 +1,26 @@
1
+ 2024-10-25 18:06:20,368 INFO MainThread:935352 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2
+ 2024-10-25 18:06:20,368 INFO MainThread:935352 [wandb_setup.py:_flush():79] Configure stats pid to 935352
3
+ 2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
4
+ 2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
5
+ 2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
7
+ 2024-10-25 18:06:20,369 WARNING MainThread:935352 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.ppo
8
+ 2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.ppo'}
9
+ 2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_init.py:_log_setup():532] Logging user logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_180620-eoegk43l/logs/debug.log
11
+ 2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_init.py:_log_setup():533] Logging internal logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_180620-eoegk43l/logs/debug-internal.log
12
+ 2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_init.py:init():617] calling init triggers
13
+ 2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
14
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_prompt_batch_size': 8, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 2, 'actor_gradient_checkpointing': True, 'critic_gradient_checkpointing': True, 'actor_lr': 1e-05, 'actor_lr_scheduler_type': 'cosine', 'actor_lr_warmup_ratio': 0.03, 'actor_weight_decay': 0.01, 'critic_lr': 5e-06, 'critic_lr_scheduler_type': 'constant', 'critic_lr_warmup_ratio': 0.03, 'critic_weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'kl_coeff': 0.02, 'clip_range_ratio': 0.2, 'clip_range_score': 50.0, 'clip_range_value': 5.0, 'ptx_coeff': 16.0, 'gamma': 1.0, 'gae_lambda': 0.95, 'normalize_reward': False, 'update_iters': 1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs', 'train_template': 'spavl_ti2ti', 'train_size': 5000, 'train_split': None, 'train_subset': None, 'train_data_files': 'ti2ti_preference_filtered_tokenize_full.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': [], 'ptx_datasets': None, 'ptx_template': 'spavl_ti2ti', 'ptx_size': None, 'ptx_subset': None, 'ptx_split': None, 'ptx_data_files': 'ti2ti_ptx_27k.pt', 'ptx_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'ppo', 'output_dir': '../outputs/ppo_ti2ti_baseline_1025_with_eval', 'cache_dir': None, 'save_interval': 30.0}, 'model_cfgs': {'actor_model_name_or_path': '/data/align-anything/hantao/models/0916_ti_to_ti_sft', 'reward_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'reward_critic_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'trust_remote_code': True, 'model_max_length': 2048, 'temperature': 1.0, 'top_p': 1.0, 'repetition_penalty': 1.0}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': None}
15
+ 2024-10-25 18:06:20,370 INFO MainThread:935352 [wandb_init.py:init():667] starting backend
16
+ 2024-10-25 18:06:20,370 INFO MainThread:935352 [wandb_init.py:init():671] sending inform_init request
17
+ 2024-10-25 18:06:20,374 INFO MainThread:935352 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
18
+ 2024-10-25 18:06:20,375 INFO MainThread:935352 [wandb_init.py:init():684] backend started and connected
19
+ 2024-10-25 18:06:20,379 INFO MainThread:935352 [wandb_init.py:init():779] updated telemetry
20
+ 2024-10-25 18:06:20,389 INFO MainThread:935352 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
21
+ 2024-10-25 18:06:21,024 INFO MainThread:935352 [wandb_init.py:init():863] starting run threads in backend
22
+ 2024-10-25 18:06:21,186 INFO MainThread:935352 [wandb_run.py:_console_start():2465] atexit reg
23
+ 2024-10-25 18:06:21,186 INFO MainThread:935352 [wandb_run.py:_redirect():2313] redirect: wrap_raw
24
+ 2024-10-25 18:06:21,186 INFO MainThread:935352 [wandb_run.py:_redirect():2378] Wrapping output streams.
25
+ 2024-10-25 18:06:21,186 INFO MainThread:935352 [wandb_run.py:_redirect():2403] Redirects installed.
26
+ 2024-10-25 18:06:21,189 INFO MainThread:935352 [wandb_init.py:init():907] run started, returning control to user process
wandb/run-20241025_180620-eoegk43l/run-eoegk43l.wandb ADDED
File without changes
wandb/run-20241025_181518-qbvp2oju/files/config.yaml ADDED
@@ -0,0 +1,143 @@
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.18.3
4
+ m: []
5
+ python_version: 3.11.10
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 11
10
+ - 41
11
+ - 49
12
+ - 51
13
+ - 55
14
+ - 71
15
+ - 83
16
+ - 98
17
+ - 105
18
+ "2":
19
+ - 1
20
+ - 11
21
+ - 41
22
+ - 49
23
+ - 51
24
+ - 55
25
+ - 71
26
+ - 83
27
+ - 98
28
+ - 105
29
+ "3":
30
+ - 2
31
+ - 13
32
+ - 16
33
+ - 23
34
+ - 55
35
+ - 61
36
+ "4": 3.11.10
37
+ "5": 0.18.3
38
+ "6": 4.44.0.dev0
39
+ "8":
40
+ - 5
41
+ "12": 0.18.3
42
+ "13": linux-x86_64
43
+ bnb_cfgs:
44
+ value:
45
+ bnb_4bit_compute_dtype: float16
46
+ bnb_4bit_quant_type: nf4
47
+ bnb_4bit_use_double_quant: true
48
+ load_in_4bit: true
49
+ load_in_8bit: false
50
+ use_bnb: false
51
+ data_cfgs:
52
+ value:
53
+ eval_data_files: null
54
+ eval_datasets: null
55
+ eval_optional_args: []
56
+ eval_size: null
57
+ eval_split: null
58
+ eval_subset: null
59
+ eval_template: null
60
+ ptx_data_files: ti2ti_ptx_27k.pt
61
+ ptx_datasets: null
62
+ ptx_optional_args: []
63
+ ptx_size: null
64
+ ptx_split: null
65
+ ptx_subset: null
66
+ ptx_template: spavl_ti2ti
67
+ train_data_files: ti2ti_llf_prompt_only_tokenize.pt
68
+ train_datasets: /data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs
69
+ train_optional_args: []
70
+ train_size: 5000
71
+ train_split: null
72
+ train_subset: null
73
+ train_template: spavl_ti2ti
74
+ logger_cfgs:
75
+ value:
76
+ cache_dir: null
77
+ log_project: align-anything
78
+ log_run_name: ppo
79
+ log_type: wandb
80
+ output_dir: ../outputs/ppo_ti2ti_baseline_1025_with_eval
81
+ save_interval: 30
82
+ lora_cfgs:
83
+ value:
84
+ inference_mode: false
85
+ lora_alpha: 16
86
+ lora_dropout: 0.1
87
+ r: 16
88
+ save_full_model: true
89
+ target_modules:
90
+ - q_proj
91
+ - v_proj
92
+ task_type: TaskType.CAUSAL_LM
93
+ use_lora: false
94
+ model_cfgs:
95
+ value:
96
+ actor_model_name_or_path: /data/align-anything/hantao/models/0916_ti_to_ti_sft
97
+ model_max_length: 2048
98
+ repetition_penalty: 1
99
+ reward_critic_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400
100
+ reward_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400
101
+ temperature: 1
102
+ top_p: 1
103
+ trust_remote_code: true
104
+ special_tokens:
105
+ value: null
106
+ train_cfgs:
107
+ value:
108
+ actor_gradient_checkpointing: true
109
+ actor_lr: 1e-05
110
+ actor_lr_scheduler_type: cosine
111
+ actor_lr_warmup_ratio: 0.03
112
+ actor_weight_decay: 0.01
113
+ adam_betas:
114
+ - 0.9
115
+ - 0.95
116
+ bf16: true
117
+ clip_range_ratio: 0.2
118
+ clip_range_score: 50
119
+ clip_range_value: 5
120
+ critic_gradient_checkpointing: true
121
+ critic_lr: 5e-06
122
+ critic_lr_scheduler_type: constant
123
+ critic_lr_warmup_ratio: 0.03
124
+ critic_weight_decay: 0
125
+ ds_cfgs: ds_z3_config.json
126
+ epochs: 3
127
+ eval_interval: 10
128
+ eval_strategy: epoch
129
+ fp16: false
130
+ freeze_language_model: true
131
+ freeze_mm_proj: true
132
+ freeze_vision_tower: false
133
+ gae_lambda: 0.95
134
+ gamma: 1
135
+ gradient_accumulation_steps: 2
136
+ kl_coeff: 0.02
137
+ normalize_reward: false
138
+ per_device_eval_batch_size: 8
139
+ per_device_prompt_batch_size: 8
140
+ per_device_train_batch_size: 8
141
+ ptx_coeff: 16
142
+ seed: 42
143
+ update_iters: 1
wandb/run-20241025_181518-qbvp2oju/files/output.log ADDED
@@ -0,0 +1,307 @@
1
+ ***** Running training *****
2
+ Training 1/3 epoch: 0%| | 0/237 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
3
+
4
+ [2024-10-25 18:23:22,854] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
5
+ [2024-10-25 18:23:27,196] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
6
+ [2024-10-25 18:27:32,436] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
7
+ [2024-10-25 18:27:37,446] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
8
+ [2024-10-25 18:32:36,133] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
9
+ [2024-10-25 18:32:41,569] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
10
+ [2024-10-25 18:36:08,160] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
11
+ [2024-10-25 18:36:12,414] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
12
+ [2024-10-25 18:39:02,940] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
13
+ [2024-10-25 18:39:07,161] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
14
+ [2024-10-25 18:41:58,177] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
15
+ [2024-10-25 18:42:02,629] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
16
+ [2024-10-25 18:43:57,357] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
17
+ [2024-10-25 18:44:01,125] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
18
+ [2024-10-25 18:45:40,196] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
19
+ [2024-10-25 18:45:44,071] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
20
+ [2024-10-25 18:47:40,184] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
21
+ [2024-10-25 18:47:40,185] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[9.908858470377793e-06, 9.908858470377793e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
22
+ [2024-10-25 18:47:40,185] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=14.760689632781455, CurrSamplesPerSec=16.90950583092757, MemAllocated=33.11GB, MaxMemAllocated=47.22GB
23
+ [2024-10-25 18:47:43,802] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
24
+ [2024-10-25 18:47:43,802] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
25
+ [2024-10-25 18:47:43,803] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=15.436775313806873, CurrSamplesPerSec=17.42788205848213, MemAllocated=33.11GB, MaxMemAllocated=47.22GB
26
+ [2024-10-25 18:48:48,658] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
27
+ [2024-10-25 18:48:52,182] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
28
+ [2024-10-25 18:49:14,049] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
29
+ [2024-10-25 18:49:17,288] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
30
+ [2024-10-25 18:49:38,002] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
31
+ [2024-10-25 18:49:41,318] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
32
+ [2024-10-25 18:50:04,280] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
33
+ [2024-10-25 18:50:07,681] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
34
+ Saving checkpoint at step 30 ...
35
+ Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
36
+ Saving 16-bit model...
37
+ [2024-10-25 18:50:21,078] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step15 is about to be saved!
38
+ [2024-10-25 18:50:21,079] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin, tag: global_step15
39
+ [2024-10-25 18:50:21,079] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin...
40
+ [2024-10-25 18:50:36,754] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin.
41
+ [2024-10-25 18:50:36,755] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step15 is ready now!
42
+ Model saved!
43
+ Saving 16-bit model...
44
+ [2024-10-25 18:50:47,513] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step15 is about to be saved!
45
+ [2024-10-25 18:50:47,514] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin, tag: global_step15
46
+ [2024-10-25 18:50:47,514] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin...
47
+ [2024-10-25 18:51:06,021] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin.
48
+ [2024-10-25 18:51:06,021] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step15 is ready now!
49
+ Model saved!
50
+ Model saved!
51
+ Checkpoint saved.
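For reference, the duplicated "Saving 16-bit model..." blocks above (both writing pytorch_model_30.bin) are consistent with two engines, presumably actor and reward critic, each consolidating its ZeRO-3 shards via DeepSpeed's `save_16bit_model`. A hedged sketch of that pattern, not the repository's actual code:

```python
def save_16bit_snapshots(engines, output_dir: str, step: int) -> None:
    """Sketch of the save pattern seen in the log; `engines` would be the two
    DeepSpeedEngine objects. Both calls target the same filename, so the later
    write overwrites the earlier one, matching the repeated log lines."""
    filename = f"pytorch_model_{step}.bin"  # e.g. pytorch_model_30.bin
    for engine in engines:
        # Gathers the ZeRO-3 partitioned weights on rank 0 and writes a single
        # 16-bit state dict. The tag in the log (e.g. global_step15) is
        # DeepSpeed's internal step counter, separate from the filename step.
        engine.save_16bit_model(output_dir, filename)
```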
52
+ [2024-10-25 18:51:27,052] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
53
+ [2024-10-25 18:51:30,347] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
54
+ [2024-10-25 18:51:50,999] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
55
+ [2024-10-25 18:51:54,300] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
56
+ [2024-10-25 18:52:19,971] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
57
+ [2024-10-25 18:52:23,379] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
58
+ [2024-10-25 18:52:45,642] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
59
+ [2024-10-25 18:52:48,945] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
60
+ [2024-10-25 18:53:09,437] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
61
+ [2024-10-25 18:53:09,438] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[9.470431355738257e-06, 9.470431355738257e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
62
+ [2024-10-25 18:53:09,438] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=16.735259629101844, CurrSamplesPerSec=18.7639256774801, MemAllocated=33.18GB, MaxMemAllocated=47.22GB
63
+ [2024-10-25 18:53:12,725] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
64
+ [2024-10-25 18:53:12,725] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
65
+ [2024-10-25 18:53:12,726] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=17.458411197853028, CurrSamplesPerSec=19.786118059003716, MemAllocated=33.18GB, MaxMemAllocated=47.22GB
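The two optimizer groups log different schedules here: the actor lr decays (9.47e-06 at step 20, falling toward ~1e-07 later in the log) while the critic lr stays pinned at 5e-06. The sketch below is only a rough cosine-with-warmup sanity check; the peak lr, warmup, and total-step values are assumptions fitted to the logged numbers, not values read from the training script.

```python
import math


def cosine_with_warmup(step: int, peak_lr: float, warmup: int, total: int) -> float:
    """Linear warmup followed by cosine decay to zero."""
    if step < warmup:
        return peak_lr * step / max(1, warmup)
    progress = (step - warmup) / max(1, total - warmup)
    return 0.5 * peak_lr * (1.0 + math.cos(math.pi * progress))


# Assumed peak_lr=1e-05, warmup=4, total=118: prints ~9.5e-06, ~6.5e-06,
# ~1.2e-07, in the same ballpark as the actor lr logged at steps 20, 50, 110.
for step in (20, 50, 110):
    print(step, f"{cosine_with_warmup(step, 1e-05, 4, 118):.3e}")
```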
66
+ [2024-10-25 18:54:19,714] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
67
+ [2024-10-25 18:54:23,021] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
68
+ [2024-10-25 18:55:07,181] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
69
+ [2024-10-25 18:55:10,462] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
70
+ [2024-10-25 18:55:30,696] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
71
+ [2024-10-25 18:55:34,030] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
72
+ [2024-10-25 18:55:54,073] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
73
+ [2024-10-25 18:55:57,348] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
74
+ [2024-10-25 18:57:04,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[8.70045279830626e-06, 8.70045279830626e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
75
+ [2024-10-25 18:57:04,749] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=17.50081132260227, CurrSamplesPerSec=19.186032450205442, MemAllocated=33.15GB, MaxMemAllocated=47.22GB
76
+ [2024-10-25 18:57:08,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
77
+ [2024-10-25 18:57:08,051] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=18.204578219677924, CurrSamplesPerSec=19.780692782398315, MemAllocated=33.15GB, MaxMemAllocated=47.22GB
78
+ Saving checkpoint at step 60 ...
79
+ Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
80
+ Saving 16-bit model...
81
+ [2024-10-25 18:57:21,740] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step30 is about to be saved!
82
+ [2024-10-25 18:57:21,741] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin, tag: global_step30
83
+ [2024-10-25 18:57:21,741] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin...
84
+ [2024-10-25 18:57:38,185] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin.
85
+ [2024-10-25 18:57:38,186] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step30 is ready now!
86
+ Model saved!
87
+ Saving 16-bit model...
88
+ [2024-10-25 18:57:46,170] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step30 is about to be saved!
89
+ [2024-10-25 18:57:46,171] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin, tag: global_step30
90
+ [2024-10-25 18:57:46,171] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin...
91
+ [2024-10-25 18:58:04,694] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin.
92
+ [2024-10-25 18:58:04,694] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step30 is ready now!
93
+ Model saved!
94
+ Model saved!
95
+ Checkpoint saved.
96
+ [2024-10-25 18:59:12,139] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
97
+ [2024-10-25 18:59:15,443] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
98
+ [2024-10-25 18:59:35,560] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
99
+ [2024-10-25 18:59:38,857] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
100
+ [2024-10-25 19:01:48,333] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[7.656028585269017e-06, 7.656028585269017e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
101
+ [2024-10-25 19:01:48,334] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=18.027035548451977, CurrSamplesPerSec=29.34925562325487, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
102
+ [2024-10-25 19:01:51,645] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
103
+ [2024-10-25 19:01:51,646] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=18.74402073106821, CurrSamplesPerSec=30.575559928357755, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
104
+ [2024-10-25 19:02:12,030] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
105
+ [2024-10-25 19:02:15,280] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
106
+ [2024-10-25 19:02:58,678] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
107
+ [2024-10-25 19:03:01,948] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
108
+ [2024-10-25 19:03:22,304] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
109
+ [2024-10-25 19:03:25,588] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
110
+ [2024-10-25 19:03:45,571] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
111
+ [2024-10-25 19:03:48,886] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
112
+ Saving checkpoint at step 90 ...
113
+ Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
114
+ Saving 16-bit model...
115
+ [2024-10-25 19:04:00,883] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step45 is about to be saved!
116
+ [2024-10-25 19:04:00,885] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin, tag: global_step45
117
+ [2024-10-25 19:04:00,885] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin...
118
+ [2024-10-25 19:04:18,000] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin.
119
+ [2024-10-25 19:04:18,001] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step45 is ready now!
120
+ Model saved!
121
+ Saving 16-bit model...
122
+ [2024-10-25 19:04:26,278] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step45 is about to be saved!
123
+ [2024-10-25 19:04:26,279] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin, tag: global_step45
124
+ [2024-10-25 19:04:26,279] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin...
125
+ [2024-10-25 19:04:45,735] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin.
126
+ [2024-10-25 19:04:45,737] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step45 is ready now!
127
+ Model saved!
128
+ Model saved!
129
+ Checkpoint saved.
130
+ [2024-10-25 19:06:15,770] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
131
+ [2024-10-25 19:06:19,105] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
132
+ [2024-10-25 19:06:39,673] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
133
+ [2024-10-25 19:06:39,674] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[6.41461888258465e-06, 6.41461888258465e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
134
+ [2024-10-25 19:06:39,675] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=18.196574738389252, CurrSamplesPerSec=18.16333305616454, MemAllocated=33.11GB, MaxMemAllocated=47.22GB
135
+ [2024-10-25 19:06:43,151] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
136
+ [2024-10-25 19:06:43,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
137
+ [2024-10-25 19:06:43,153] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=18.924891588094603, CurrSamplesPerSec=19.22042501406237, MemAllocated=33.11GB, MaxMemAllocated=47.22GB
138
+ [2024-10-25 19:08:14,506] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
139
+ [2024-10-25 19:08:17,754] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
140
+ [2024-10-25 19:09:25,071] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
141
+ [2024-10-25 19:09:28,498] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
142
+ [2024-10-25 19:10:35,663] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
143
+ [2024-10-25 19:10:35,664] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[5.068293368829755e-06, 5.068293368829755e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
144
+ [2024-10-25 19:10:35,665] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=18.322746746684054, CurrSamplesPerSec=19.371088818318672, MemAllocated=33.14GB, MaxMemAllocated=47.22GB
145
+ [2024-10-25 19:10:38,940] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
146
+ [2024-10-25 19:10:38,940] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
147
+ [2024-10-25 19:10:38,941] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=19.029045377480585, CurrSamplesPerSec=19.888041858045842, MemAllocated=33.14GB, MaxMemAllocated=47.22GB
148
+ Saving checkpoint at step 120 ...
149
+ Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
150
+ Saving 16-bit model...
151
+ [2024-10-25 19:10:54,426] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step60 is about to be saved!
152
+ [2024-10-25 19:10:54,428] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin, tag: global_step60
153
+ [2024-10-25 19:10:54,428] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin...
154
+ [2024-10-25 19:11:13,388] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin.
155
+ [2024-10-25 19:11:13,390] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step60 is ready now!
156
+ Model saved!
157
+ Saving 16-bit model...
158
+ [2024-10-25 19:11:22,464] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step60 is about to be saved!
159
+ [2024-10-25 19:11:22,465] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin, tag: global_step60
160
+ [2024-10-25 19:11:22,466] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin...
161
+ [2024-10-25 19:11:39,535] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin.
162
+ [2024-10-25 19:11:39,535] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step60 is ready now!
163
+ Model saved!
164
+ Model saved!
165
+ Checkpoint saved.
166
+ [2024-10-25 19:11:59,374] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
167
+ [2024-10-25 19:12:02,660] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
168
+ [2024-10-25 19:12:22,755] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
169
+ [2024-10-25 19:12:26,038] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
170
+ [2024-10-25 19:12:46,474] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
171
+ [2024-10-25 19:12:49,933] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
172
+ [2024-10-25 19:13:10,102] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
173
+ [2024-10-25 19:13:13,472] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
174
+ [2024-10-25 19:14:20,491] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
175
+ [2024-10-25 19:14:23,905] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
176
+ [2024-10-25 19:15:07,328] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
177
+ [2024-10-25 19:15:10,628] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
178
+ [2024-10-25 19:15:30,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[3.7169028483301333e-06, 3.7169028483301333e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
179
+ [2024-10-25 19:15:30,619] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=18.421273695026542, CurrSamplesPerSec=19.229435871608736, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
180
+ [2024-10-25 19:15:33,910] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
181
+ [2024-10-25 19:15:33,911] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=19.12443035480107, CurrSamplesPerSec=19.765868285523876, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
182
+ [2024-10-25 19:15:53,825] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
183
+ [2024-10-25 19:15:57,143] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
184
+ [2024-10-25 19:16:17,587] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
185
+ [2024-10-25 19:16:20,871] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
186
+ [2024-10-25 19:16:40,921] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
187
+ [2024-10-25 19:16:44,213] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
188
+ [2024-10-25 19:17:04,343] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
189
+ [2024-10-25 19:17:07,629] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
190
+ Saving checkpoint at step 150 ...
191
+ Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
192
+ Saving 16-bit model...
193
+ [2024-10-25 19:17:42,434] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step75 is about to be saved!
194
+ [2024-10-25 19:17:42,436] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin, tag: global_step75
195
+ [2024-10-25 19:17:42,436] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin...
196
+ [2024-10-25 19:18:02,484] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin.
197
+ [2024-10-25 19:18:02,486] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step75 is ready now!
198
+ Model saved!
199
+ Saving 16-bit model...
200
+ [2024-10-25 19:18:11,754] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step75 is about to be saved!
201
+ [2024-10-25 19:18:11,755] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin, tag: global_step75
202
+ [2024-10-25 19:18:11,755] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin...
203
+ [2024-10-25 19:18:28,942] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin.
204
+ [2024-10-25 19:18:28,944] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step75 is ready now!
205
+ Model saved!
206
+ Model saved!
207
+ Checkpoint saved.
208
+ [2024-10-25 19:18:48,635] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
209
+ [2024-10-25 19:18:51,897] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
210
+ [2024-10-25 19:20:14,068] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
211
+ [2024-10-25 19:20:14,069] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[2.4606737737909696e-06, 2.4606737737909696e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
212
+ [2024-10-25 19:20:14,070] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=18.583796025985635, CurrSamplesPerSec=18.851139953128662, MemAllocated=33.29GB, MaxMemAllocated=47.22GB
213
+ [2024-10-25 19:20:17,500] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
214
+ [2024-10-25 19:20:17,500] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
215
+ [2024-10-25 19:20:17,501] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=19.286477338971316, CurrSamplesPerSec=19.326753490145233, MemAllocated=33.29GB, MaxMemAllocated=47.22GB
216
+ [2024-10-25 19:21:24,498] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
217
+ [2024-10-25 19:21:27,888] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
218
+ [2024-10-25 19:21:47,842] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
219
+ [2024-10-25 19:21:51,137] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
220
+ [2024-10-25 19:22:11,123] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
221
+ [2024-10-25 19:22:14,405] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
222
+ [2024-10-25 19:22:57,887] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
223
+ [2024-10-25 19:23:01,180] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
224
+ [2024-10-25 19:23:44,745] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
225
+ [2024-10-25 19:23:48,058] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
226
+ [2024-10-25 19:24:08,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[1.3927749088052218e-06, 1.3927749088052218e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
227
+ [2024-10-25 19:24:08,921] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=18.631064887248623, CurrSamplesPerSec=18.182446863655244, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
228
+ [2024-10-25 19:24:12,425] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
229
+ [2024-10-25 19:24:12,426] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=19.325484075258544, CurrSamplesPerSec=18.68740115377941, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
230
+ Saving checkpoint at step 180 ...
231
+ Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
232
+ Saving 16-bit model...
233
+ [2024-10-25 19:24:23,716] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step90 is about to be saved!
234
+ [2024-10-25 19:24:23,717] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin, tag: global_step90
235
+ [2024-10-25 19:24:23,717] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin...
236
+ [2024-10-25 19:24:41,475] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin.
237
+ [2024-10-25 19:24:41,476] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step90 is ready now!
238
+ Model saved!
239
+ Saving 16-bit model...
240
+ [2024-10-25 19:24:50,478] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step90 is about to be saved!
241
+ [2024-10-25 19:24:50,479] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin, tag: global_step90
242
+ [2024-10-25 19:24:50,480] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin...
243
+ [2024-10-25 19:25:12,036] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin.
244
+ [2024-10-25 19:25:12,039] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step90 is ready now!
245
+ Model saved!
246
+ Model saved!
247
+ Checkpoint saved.
248
+ [2024-10-25 19:27:06,252] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
249
+ [2024-10-25 19:27:09,573] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
250
+ [2024-10-25 19:28:40,360] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
251
+ [2024-10-25 19:28:43,635] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
252
+ [2024-10-25 19:29:03,608] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
253
+ [2024-10-25 19:29:03,610] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[5.924074268766422e-07, 5.924074268766422e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
254
+ [2024-10-25 19:29:03,610] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=18.66696592651746, CurrSamplesPerSec=19.246283042032882, MemAllocated=33.12GB, MaxMemAllocated=47.22GB
255
+ [2024-10-25 19:29:06,852] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
256
+ [2024-10-25 19:29:06,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
257
+ [2024-10-25 19:29:06,853] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=19.358645620408343, CurrSamplesPerSec=20.0045335178505, MemAllocated=33.12GB, MaxMemAllocated=47.22GB
258
+ [2024-10-25 19:30:37,368] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
259
+ [2024-10-25 19:30:40,629] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
260
+ [2024-10-25 19:31:00,782] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
261
+ [2024-10-25 19:31:04,118] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
262
+ Saving checkpoint at step 210 ...
263
+ Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
264
+ Saving 16-bit model...
265
+ [2024-10-25 19:31:14,905] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step105 is about to be saved!
266
+ [2024-10-25 19:31:14,906] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin, tag: global_step105
267
+ [2024-10-25 19:31:14,907] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin...
268
+ [2024-10-25 19:31:30,468] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin.
269
+ [2024-10-25 19:31:30,471] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step105 is ready now!
270
+ Model saved!
271
+ Saving 16-bit model...
272
+ [2024-10-25 19:31:37,840] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step105 is about to be saved!
273
+ [2024-10-25 19:31:37,842] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin, tag: global_step105
274
+ [2024-10-25 19:31:37,842] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin...
275
+ [2024-10-25 19:31:59,787] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin.
276
+ [2024-10-25 19:31:59,790] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step105 is ready now!
277
+ Model saved!
278
+ Model saved!
279
+ Checkpoint saved.
280
+ [2024-10-25 19:33:53,254] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[1.1893092270227724e-07, 1.1893092270227724e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
281
+ [2024-10-25 19:33:53,255] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=18.708463115192874, CurrSamplesPerSec=19.308977655910134, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
282
+ [2024-10-25 19:33:56,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
283
+ [2024-10-25 19:33:56,504] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=19.393453277752876, CurrSamplesPerSec=19.924871782255472, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
284
+ [2024-10-25 19:34:40,368] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
285
+ [2024-10-25 19:34:43,678] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
286
+ [2024-10-25 19:35:03,709] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
287
+ [2024-10-25 19:35:07,004] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
288
+ [2024-10-25 19:35:50,234] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
289
+ [2024-10-25 19:35:53,480] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
290
+ [2024-10-25 19:36:13,581] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
291
+ [2024-10-25 19:36:16,864] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
292
+ Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
293
+ Saving 16-bit model...
294
+ [2024-10-25 19:37:19,241] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step118 is about to be saved!
295
+ [2024-10-25 19:37:19,242] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin, tag: global_step118
296
+ [2024-10-25 19:37:19,243] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin...
297
+ [2024-10-25 19:37:40,063] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin.
298
+ [2024-10-25 19:37:40,065] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step118 is ready now!
299
+ Model saved!
300
+ Saving 16-bit model...
301
+ [2024-10-25 19:37:49,384] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step118 is about to be saved!
302
+ [2024-10-25 19:37:49,385] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin, tag: global_step118
303
+ [2024-10-25 19:37:49,386] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin...
304
+ [2024-10-25 19:38:13,508] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin.
305
+ [2024-10-25 19:38:13,511] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step118 is ready now!
306
+ Model saved!
307
+ Model saved!
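The save sequence above is DeepSpeed's ZeRO-3 16-bit export: the sharded weights are gathered and written as a single `pytorch_model_210.bin` for the mid-run checkpoint and as `pytorch_model.bin` for the final save at global step 118. A hedged sketch of that call on an existing engine is below; `engine` and the helper itself are assumptions, while the directory and filename pattern mirror this run's output.

```python
# Sketch: the 16-bit export performed in the log above, expressed as a helper.
# `engine` is an initialized DeepSpeedEngine (placeholder). save_16bit_model
# gathers the ZeRO-3 shards and writes a single consolidated state dict.
OUTPUT_DIR = "../outputs/ppo_ti2ti_baseline_1025_with_eval"

def save_16bit(engine, step: int | None = None) -> None:
    filename = "pytorch_model.bin" if step is None else f"pytorch_model_{step}.bin"
    engine.save_16bit_model(OUTPUT_DIR, save_filename=filename)

# Mirroring the log: a mid-run save at step 210, then the final unsuffixed file.
# save_16bit(engine, step=210)
# save_16bit(engine)
```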
wandb/run-20241025_181518-qbvp2oju/files/requirements.txt ADDED
@@ -0,0 +1,233 @@
1
+ align-anything==0.0.1.dev0
2
+ torch==2.4.0
3
+ pycparser==2.22
4
+ torchvision==0.19.0
5
+ multiprocess==0.70.16
6
+ braceexpand==0.1.7
7
+ lm-format-enforcer==0.10.6
8
+ Jinja2==3.1.4
9
+ scikit-learn==1.5.2
10
+ interegular==0.3.3
11
+ starlette==0.38.6
12
+ huggingface-hub==0.25.2
13
+ pyairports==2.1.1
14
+ protobuf==3.20.3
15
+ term-image==0.7.2
16
+ python-dateutil==2.9.0.post0
17
+ identify==2.6.1
18
+ tokenizers==0.19.1
19
+ tensorboard-data-server==0.7.2
20
+ numba==0.60.0
21
+ ninja==1.11.1.1
22
+ nvidia-cuda-cupti-cu12==12.1.105
23
+ diskcache==5.6.3
24
+ pycountry==24.6.1
25
+ py-cpuinfo==9.0.0
26
+ scipy==1.14.1
27
+ soxr==0.5.0.post1
28
+ prometheus-fastapi-instrumentator==7.0.0
29
+ align-anything==0.0.1.dev0
30
+ virtualenv==20.26.6
31
+ hjson==3.1.0
32
+ nvidia-cudnn-cu12==9.1.0.70
33
+ termcolor==2.5.0
34
+ grpcio==1.66.2
35
+ wheel==0.44.0
36
+ torchlibrosa==0.1.0
37
+ numpy==1.26.4
38
+ msgpack==1.1.0
39
+ rpds-py==0.20.0
40
+ annotated-types==0.7.0
41
+ pre_commit==4.0.1
42
+ aiohttp==3.10.10
43
+ audioread==3.0.1
44
+ lazy_loader==0.4
45
+ nvidia-cuda-runtime-cu12==12.1.105
46
+ filelock==3.16.1
47
+ timm==0.6.13
48
+ anyio==4.6.0
49
+ pydantic_core==2.23.4
50
+ idna==3.10
51
+ fastapi==0.115.0
52
+ wandb==0.18.3
53
+ packaging==24.1
54
+ yt-dlp==2024.8.6
55
+ matplotlib==3.9.2
56
+ websockets==12.0
57
+ triton==3.0.0
58
+ zipp==3.20.2
59
+ requests==2.32.3
60
+ xxhash==3.5.0
61
+ image-reward==1.5
62
+ pytorch-fid==0.3.0
63
+ imageio-ffmpeg==0.5.1
64
+ args==0.1.0
65
+ llvmlite==0.43.0
66
+ peft==0.13.2
67
+ openai==1.51.2
68
+ httpx==0.27.2
69
+ nvidia-cublas-cu12==12.1.3.1
70
+ pytest-split==0.8.0
71
+ ruff==0.6.9
72
+ sniffio==1.3.1
73
+ yarl==1.15.0
74
+ pandas==2.2.3
75
+ fsspec==2024.6.1
76
+ gguf==0.10.0
77
+ diffusers==0.30.3
78
+ platformdirs==4.3.6
79
+ nvidia-cuda-nvrtc-cu12==12.1.105
80
+ imageio==2.35.1
81
+ Brotli==1.1.0
82
+ bitsandbytes==0.44.1
83
+ hpsv2==1.2.0
84
+ lark==1.2.2
85
+ gradio==5.0.2
86
+ pydantic==2.9.2
87
+ pytz==2024.2
88
+ jsonschema-specifications==2024.10.1
89
+ deepspeed==0.15.2
90
+ cloudpickle==3.1.0
91
+ distro==1.9.0
92
+ aiohappyeyeballs==2.4.3
93
+ Markdown==3.7
94
+ docker-pycreds==0.4.0
95
+ semantic-version==2.10.0
96
+ resampy==0.4.3
97
+ urllib3==2.2.3
98
+ nodeenv==1.9.1
99
+ click==8.1.7
100
+ accelerate==1.0.1
101
+ dill==0.3.8
102
+ setproctitle==1.3.3
103
+ httpcore==1.0.6
104
+ pooch==1.8.2
105
+ importlib_metadata==8.5.0
106
+ cfgv==3.4.0
107
+ einops==0.8.0
108
+ shellingham==1.5.4
109
+ pytest==7.2.0
110
+ python-dotenv==1.0.1
111
+ pydub==0.25.1
112
+ kiwisolver==1.4.7
113
+ aiofiles==23.2.1
114
+ vllm==0.6.2
115
+ Werkzeug==3.0.4
116
+ tensorboard==2.18.0
117
+ joblib==1.4.2
118
+ pycryptodomex==3.21.0
119
+ moviepy==1.0.3
120
+ typing_extensions==4.12.2
121
+ mdurl==0.1.2
122
+ mistral_common==1.4.4
123
+ rich==13.9.2
124
+ aiosignal==1.3.1
125
+ mmsg==0.1.dev20+g585c63a.d20241012
126
+ pillow==10.4.0
127
+ prometheus_client==0.21.0
128
+ nvidia-cusolver-cu12==11.4.5.107
129
+ typer==0.12.5
130
+ pyzmq==26.2.0
131
+ h11==0.14.0
132
+ gitdb==4.0.11
133
+ transformers==4.44.0.dev0
134
+ nvidia-nccl-cu12==2.20.5
135
+ jsonschema==4.23.0
136
+ soundfile==0.12.1
137
+ contourpy==1.3.0
138
+ mutagen==1.47.0
139
+ regex==2024.9.11
140
+ orjson==3.10.7
141
+ fairscale==0.4.13
142
+ partial-json-parser==0.2.1.1.post4
143
+ outlines==0.1.1.dev4+ga2fd35c
144
+ nvidia-curand-cu12==10.3.2.106
145
+ pluggy==1.5.0
146
+ GitPython==3.1.43
147
+ tzdata==2024.2
148
+ uvicorn==0.31.1
149
+ sentencepiece==0.2.0
150
+ decorator==4.4.2
151
+ nvidia-nvjitlink-cu12==12.6.77
152
+ distlib==0.3.9
153
+ uvloop==0.20.0
154
+ networkx==3.4.1
155
+ wcwidth==0.2.13
156
+ opencv-python==4.6.0.66
157
+ six==1.16.0
158
+ httptools==0.6.1
159
+ safetensors==0.4.5
160
+ nvidia-nvtx-cu12==12.1.105
161
+ markdown-it-py==3.0.0
162
+ certifi==2024.8.30
163
+ sentry-sdk==2.16.0
164
+ outlines_core==0.1.0
165
+ threadpoolctl==3.5.0
166
+ nvidia-cufft-cu12==11.0.2.54
167
+ datasets==3.0.1
168
+ cycler==0.12.1
169
+ psutil==6.0.0
170
+ nvidia-cusparse-cu12==12.1.0.106
171
+ shortuuid==1.0.13
172
+ ffmpy==0.4.0
173
+ xformers==0.0.27.post2
174
+ MarkupSafe==2.1.5
175
+ tqdm==4.66.5
176
+ gradio_client==1.4.0
177
+ attrs==24.2.0
178
+ optree==0.13.0
179
+ PyYAML==6.0.2
180
+ clint==0.5.1
181
+ torchaudio==2.4.0
182
+ frechet-audio-distance==0.1.2
183
+ frozenlist==1.4.1
184
+ clip==0.2.0
185
+ multidict==6.1.0
186
+ propcache==0.2.0
187
+ librosa==0.10.2.post1
188
+ webdataset==0.2.100
189
+ ray==2.37.0
190
+ pyparsing==3.1.4
191
+ pyarrow==17.0.0
192
+ tiktoken==0.7.0
193
+ watchfiles==0.24.0
194
+ proglog==0.1.10
195
+ cachetools==5.5.0
196
+ fonttools==4.54.1
197
+ charset-normalizer==3.4.0
198
+ ftfy==6.3.0
199
+ referencing==0.35.1
200
+ mpmath==1.3.0
201
+ msgspec==0.18.6
202
+ nvidia-ml-py==12.535.161
203
+ smmap==5.0.1
204
+ absl-py==2.1.0
205
+ python-multipart==0.0.12
206
+ Pygments==2.18.0
207
+ iniconfig==2.0.0
208
+ sympy==1.13.3
209
+ pip==24.2
210
+ airportsdata==20241001
211
+ tomlkit==0.12.0
212
+ nest-asyncio==1.6.0
213
+ setuptools==75.1.0
214
+ jiter==0.6.1
215
+ cffi==1.17.1
216
+ nvitop==1.3.2
217
+ backports.tarfile==1.2.0
218
+ zipp==3.19.2
219
+ inflect==7.3.1
220
+ autocommand==2.2.2
221
+ importlib_resources==6.4.0
222
+ packaging==24.1
223
+ jaraco.context==5.3.0
224
+ typeguard==4.3.0
225
+ more-itertools==10.3.0
226
+ jaraco.text==3.12.1
227
+ platformdirs==4.2.2
228
+ wheel==0.43.0
229
+ typing_extensions==4.12.2
230
+ importlib_metadata==8.0.0
231
+ tomli==2.0.1
232
+ jaraco.collections==5.1.0
233
+ jaraco.functools==4.0.1
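The requirements snapshot above pins the full environment wandb captured for this run. A small way to sanity-check a local environment against a few of the critical pins is sketched below; the package versions are taken from the list above, and which packages count as "critical" is an assumption.

```python
# Sketch: verify a local environment against a few pins from the wandb
# requirements.txt snapshot above. Only a handful of packages are checked.
from importlib.metadata import version, PackageNotFoundError

CRITICAL_PINS = {
    "torch": "2.4.0",
    "deepspeed": "0.15.2",
    "transformers": "4.44.0.dev0",
    "wandb": "0.18.3",
    "vllm": "0.6.2",
}

for name, expected in CRITICAL_PINS.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: not installed (expected {expected})")
        continue
    status = "OK" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{name}=={installed} {status}")
```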
wandb/run-20241025_181518-qbvp2oju/files/wandb-metadata.json ADDED
@@ -0,0 +1,106 @@
1
+ {
2
+ "os": "Linux-5.4.0-198-generic-x86_64-with-glibc2.31",
3
+ "python": "3.11.10",
4
+ "startedAt": "2024-10-25T18:15:18.983727Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--actor_model_name_or_path",
8
+ "/data/align-anything/hantao/models/0916_ti_to_ti_sft",
9
+ "--reward_model_name_or_path",
10
+ "/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400",
11
+ "--reward_critic_model_name_or_path",
12
+ "/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400",
13
+ "--train_datasets",
14
+ "/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs",
15
+ "--train_template",
16
+ "spavl_ti2ti",
17
+ "--train_data_files",
18
+ "ti2ti_llf_prompt_only_tokenize.pt",
19
+ "--ptx_template",
20
+ "spavl_ti2ti",
21
+ "--ptx_data_files",
22
+ "ti2ti_ptx_27k.pt",
23
+ "--output_dir",
24
+ "../outputs/ppo_ti2ti_baseline_1025_with_eval",
25
+ "--save_interval",
26
+ "30"
27
+ ],
28
+ "program": "-m align_anything.trainers.text_image_to_text_image.ppo",
29
+ "git": {
30
+ "remote": "https://github.com/PKU-Alignment/align-anything.git",
31
+ "commit": "6fde660afc9985323f147930eedf188a5699adc7"
32
+ },
33
+ "email": "[email protected]",
34
+ "root": "../outputs/ppo_ti2ti_baseline_1025_with_eval",
35
+ "host": "lyg0195",
36
+ "username": "align-anything",
37
+ "executable": "/home/align-anything/miniconda3/envs/hantao_cham/bin/python",
38
+ "cpu_count": 64,
39
+ "cpu_count_logical": 128,
40
+ "gpu": "[NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB]",
41
+ "gpu_count": 8,
42
+ "disk": {
43
+ "/": {
44
+ "total": "938421047296",
45
+ "used": "363102883840"
46
+ }
47
+ },
48
+ "memory": {
49
+ "total": "540647575552"
50
+ },
51
+ "cpu": {
52
+ "count": 64,
53
+ "countLogical": 128
54
+ },
55
+ "gpu_nvidia": [
56
+ {
57
+ "name": "NVIDIA A100-SXM4-80GB",
58
+ "memoryTotal": "85899345920",
59
+ "cudaCores": 6912,
60
+ "architecture": "Ampere"
61
+ },
62
+ {
63
+ "name": "NVIDIA A100-SXM4-80GB",
64
+ "memoryTotal": "85899345920",
65
+ "cudaCores": 6912,
66
+ "architecture": "Ampere"
67
+ },
68
+ {
69
+ "name": "NVIDIA A100-SXM4-80GB",
70
+ "memoryTotal": "85899345920",
71
+ "cudaCores": 6912,
72
+ "architecture": "Ampere"
73
+ },
74
+ {
75
+ "name": "NVIDIA A100-SXM4-80GB",
76
+ "memoryTotal": "85899345920",
77
+ "cudaCores": 6912,
78
+ "architecture": "Ampere"
79
+ },
80
+ {
81
+ "name": "NVIDIA A100-SXM4-80GB",
82
+ "memoryTotal": "85899345920",
83
+ "cudaCores": 6912,
84
+ "architecture": "Ampere"
85
+ },
86
+ {
87
+ "name": "NVIDIA A100-SXM4-80GB",
88
+ "memoryTotal": "85899345920",
89
+ "cudaCores": 6912,
90
+ "architecture": "Ampere"
91
+ },
92
+ {
93
+ "name": "NVIDIA A100-SXM4-80GB",
94
+ "memoryTotal": "85899345920",
95
+ "cudaCores": 6912,
96
+ "architecture": "Ampere"
97
+ },
98
+ {
99
+ "name": "NVIDIA A100-SXM4-80GB",
100
+ "memoryTotal": "85899345920",
101
+ "cudaCores": 6912,
102
+ "architecture": "Ampere"
103
+ }
104
+ ],
105
+ "cudaVersion": "12.4"
106
+ }
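wandb-metadata.json records the exact launch arguments, git commit, and hardware for the run (8x NVIDIA A100-SXM4-80GB, CUDA 12.4). A short sketch that reads the file and reconstructs the invocation from the `program` and `args` fields follows; the local path to the JSON is an assumption.

```python
# Sketch: reconstruct the training invocation and hardware summary from the
# wandb-metadata.json shown above. The file path is an assumption.
import json

with open("wandb/run-20241025_181518-qbvp2oju/files/wandb-metadata.json") as f:
    meta = json.load(f)

launch = f"python {meta['program']} " + " ".join(meta["args"])
print(launch)
print(f"git commit: {meta['git']['commit']}")
print(f"gpus: {meta['gpu_count']} x {meta['gpu_nvidia'][0]['name']}")
print(f"cuda: {meta['cudaVersion']}")
```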
wandb/run-20241025_181518-qbvp2oju/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"train/reward_critic_lr":5e-06,"train/reward_with_kl_penalty":-0.1957259476184845,"_wandb":{"runtime":4974},"_timestamp":1.729885027561649e+09,"train/reward_value":-0.89080810546875,"train/mean_generated_length":1,"train/actor_lr":0,"_step":236,"train/actor_loss":-0.6950821280479431,"train/max_generated_length":1,"train/kl_divergence":4.3175482749938965,"_runtime":4974.609715617,"train/reward_advantage":0.6950821280479431,"train/reward_return":-0.1957259476184845,"train/reward":-0.109375,"train/step":236,"train/reward_critic_loss":0.5039339065551758}
wandb/run-20241025_181518-qbvp2oju/logs/debug-internal.log ADDED
@@ -0,0 +1,20 @@
1
+ {"time":"2024-10-25T18:15:18.987617848Z","level":"INFO","msg":"using version","core version":"0.18.3"}
2
+ {"time":"2024-10-25T18:15:18.987649473Z","level":"INFO","msg":"created symlink","path":"../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug-core.log"}
3
+ {"time":"2024-10-25T18:15:18.991400712Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
4
+ {"time":"2024-10-25T18:15:19.015335603Z","level":"INFO","msg":"created new stream","id":"qbvp2oju"}
5
+ {"time":"2024-10-25T18:15:19.015397376Z","level":"INFO","msg":"stream: started","id":"qbvp2oju"}
6
+ {"time":"2024-10-25T18:15:19.015408377Z","level":"INFO","msg":"handler: started","stream_id":{"value":"qbvp2oju"}}
7
+ {"time":"2024-10-25T18:15:19.015432033Z","level":"INFO","msg":"sender: started","stream_id":{"value":"qbvp2oju"}}
8
+ {"time":"2024-10-25T18:15:19.015437112Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"qbvp2oju"}}
9
+ {"time":"2024-10-25T18:15:20.634593869Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
10
+ {"time":"2024-10-25T18:15:20.637814914Z","level":"INFO","msg":"Starting system monitor"}
11
+ {"time":"2024-10-25T19:38:13.593466266Z","level":"INFO","msg":"Stopping system monitor"}
12
+ {"time":"2024-10-25T19:38:13.627014655Z","level":"INFO","msg":"Stopped system monitor"}
13
+ {"time":"2024-10-25T19:38:14.559855674Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
14
+ {"time":"2024-10-25T19:38:14.559906183Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
15
+ {"time":"2024-10-25T19:38:15.545457735Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
16
+ {"time":"2024-10-25T19:38:17.12240115Z","level":"INFO","msg":"stream: closing","id":"qbvp2oju"}
17
+ {"time":"2024-10-25T19:38:17.12243525Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"qbvp2oju"}}
18
+ {"time":"2024-10-25T19:38:17.122460489Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"qbvp2oju"}}
19
+ {"time":"2024-10-25T19:38:17.122575437Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"qbvp2oju"}}
20
+ {"time":"2024-10-25T19:38:17.124870943Z","level":"INFO","msg":"stream: closed","id":"qbvp2oju"}
wandb/run-20241025_181518-qbvp2oju/logs/debug.log ADDED
@@ -0,0 +1,33 @@
1
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Configure stats pid to 937440
3
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
4
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
5
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
7
+ 2024-10-25 18:15:18,977 WARNING MainThread:937440 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.ppo
8
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.ppo'}
9
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_init.py:_log_setup():532] Logging user logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug.log
11
+ 2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:_log_setup():533] Logging internal logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug-internal.log
12
+ 2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():617] calling init triggers
13
+ 2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
14
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_prompt_batch_size': 8, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 2, 'actor_gradient_checkpointing': True, 'critic_gradient_checkpointing': True, 'actor_lr': 1e-05, 'actor_lr_scheduler_type': 'cosine', 'actor_lr_warmup_ratio': 0.03, 'actor_weight_decay': 0.01, 'critic_lr': 5e-06, 'critic_lr_scheduler_type': 'constant', 'critic_lr_warmup_ratio': 0.03, 'critic_weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'kl_coeff': 0.02, 'clip_range_ratio': 0.2, 'clip_range_score': 50.0, 'clip_range_value': 5.0, 'ptx_coeff': 16.0, 'gamma': 1.0, 'gae_lambda': 0.95, 'normalize_reward': False, 'update_iters': 1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs', 'train_template': 'spavl_ti2ti', 'train_size': 5000, 'train_split': None, 'train_subset': None, 'train_data_files': 'ti2ti_llf_prompt_only_tokenize.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': [], 'ptx_datasets': None, 'ptx_template': 'spavl_ti2ti', 'ptx_size': None, 'ptx_subset': None, 'ptx_split': None, 'ptx_data_files': 'ti2ti_ptx_27k.pt', 'ptx_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'ppo', 'output_dir': '../outputs/ppo_ti2ti_baseline_1025_with_eval', 'cache_dir': None, 'save_interval': 30.0}, 'model_cfgs': {'actor_model_name_or_path': '/data/align-anything/hantao/models/0916_ti_to_ti_sft', 'reward_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'reward_critic_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'trust_remote_code': True, 'model_max_length': 2048, 'temperature': 1.0, 'top_p': 1.0, 'repetition_penalty': 1.0}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': None}
15
+ 2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():667] starting backend
16
+ 2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():671] sending inform_init request
17
+ 2024-10-25 18:15:18,982 INFO MainThread:937440 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
18
+ 2024-10-25 18:15:18,983 INFO MainThread:937440 [wandb_init.py:init():684] backend started and connected
19
+ 2024-10-25 18:15:18,986 INFO MainThread:937440 [wandb_init.py:init():779] updated telemetry
20
+ 2024-10-25 18:15:18,996 INFO MainThread:937440 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
21
+ 2024-10-25 18:15:20,628 INFO MainThread:937440 [wandb_init.py:init():863] starting run threads in backend
22
+ 2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_console_start():2465] atexit reg
23
+ 2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2313] redirect: wrap_raw
24
+ 2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2378] Wrapping output streams.
25
+ 2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2403] Redirects installed.
26
+ 2024-10-25 18:15:20,776 INFO MainThread:937440 [wandb_init.py:init():907] run started, returning control to user process
27
+ 2024-10-25 19:38:13,587 INFO MainThread:937440 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/qbvp2oju
28
+ 2024-10-25 19:38:13,590 INFO MainThread:937440 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
29
+ 2024-10-25 19:38:13,591 INFO MainThread:937440 [wandb_run.py:_restore():2410] restore
30
+ 2024-10-25 19:38:13,592 INFO MainThread:937440 [wandb_run.py:_restore():2416] restore done
31
+ 2024-10-25 19:38:17,104 INFO MainThread:937440 [wandb_run.py:_footer_history_summary_info():4049] rendering history
32
+ 2024-10-25 19:38:17,107 INFO MainThread:937440 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
33
+ 2024-10-25 19:38:17,119 INFO MainThread:937440 [wandb_run.py:_footer_sync_info():4008] logging synced files
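debug.log shows the run finishing cleanly (exit code 0) under the path `htlou/align-anything/qbvp2oju`. If that run is accessible, its state and summary can be pulled back with the public wandb API; the sketch below assumes a logged-in wandb client with access to that entity and project.

```python
# Sketch: fetch this run's final state and summary via the wandb API, using
# the run path that appears in debug.log. Requires `wandb login` and access.
import wandb

api = wandb.Api()
run = api.run("htlou/align-anything/qbvp2oju")

print(run.state)  # e.g. "finished"
for key, value in run.summary.items():
    if key.startswith("train/"):
        print(key, value)
```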
wandb/run-20241025_181518-qbvp2oju/run-qbvp2oju.wandb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76ef46f86b21cc7b1a13903cb2d0aa221a447ddcdb4aff5e95e115b373ce98a4
3
+ size 4642995
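The `.wandb` run file itself is stored through Git LFS; the pointer above records only its sha256 and byte size. A small sketch to verify a downloaded copy against that pointer (the local path is an assumption):

```python
# Sketch: check a downloaded run-qbvp2oju.wandb file against the Git LFS
# pointer above (oid sha256 and size). The local path is an assumption.
import hashlib
import os

EXPECTED_SHA256 = "76ef46f86b21cc7b1a13903cb2d0aa221a447ddcdb4aff5e95e115b373ce98a4"
EXPECTED_SIZE = 4642995
path = "wandb/run-20241025_181518-qbvp2oju/run-qbvp2oju.wandb"

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

print("size ok:", os.path.getsize(path) == EXPECTED_SIZE)
print("sha256 ok:", h.hexdigest() == EXPECTED_SHA256)
```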