Upload folder using huggingface_hub
- .gitattributes +1 -0
- arguments.yaml +94 -0
- config.json +0 -0
- environ.txt +56 -0
- preprocessor_config.json +28 -0
- processor_config.json +5 -0
- pytorch_model.bin +3 -0
- script.sh +50 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +0 -0
- wandb/debug-internal.log +20 -0
- wandb/debug.log +33 -0
- wandb/run-20241025_180620-eoegk43l/files/output.log +0 -0
- wandb/run-20241025_180620-eoegk43l/files/requirements.txt +233 -0
- wandb/run-20241025_180620-eoegk43l/files/wandb-metadata.json +106 -0
- wandb/run-20241025_180620-eoegk43l/logs/debug-internal.log +10 -0
- wandb/run-20241025_180620-eoegk43l/logs/debug.log +26 -0
- wandb/run-20241025_180620-eoegk43l/run-eoegk43l.wandb +0 -0
- wandb/run-20241025_181518-qbvp2oju/files/config.yaml +143 -0
- wandb/run-20241025_181518-qbvp2oju/files/output.log +307 -0
- wandb/run-20241025_181518-qbvp2oju/files/requirements.txt +233 -0
- wandb/run-20241025_181518-qbvp2oju/files/wandb-metadata.json +106 -0
- wandb/run-20241025_181518-qbvp2oju/files/wandb-summary.json +1 -0
- wandb/run-20241025_181518-qbvp2oju/logs/debug-internal.log +20 -0
- wandb/run-20241025_181518-qbvp2oju/logs/debug.log +33 -0
- wandb/run-20241025_181518-qbvp2oju/run-qbvp2oju.wandb +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241025_181518-qbvp2oju/run-qbvp2oju.wandb filter=lfs diff=lfs merge=lfs -text
arguments.yaml
ADDED
@@ -0,0 +1,94 @@
bnb_cfgs:
  bnb_4bit_compute_dtype: float16
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: true
  load_in_4bit: true
  load_in_8bit: false
  use_bnb: false
data_cfgs:
  eval_data_files: null
  eval_datasets: null
  eval_optional_args: []
  eval_size: null
  eval_split: null
  eval_subset: null
  eval_template: null
  ptx_data_files: ti2ti_ptx_27k.pt
  ptx_datasets: null
  ptx_optional_args: []
  ptx_size: null
  ptx_split: null
  ptx_subset: null
  ptx_template: spavl_ti2ti
  train_data_files: ti2ti_llf_prompt_only_tokenize.pt
  train_datasets: /data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs
  train_optional_args: []
  train_size: 5000
  train_split: null
  train_subset: null
  train_template: spavl_ti2ti
logger_cfgs:
  cache_dir: null
  log_project: align-anything
  log_run_name: ppo
  log_type: wandb
  output_dir: ../outputs/ppo_ti2ti_baseline_1025_with_eval
  save_interval: 30.0
lora_cfgs:
  inference_mode: false
  lora_alpha: 16
  lora_dropout: 0.1
  r: 16
  save_full_model: true
  target_modules:
  - q_proj
  - v_proj
  task_type: TaskType.CAUSAL_LM
  use_lora: false
model_cfgs:
  actor_model_name_or_path: /data/align-anything/hantao/models/0916_ti_to_ti_sft
  model_max_length: 2048
  repetition_penalty: 1.0
  reward_critic_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400
  reward_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400
  temperature: 1.0
  top_p: 1.0
  trust_remote_code: true
special_tokens: null
train_cfgs:
  actor_gradient_checkpointing: true
  actor_lr: 1.0e-05
  actor_lr_scheduler_type: cosine
  actor_lr_warmup_ratio: 0.03
  actor_weight_decay: 0.01
  adam_betas:
  - 0.9
  - 0.95
  bf16: true
  clip_range_ratio: 0.2
  clip_range_score: 50.0
  clip_range_value: 5.0
  critic_gradient_checkpointing: true
  critic_lr: 5.0e-06
  critic_lr_scheduler_type: constant
  critic_lr_warmup_ratio: 0.03
  critic_weight_decay: 0.0
  ds_cfgs: ds_z3_config.json
  epochs: 3
  eval_interval: 10
  eval_strategy: epoch
  fp16: false
  freeze_language_model: true
  freeze_mm_proj: true
  freeze_vision_tower: false
  gae_lambda: 0.95
  gamma: 1.0
  gradient_accumulation_steps: 2
  kl_coeff: 0.02
  normalize_reward: false
  per_device_eval_batch_size: 8
  per_device_prompt_batch_size: 8
  per_device_train_batch_size: 8
  ptx_coeff: 16.0
  seed: 42
  update_iters: 1
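arguments.yaml is the resolved trainer configuration for this PPO run (actor/critic/reward model paths, data files, DeepSpeed ZeRO-3, batch sizes). Below is a minimal sketch, not part of the repository, of reading it with PyYAML and deriving the effective global batch size; the world size of 8 is taken from WORLD_SIZE=8 in environ.txt and is an assumption about how the run was launched.

```python
# Hypothetical helper: inspect arguments.yaml and compute the implied global batch size.
import yaml  # PyYAML, pinned in requirements.txt

with open("arguments.yaml") as f:
    cfg = yaml.safe_load(f)

train = cfg["train_cfgs"]
world_size = 8  # assumption: one rank per GPU, as recorded in environ.txt
global_batch = (
    train["per_device_train_batch_size"]    # 8
    * train["gradient_accumulation_steps"]  # 2
    * world_size                            # 8
)
print(cfg["model_cfgs"]["actor_model_name_or_path"])
print("global train batch size:", global_batch)  # 8 * 2 * 8 = 128
```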
config.json
ADDED
The diff for this file is too large to render.
environ.txt
ADDED
@@ -0,0 +1,56 @@
CONDA_DEFAULT_ENV=hantao_cham
CONDA_EXE=/home/align-anything/miniconda3/bin/conda
CONDA_PREFIX=/home/align-anything/miniconda3/envs/hantao_cham
CONDA_PREFIX_1=/home/align-anything/miniconda3
CONDA_PROMPT_MODIFIER=(hantao_cham)
CONDA_PYTHON_EXE=/home/align-anything/miniconda3/bin/python
CONDA_SHLVL=2
CRASHDIR=/etc/ShellCrash
CROSS_RANK=0
CROSS_SIZE=1
CUDA_MODULE_LOADING=LAZY
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/2000/bus
HOME=/home/align-anything
LANG=en_US.UTF-8
LD_LIBRARY_PATH=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/../../lib64:
LESSCLOSE=/usr/bin/lesspipe %s %s
LESSOPEN=| /usr/bin/lesspipe %s
LOCAL_RANK=0
LOCAL_SIZE=8
LOGLEVEL=WARNING
LOGNAME=align-anything
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
MASTER_ADDR=127.0.0.1
MASTER_PORT=63303
MOTD_SHOWN=pam
OLDPWD=/data/align-anything/hantao/align-anything/projects/text_image_to_text_image
PATH=/home/align-anything/miniconda3/envs/hantao_cham/bin:/home/align-anything/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
PWD=/data/align-anything/hantao/align-anything/scripts
PYGAME_HIDE_SUPPORT_PROMPT=1
PYTHONHASHSEED=42
PYTHONPATH=/data/align-anything/hantao/align-anything
QT_QPA_FONTDIR=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/qt/fonts
QT_QPA_PLATFORM_PLUGIN_PATH=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/qt/plugins
RANK=0
SHELL=/bin/bash
SHLVL=3
SSH_CLIENT=111.205.230.212 28724 30500
SSH_CONNECTION=111.205.230.212 62683 10.10.212.195 30500
SSH_TTY=/dev/pts/2
TERM=screen
TMUX=/tmp/tmux-2000/default,90929,6
TMUX_PANE=%6
USER=align-anything
WANDB_API_KEY=7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33
WANDB_MODE=online
WANDB_SERVICE=2-937440-tcp-localhost-44607
WORLD_SIZE=8
XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
XDG_RUNTIME_DIR=/run/user/2000
XDG_SESSION_CLASS=user
XDG_SESSION_ID=4
XDG_SESSION_TYPE=tty
_=/home/align-anything/miniconda3/envs/hantao_cham/bin/deepspeed
_CE_CONDA=
_CE_M=
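Besides the conda and wandb variables, environ.txt captures the distributed-launch state for rank 0 (RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR/MASTER_PORT, CUDA_VISIBLE_DEVICES). A hedged sketch, not taken from align-anything, of how a launcher-spawned worker typically consumes these variables:

```python
# Illustrative only: how a worker process usually reads the variables dumped above.
import os
import torch
import torch.distributed as dist

local_rank = int(os.environ["LOCAL_RANK"])  # 0 in this dump; deepspeed sets one per GPU
torch.cuda.set_device(local_rank)

# MASTER_ADDR/MASTER_PORT (127.0.0.1:63303 above) tell every rank where rank 0 listens;
# init_process_group's default env:// init method picks them up automatically.
dist.init_process_group(
    backend="nccl",
    rank=int(os.environ["RANK"]),
    world_size=int(os.environ["WORLD_SIZE"]),
)
```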
preprocessor_config.json
ADDED
@@ -0,0 +1,28 @@
{
  "crop_size": {
    "height": 512,
    "width": 512
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    1.0,
    1.0,
    1.0
  ],
  "image_processor_type": "ChameleonImageProcessor",
  "image_std": [
    1.0,
    1.0,
    1.0
  ],
  "processor_class": "ChameleonProcessor",
  "resample": 1,
  "rescale_factor": 0.0078,
  "size": {
    "shortest_edge": 512
  }
}
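With rescale_factor 0.0078 (roughly 1/128) and per-channel mean and std of 1.0, this preprocessor maps uint8 pixels into approximately [-1, 1): x -> x * 0.0078 - 1. A small sketch of that arithmetic only (illustrative; the real ChameleonImageProcessor additionally resizes to shortest_edge 512, center-crops to 512x512, and converts to RGB):

```python
# Rescale + normalize arithmetic implied by preprocessor_config.json (illustrative only).
import numpy as np

pixels = np.array([0.0, 128.0, 255.0])
rescaled = pixels * 0.0078            # do_rescale with rescale_factor = 0.0078
normalized = (rescaled - 1.0) / 1.0   # do_normalize with image_mean = image_std = 1.0
print(normalized)                     # [-1.     -0.0016  0.989 ]
```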
processor_config.json
ADDED
@@ -0,0 +1,5 @@
{
  "image_seq_length": 1024,
  "image_token": "<image>",
  "processor_class": "ChameleonProcessor"
}
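processor_config.json pairs the image processor above with the tokenizer under a single ChameleonProcessor, reserving a 1024-token sequence per image behind the <image> placeholder. A hedged loading sketch (assumes "." is a local clone of this repository and a transformers build with Chameleon support, as pinned in requirements.txt):

```python
# Assumption: "." is a local checkout of this repository.
import json
from transformers import AutoProcessor

with open("processor_config.json") as f:
    print(json.load(f))  # {'image_seq_length': 1024, 'image_token': '<image>', 'processor_class': 'ChameleonProcessor'}

processor = AutoProcessor.from_pretrained(".")  # should resolve to ChameleonProcessor
print(type(processor).__name__)
```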
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4c201d7f9317f729765675c0940a46cd4b1675dd9ba6d5b9b5da3cdafb564faa
size 14165009930
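pytorch_model.bin is tracked with Git LFS, so the diff shows only the pointer file: the actual ~14.2 GB weight file is addressed by the sha256 oid above. A hedged sketch of verifying a fully downloaded copy against that oid:

```python
# Assumption: pytorch_model.bin has been materialized locally (e.g. via `git lfs pull`).
import hashlib

EXPECTED_OID = "4c201d7f9317f729765675c0940a46cd4b1675dd9ba6d5b9b5da3cdafb564faa"

sha = hashlib.sha256()
with open("pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        sha.update(chunk)
print(sha.hexdigest() == EXPECTED_OID)
```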
script.sh
ADDED
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
#
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Initialize variables
# For wandb online logging
export WANDB_API_KEY="7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33"
# Source the setup script
# source ./setup.sh

export WANDB_MODE=online

ACTOR_MODEL_NAME_OR_PATH="/data/align-anything/hantao/models/0916_ti_to_ti_sft"
CRITIC_MODEL_NAME_OR_PATH="/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400"
REWARD_MODEL_NAME_OR_PATH="/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400"
TRAIN_DATASETS="/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs"
PTX_DATASETS="/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs"
OUTPUT_DIR="../outputs/ppo_ti2ti_baseline_1025_with_eval"

# Source the setup script
source ./setup.sh

# Execute deepspeed command
deepspeed \
  --master_port ${MASTER_PORT} \
  --module align_anything.trainers.text_image_to_text_image.ppo \
  --actor_model_name_or_path ${ACTOR_MODEL_NAME_OR_PATH} \
  --reward_model_name_or_path ${REWARD_MODEL_NAME_OR_PATH} \
  --reward_critic_model_name_or_path ${CRITIC_MODEL_NAME_OR_PATH} \
  --train_datasets ${TRAIN_DATASETS} \
  --train_template spavl_ti2ti \
  --train_data_files ti2ti_llf_prompt_only_tokenize.pt \
  --ptx_template spavl_ti2ti \
  --ptx_data_files ti2ti_ptx_27k.pt \
  --output_dir ${OUTPUT_DIR} \
  --save_interval 30
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "<reserved08706>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
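special_tokens_map.json keeps the Llama-style <s>/</s>/<unk> tokens and adds a <pad> token plus a reserved separator. A hedged sketch of checking that the tokenizer shipped in this folder resolves them (assumes "." is a local clone):

```python
# Assumption: "." is a local checkout containing tokenizer.json and this map.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
print(tok.bos_token, tok.eos_token, tok.unk_token)  # <s> </s> <unk>
print(tok.pad_token, tok.sep_token)                 # <pad> <reserved08706>
```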
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
The diff for this file is too large to render.
wandb/debug-internal.log
ADDED
@@ -0,0 +1,20 @@
{"time":"2024-10-25T18:15:18.987617848Z","level":"INFO","msg":"using version","core version":"0.18.3"}
{"time":"2024-10-25T18:15:18.987649473Z","level":"INFO","msg":"created symlink","path":"../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug-core.log"}
{"time":"2024-10-25T18:15:18.991400712Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
{"time":"2024-10-25T18:15:19.015335603Z","level":"INFO","msg":"created new stream","id":"qbvp2oju"}
{"time":"2024-10-25T18:15:19.015397376Z","level":"INFO","msg":"stream: started","id":"qbvp2oju"}
{"time":"2024-10-25T18:15:19.015408377Z","level":"INFO","msg":"handler: started","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T18:15:19.015432033Z","level":"INFO","msg":"sender: started","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T18:15:19.015437112Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T18:15:20.634593869Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
{"time":"2024-10-25T18:15:20.637814914Z","level":"INFO","msg":"Starting system monitor"}
{"time":"2024-10-25T19:38:13.593466266Z","level":"INFO","msg":"Stopping system monitor"}
{"time":"2024-10-25T19:38:13.627014655Z","level":"INFO","msg":"Stopped system monitor"}
{"time":"2024-10-25T19:38:14.559855674Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
{"time":"2024-10-25T19:38:14.559906183Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
{"time":"2024-10-25T19:38:15.545457735Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
{"time":"2024-10-25T19:38:17.12240115Z","level":"INFO","msg":"stream: closing","id":"qbvp2oju"}
{"time":"2024-10-25T19:38:17.12243525Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T19:38:17.122460489Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T19:38:17.122575437Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T19:38:17.124870943Z","level":"INFO","msg":"stream: closed","id":"qbvp2oju"}
wandb/debug.log
ADDED
@@ -0,0 +1,33 @@
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Configure stats pid to 937440
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
2024-10-25 18:15:18,977 WARNING MainThread:937440 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.ppo
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.ppo'}
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Applying login settings: {}
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_init.py:_log_setup():532] Logging user logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug.log
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:_log_setup():533] Logging internal logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug-internal.log
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():617] calling init triggers
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_prompt_batch_size': 8, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 2, 'actor_gradient_checkpointing': True, 'critic_gradient_checkpointing': True, 'actor_lr': 1e-05, 'actor_lr_scheduler_type': 'cosine', 'actor_lr_warmup_ratio': 0.03, 'actor_weight_decay': 0.01, 'critic_lr': 5e-06, 'critic_lr_scheduler_type': 'constant', 'critic_lr_warmup_ratio': 0.03, 'critic_weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'kl_coeff': 0.02, 'clip_range_ratio': 0.2, 'clip_range_score': 50.0, 'clip_range_value': 5.0, 'ptx_coeff': 16.0, 'gamma': 1.0, 'gae_lambda': 0.95, 'normalize_reward': False, 'update_iters': 1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs', 'train_template': 'spavl_ti2ti', 'train_size': 5000, 'train_split': None, 'train_subset': None, 'train_data_files': 'ti2ti_llf_prompt_only_tokenize.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': [], 'ptx_datasets': None, 'ptx_template': 'spavl_ti2ti', 'ptx_size': None, 'ptx_subset': None, 'ptx_split': None, 'ptx_data_files': 'ti2ti_ptx_27k.pt', 'ptx_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'ppo', 'output_dir': '../outputs/ppo_ti2ti_baseline_1025_with_eval', 'cache_dir': None, 'save_interval': 30.0}, 'model_cfgs': {'actor_model_name_or_path': '/data/align-anything/hantao/models/0916_ti_to_ti_sft', 'reward_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'reward_critic_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'trust_remote_code': True, 'model_max_length': 2048, 'temperature': 1.0, 'top_p': 1.0, 'repetition_penalty': 1.0}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': None}
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():667] starting backend
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():671] sending inform_init request
2024-10-25 18:15:18,982 INFO MainThread:937440 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2024-10-25 18:15:18,983 INFO MainThread:937440 [wandb_init.py:init():684] backend started and connected
2024-10-25 18:15:18,986 INFO MainThread:937440 [wandb_init.py:init():779] updated telemetry
2024-10-25 18:15:18,996 INFO MainThread:937440 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
2024-10-25 18:15:20,628 INFO MainThread:937440 [wandb_init.py:init():863] starting run threads in backend
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_console_start():2465] atexit reg
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2313] redirect: wrap_raw
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2378] Wrapping output streams.
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2403] Redirects installed.
2024-10-25 18:15:20,776 INFO MainThread:937440 [wandb_init.py:init():907] run started, returning control to user process
2024-10-25 19:38:13,587 INFO MainThread:937440 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/qbvp2oju
2024-10-25 19:38:13,590 INFO MainThread:937440 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
2024-10-25 19:38:13,591 INFO MainThread:937440 [wandb_run.py:_restore():2410] restore
2024-10-25 19:38:13,592 INFO MainThread:937440 [wandb_run.py:_restore():2416] restore done
2024-10-25 19:38:17,104 INFO MainThread:937440 [wandb_run.py:_footer_history_summary_info():4049] rendering history
2024-10-25 19:38:17,107 INFO MainThread:937440 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
2024-10-25 19:38:17,119 INFO MainThread:937440 [wandb_run.py:_footer_sync_info():4008] logging synced files
wandb/run-20241025_180620-eoegk43l/files/output.log
ADDED
File without changes
wandb/run-20241025_180620-eoegk43l/files/requirements.txt
ADDED
@@ -0,0 +1,233 @@
align-anything==0.0.1.dev0
torch==2.4.0
pycparser==2.22
torchvision==0.19.0
multiprocess==0.70.16
braceexpand==0.1.7
lm-format-enforcer==0.10.6
Jinja2==3.1.4
scikit-learn==1.5.2
interegular==0.3.3
starlette==0.38.6
huggingface-hub==0.25.2
pyairports==2.1.1
protobuf==3.20.3
term-image==0.7.2
python-dateutil==2.9.0.post0
identify==2.6.1
tokenizers==0.19.1
tensorboard-data-server==0.7.2
numba==0.60.0
ninja==1.11.1.1
nvidia-cuda-cupti-cu12==12.1.105
diskcache==5.6.3
pycountry==24.6.1
py-cpuinfo==9.0.0
scipy==1.14.1
soxr==0.5.0.post1
prometheus-fastapi-instrumentator==7.0.0
align-anything==0.0.1.dev0
virtualenv==20.26.6
hjson==3.1.0
nvidia-cudnn-cu12==9.1.0.70
termcolor==2.5.0
grpcio==1.66.2
wheel==0.44.0
torchlibrosa==0.1.0
numpy==1.26.4
msgpack==1.1.0
rpds-py==0.20.0
annotated-types==0.7.0
pre_commit==4.0.1
aiohttp==3.10.10
audioread==3.0.1
lazy_loader==0.4
nvidia-cuda-runtime-cu12==12.1.105
filelock==3.16.1
timm==0.6.13
anyio==4.6.0
pydantic_core==2.23.4
idna==3.10
fastapi==0.115.0
wandb==0.18.3
packaging==24.1
yt-dlp==2024.8.6
matplotlib==3.9.2
websockets==12.0
triton==3.0.0
zipp==3.20.2
requests==2.32.3
xxhash==3.5.0
image-reward==1.5
pytorch-fid==0.3.0
imageio-ffmpeg==0.5.1
args==0.1.0
llvmlite==0.43.0
peft==0.13.2
openai==1.51.2
httpx==0.27.2
nvidia-cublas-cu12==12.1.3.1
pytest-split==0.8.0
ruff==0.6.9
sniffio==1.3.1
yarl==1.15.0
pandas==2.2.3
fsspec==2024.6.1
gguf==0.10.0
diffusers==0.30.3
platformdirs==4.3.6
nvidia-cuda-nvrtc-cu12==12.1.105
imageio==2.35.1
Brotli==1.1.0
bitsandbytes==0.44.1
hpsv2==1.2.0
lark==1.2.2
gradio==5.0.2
pydantic==2.9.2
pytz==2024.2
jsonschema-specifications==2024.10.1
deepspeed==0.15.2
cloudpickle==3.1.0
distro==1.9.0
aiohappyeyeballs==2.4.3
Markdown==3.7
docker-pycreds==0.4.0
semantic-version==2.10.0
resampy==0.4.3
urllib3==2.2.3
nodeenv==1.9.1
click==8.1.7
accelerate==1.0.1
dill==0.3.8
setproctitle==1.3.3
httpcore==1.0.6
pooch==1.8.2
importlib_metadata==8.5.0
cfgv==3.4.0
einops==0.8.0
shellingham==1.5.4
pytest==7.2.0
python-dotenv==1.0.1
pydub==0.25.1
kiwisolver==1.4.7
aiofiles==23.2.1
vllm==0.6.2
Werkzeug==3.0.4
tensorboard==2.18.0
joblib==1.4.2
pycryptodomex==3.21.0
moviepy==1.0.3
typing_extensions==4.12.2
mdurl==0.1.2
mistral_common==1.4.4
rich==13.9.2
aiosignal==1.3.1
mmsg==0.1.dev20+g585c63a.d20241012
pillow==10.4.0
prometheus_client==0.21.0
nvidia-cusolver-cu12==11.4.5.107
typer==0.12.5
pyzmq==26.2.0
h11==0.14.0
gitdb==4.0.11
transformers==4.44.0.dev0
nvidia-nccl-cu12==2.20.5
jsonschema==4.23.0
soundfile==0.12.1
contourpy==1.3.0
mutagen==1.47.0
regex==2024.9.11
orjson==3.10.7
fairscale==0.4.13
partial-json-parser==0.2.1.1.post4
outlines==0.1.1.dev4+ga2fd35c
nvidia-curand-cu12==10.3.2.106
pluggy==1.5.0
GitPython==3.1.43
tzdata==2024.2
uvicorn==0.31.1
sentencepiece==0.2.0
decorator==4.4.2
nvidia-nvjitlink-cu12==12.6.77
distlib==0.3.9
uvloop==0.20.0
networkx==3.4.1
wcwidth==0.2.13
opencv-python==4.6.0.66
six==1.16.0
httptools==0.6.1
safetensors==0.4.5
nvidia-nvtx-cu12==12.1.105
markdown-it-py==3.0.0
certifi==2024.8.30
sentry-sdk==2.16.0
outlines_core==0.1.0
threadpoolctl==3.5.0
nvidia-cufft-cu12==11.0.2.54
datasets==3.0.1
cycler==0.12.1
psutil==6.0.0
nvidia-cusparse-cu12==12.1.0.106
shortuuid==1.0.13
ffmpy==0.4.0
xformers==0.0.27.post2
MarkupSafe==2.1.5
tqdm==4.66.5
gradio_client==1.4.0
attrs==24.2.0
optree==0.13.0
PyYAML==6.0.2
clint==0.5.1
torchaudio==2.4.0
frechet-audio-distance==0.1.2
frozenlist==1.4.1
clip==0.2.0
multidict==6.1.0
propcache==0.2.0
librosa==0.10.2.post1
webdataset==0.2.100
ray==2.37.0
pyparsing==3.1.4
pyarrow==17.0.0
tiktoken==0.7.0
watchfiles==0.24.0
proglog==0.1.10
cachetools==5.5.0
fonttools==4.54.1
charset-normalizer==3.4.0
ftfy==6.3.0
referencing==0.35.1
mpmath==1.3.0
msgspec==0.18.6
nvidia-ml-py==12.535.161
smmap==5.0.1
absl-py==2.1.0
python-multipart==0.0.12
Pygments==2.18.0
iniconfig==2.0.0
sympy==1.13.3
pip==24.2
airportsdata==20241001
tomlkit==0.12.0
nest-asyncio==1.6.0
setuptools==75.1.0
jiter==0.6.1
cffi==1.17.1
nvitop==1.3.2
backports.tarfile==1.2.0
zipp==3.19.2
inflect==7.3.1
autocommand==2.2.2
importlib_resources==6.4.0
packaging==24.1
jaraco.context==5.3.0
typeguard==4.3.0
more-itertools==10.3.0
jaraco.text==3.12.1
platformdirs==4.2.2
wheel==0.43.0
typing_extensions==4.12.2
importlib_metadata==8.0.0
tomli==2.0.1
jaraco.collections==5.1.0
jaraco.functools==4.0.1
wandb/run-20241025_180620-eoegk43l/files/wandb-metadata.json
ADDED
@@ -0,0 +1,106 @@
{
  "os": "Linux-5.4.0-198-generic-x86_64-with-glibc2.31",
  "python": "3.11.10",
  "startedAt": "2024-10-25T18:06:20.375892Z",
  "args": [
    "--local_rank=0",
    "--actor_model_name_or_path",
    "/data/align-anything/hantao/models/0916_ti_to_ti_sft",
    "--reward_model_name_or_path",
    "/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400",
    "--reward_critic_model_name_or_path",
    "/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400",
    "--train_datasets",
    "/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs",
    "--train_template",
    "spavl_ti2ti",
    "--train_data_files",
    "ti2ti_preference_filtered_tokenize_full.pt",
    "--ptx_template",
    "spavl_ti2ti",
    "--ptx_data_files",
    "ti2ti_ptx_27k.pt",
    "--output_dir",
    "../outputs/ppo_ti2ti_baseline_1025_with_eval",
    "--save_interval",
    "30"
  ],
  "program": "-m align_anything.trainers.text_image_to_text_image.ppo",
  "git": {
    "remote": "https://github.com/PKU-Alignment/align-anything.git",
    "commit": "6fde660afc9985323f147930eedf188a5699adc7"
  },
  "email": "[email protected]",
  "root": "../outputs/ppo_ti2ti_baseline_1025_with_eval",
  "host": "lyg0195",
  "username": "align-anything",
  "executable": "/home/align-anything/miniconda3/envs/hantao_cham/bin/python",
  "cpu_count": 64,
  "cpu_count_logical": 128,
  "gpu": "[NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB]",
  "gpu_count": 8,
  "disk": {
    "/": {
      "total": "938421047296",
      "used": "363102785536"
    }
  },
  "memory": {
    "total": "540647575552"
  },
  "cpu": {
    "count": 64,
    "countLogical": 128
  },
  "gpu_nvidia": [
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    }
  ],
  "cudaVersion": "12.4"
}
wandb/run-20241025_180620-eoegk43l/logs/debug-internal.log
ADDED
@@ -0,0 +1,10 @@
{"time":"2024-10-25T18:06:20.381500476Z","level":"INFO","msg":"using version","core version":"0.18.3"}
{"time":"2024-10-25T18:06:20.381537039Z","level":"INFO","msg":"created symlink","path":"../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_180620-eoegk43l/logs/debug-core.log"}
{"time":"2024-10-25T18:06:20.386238205Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
{"time":"2024-10-25T18:06:20.408793678Z","level":"INFO","msg":"created new stream","id":"eoegk43l"}
{"time":"2024-10-25T18:06:20.408868821Z","level":"INFO","msg":"stream: started","id":"eoegk43l"}
{"time":"2024-10-25T18:06:20.408926558Z","level":"INFO","msg":"sender: started","stream_id":{"value":"eoegk43l"}}
{"time":"2024-10-25T18:06:20.408909461Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"eoegk43l"}}
{"time":"2024-10-25T18:06:20.408921169Z","level":"INFO","msg":"handler: started","stream_id":{"value":"eoegk43l"}}
{"time":"2024-10-25T18:06:21.029852323Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
{"time":"2024-10-25T18:06:21.033290741Z","level":"INFO","msg":"Starting system monitor"}
wandb/run-20241025_180620-eoegk43l/logs/debug.log
ADDED
@@ -0,0 +1,26 @@
2024-10-25 18:06:20,368 INFO MainThread:935352 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2024-10-25 18:06:20,368 INFO MainThread:935352 [wandb_setup.py:_flush():79] Configure stats pid to 935352
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
2024-10-25 18:06:20,369 WARNING MainThread:935352 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.ppo
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.ppo'}
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Applying login settings: {}
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_init.py:_log_setup():532] Logging user logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_180620-eoegk43l/logs/debug.log
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_init.py:_log_setup():533] Logging internal logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_180620-eoegk43l/logs/debug-internal.log
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_init.py:init():617] calling init triggers
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_prompt_batch_size': 8, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 2, 'actor_gradient_checkpointing': True, 'critic_gradient_checkpointing': True, 'actor_lr': 1e-05, 'actor_lr_scheduler_type': 'cosine', 'actor_lr_warmup_ratio': 0.03, 'actor_weight_decay': 0.01, 'critic_lr': 5e-06, 'critic_lr_scheduler_type': 'constant', 'critic_lr_warmup_ratio': 0.03, 'critic_weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'kl_coeff': 0.02, 'clip_range_ratio': 0.2, 'clip_range_score': 50.0, 'clip_range_value': 5.0, 'ptx_coeff': 16.0, 'gamma': 1.0, 'gae_lambda': 0.95, 'normalize_reward': False, 'update_iters': 1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs', 'train_template': 'spavl_ti2ti', 'train_size': 5000, 'train_split': None, 'train_subset': None, 'train_data_files': 'ti2ti_preference_filtered_tokenize_full.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': [], 'ptx_datasets': None, 'ptx_template': 'spavl_ti2ti', 'ptx_size': None, 'ptx_subset': None, 'ptx_split': None, 'ptx_data_files': 'ti2ti_ptx_27k.pt', 'ptx_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'ppo', 'output_dir': '../outputs/ppo_ti2ti_baseline_1025_with_eval', 'cache_dir': None, 'save_interval': 30.0}, 'model_cfgs': {'actor_model_name_or_path': '/data/align-anything/hantao/models/0916_ti_to_ti_sft', 'reward_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'reward_critic_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'trust_remote_code': True, 'model_max_length': 2048, 'temperature': 1.0, 'top_p': 1.0, 'repetition_penalty': 1.0}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': None}
2024-10-25 18:06:20,370 INFO MainThread:935352 [wandb_init.py:init():667] starting backend
2024-10-25 18:06:20,370 INFO MainThread:935352 [wandb_init.py:init():671] sending inform_init request
2024-10-25 18:06:20,374 INFO MainThread:935352 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2024-10-25 18:06:20,375 INFO MainThread:935352 [wandb_init.py:init():684] backend started and connected
2024-10-25 18:06:20,379 INFO MainThread:935352 [wandb_init.py:init():779] updated telemetry
2024-10-25 18:06:20,389 INFO MainThread:935352 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
2024-10-25 18:06:21,024 INFO MainThread:935352 [wandb_init.py:init():863] starting run threads in backend
2024-10-25 18:06:21,186 INFO MainThread:935352 [wandb_run.py:_console_start():2465] atexit reg
2024-10-25 18:06:21,186 INFO MainThread:935352 [wandb_run.py:_redirect():2313] redirect: wrap_raw
2024-10-25 18:06:21,186 INFO MainThread:935352 [wandb_run.py:_redirect():2378] Wrapping output streams.
2024-10-25 18:06:21,186 INFO MainThread:935352 [wandb_run.py:_redirect():2403] Redirects installed.
2024-10-25 18:06:21,189 INFO MainThread:935352 [wandb_init.py:init():907] run started, returning control to user process
wandb/run-20241025_180620-eoegk43l/run-eoegk43l.wandb
ADDED
File without changes
wandb/run-20241025_181518-qbvp2oju/files/config.yaml
ADDED
@@ -0,0 +1,143 @@
_wandb:
  value:
    cli_version: 0.18.3
    m: []
    python_version: 3.11.10
    t:
      "1":
      - 1
      - 11
      - 41
      - 49
      - 51
      - 55
      - 71
      - 83
      - 98
      - 105
      "2":
      - 1
      - 11
      - 41
      - 49
      - 51
      - 55
      - 71
      - 83
      - 98
      - 105
      "3":
      - 2
      - 13
      - 16
      - 23
      - 55
      - 61
      "4": 3.11.10
      "5": 0.18.3
      "6": 4.44.0.dev0
      "8":
      - 5
      "12": 0.18.3
      "13": linux-x86_64
bnb_cfgs:
  value:
    bnb_4bit_compute_dtype: float16
    bnb_4bit_quant_type: nf4
    bnb_4bit_use_double_quant: true
    load_in_4bit: true
    load_in_8bit: false
    use_bnb: false
data_cfgs:
  value:
    eval_data_files: null
    eval_datasets: null
    eval_optional_args: []
    eval_size: null
    eval_split: null
    eval_subset: null
    eval_template: null
    ptx_data_files: ti2ti_ptx_27k.pt
    ptx_datasets: null
    ptx_optional_args: []
    ptx_size: null
    ptx_split: null
    ptx_subset: null
    ptx_template: spavl_ti2ti
    train_data_files: ti2ti_llf_prompt_only_tokenize.pt
    train_datasets: /data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs
    train_optional_args: []
    train_size: 5000
    train_split: null
    train_subset: null
    train_template: spavl_ti2ti
logger_cfgs:
  value:
    cache_dir: null
    log_project: align-anything
    log_run_name: ppo
    log_type: wandb
    output_dir: ../outputs/ppo_ti2ti_baseline_1025_with_eval
    save_interval: 30
lora_cfgs:
  value:
    inference_mode: false
    lora_alpha: 16
    lora_dropout: 0.1
    r: 16
    save_full_model: true
    target_modules:
    - q_proj
    - v_proj
    task_type: TaskType.CAUSAL_LM
    use_lora: false
model_cfgs:
  value:
    actor_model_name_or_path: /data/align-anything/hantao/models/0916_ti_to_ti_sft
    model_max_length: 2048
    repetition_penalty: 1
    reward_critic_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400
    reward_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400
    temperature: 1
    top_p: 1
    trust_remote_code: true
special_tokens:
  value: null
train_cfgs:
  value:
    actor_gradient_checkpointing: true
    actor_lr: 1e-05
    actor_lr_scheduler_type: cosine
    actor_lr_warmup_ratio: 0.03
    actor_weight_decay: 0.01
    adam_betas:
    - 0.9
    - 0.95
    bf16: true
    clip_range_ratio: 0.2
    clip_range_score: 50
    clip_range_value: 5
    critic_gradient_checkpointing: true
    critic_lr: 5e-06
    critic_lr_scheduler_type: constant
    critic_lr_warmup_ratio: 0.03
    critic_weight_decay: 0
    ds_cfgs: ds_z3_config.json
    epochs: 3
    eval_interval: 10
    eval_strategy: epoch
    fp16: false
    freeze_language_model: true
    freeze_mm_proj: true
    freeze_vision_tower: false
    gae_lambda: 0.95
    gamma: 1
    gradient_accumulation_steps: 2
    kl_coeff: 0.02
    normalize_reward: false
    per_device_eval_batch_size: 8
    per_device_prompt_batch_size: 8
    per_device_train_batch_size: 8
    ptx_coeff: 16
    seed: 42
    update_iters: 1
wandb/run-20241025_181518-qbvp2oju/files/output.log
ADDED
@@ -0,0 +1,307 @@
***** Running training *****
Training 1/3 epoch: 0%| | 0/237 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
[2024-10-25 18:23:22,854] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:23:27,196] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:27:32,436] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:27:37,446] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:32:36,133] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:32:41,569] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:36:08,160] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:36:12,414] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:39:02,940] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:39:07,161] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:41:58,177] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:42:02,629] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:43:57,357] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:44:01,125] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
18 |
+
[2024-10-25 18:45:40,196] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
19 |
+
[2024-10-25 18:45:44,071] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
20 |
+
[2024-10-25 18:47:40,184] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
21 |
+
[2024-10-25 18:47:40,185] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[9.908858470377793e-06, 9.908858470377793e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
22 |
+
[2024-10-25 18:47:40,185] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=14.760689632781455, CurrSamplesPerSec=16.90950583092757, MemAllocated=33.11GB, MaxMemAllocated=47.22GB
|
23 |
+
[2024-10-25 18:47:43,802] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
24 |
+
[2024-10-25 18:47:43,802] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
25 |
+
[2024-10-25 18:47:43,803] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=15.436775313806873, CurrSamplesPerSec=17.42788205848213, MemAllocated=33.11GB, MaxMemAllocated=47.22GB
|
26 |
+
[2024-10-25 18:48:48,658] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
27 |
+
[2024-10-25 18:48:52,182] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
28 |
+
[2024-10-25 18:49:14,049] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
29 |
+
[2024-10-25 18:49:17,288] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
30 |
+
[2024-10-25 18:49:38,002] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
31 |
+
[2024-10-25 18:49:41,318] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
32 |
+
[2024-10-25 18:50:04,280] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
33 |
+
[2024-10-25 18:50:07,681] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
34 |
+
Saving checkpoint at step 30 ...
|
35 |
+
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
|
36 |
+
Saving 16-bit model...
|
37 |
+
[2024-10-25 18:50:21,078] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step15 is about to be saved!
|
38 |
+
[2024-10-25 18:50:21,079] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin, tag: global_step15
|
39 |
+
[2024-10-25 18:50:21,079] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin...
|
40 |
+
[2024-10-25 18:50:36,754] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin.
|
41 |
+
[2024-10-25 18:50:36,755] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step15 is ready now!
|
42 |
+
Model saved!
|
43 |
+
Saving 16-bit model...
|
44 |
+
[2024-10-25 18:50:47,513] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step15 is about to be saved!
|
45 |
+
[2024-10-25 18:50:47,514] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin, tag: global_step15
|
46 |
+
[2024-10-25 18:50:47,514] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin...
|
47 |
+
[2024-10-25 18:51:06,021] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin.
|
48 |
+
[2024-10-25 18:51:06,021] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step15 is ready now!
|
49 |
+
Model saved!
|
50 |
+
Model saved!
|
51 |
+
Checkpoint saved.
|
52 |
+
[2024-10-25 18:51:27,052] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
53 |
+
[2024-10-25 18:51:30,347] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
54 |
+
[2024-10-25 18:51:50,999] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
55 |
+
[2024-10-25 18:51:54,300] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
56 |
+
[2024-10-25 18:52:19,971] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
57 |
+
[2024-10-25 18:52:23,379] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
58 |
+
[2024-10-25 18:52:45,642] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
59 |
+
[2024-10-25 18:52:48,945] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
60 |
+
[2024-10-25 18:53:09,437] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
61 |
+
[2024-10-25 18:53:09,438] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[9.470431355738257e-06, 9.470431355738257e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
62 |
+
[2024-10-25 18:53:09,438] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=16.735259629101844, CurrSamplesPerSec=18.7639256774801, MemAllocated=33.18GB, MaxMemAllocated=47.22GB
|
63 |
+
[2024-10-25 18:53:12,725] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
64 |
+
[2024-10-25 18:53:12,725] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
65 |
+
[2024-10-25 18:53:12,726] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=17.458411197853028, CurrSamplesPerSec=19.786118059003716, MemAllocated=33.18GB, MaxMemAllocated=47.22GB
|
66 |
+
[2024-10-25 18:54:19,714] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
67 |
+
[2024-10-25 18:54:23,021] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
68 |
+
[2024-10-25 18:55:07,181] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
69 |
+
[2024-10-25 18:55:10,462] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
70 |
+
[2024-10-25 18:55:30,696] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
71 |
+
[2024-10-25 18:55:34,030] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
72 |
+
[2024-10-25 18:55:54,073] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
73 |
+
[2024-10-25 18:55:57,348] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
74 |
+
[2024-10-25 18:57:04,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[8.70045279830626e-06, 8.70045279830626e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
75 |
+
[2024-10-25 18:57:04,749] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=17.50081132260227, CurrSamplesPerSec=19.186032450205442, MemAllocated=33.15GB, MaxMemAllocated=47.22GB
|
76 |
+
[2024-10-25 18:57:08,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
77 |
+
[2024-10-25 18:57:08,051] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=18.204578219677924, CurrSamplesPerSec=19.780692782398315, MemAllocated=33.15GB, MaxMemAllocated=47.22GB
|
78 |
+
Saving checkpoint at step 60 ...
|
79 |
+
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
|
80 |
+
Saving 16-bit model...
|
81 |
+
[2024-10-25 18:57:21,740] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step30 is about to be saved!
|
82 |
+
[2024-10-25 18:57:21,741] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin, tag: global_step30
|
83 |
+
[2024-10-25 18:57:21,741] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin...
|
84 |
+
[2024-10-25 18:57:38,185] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin.
|
85 |
+
[2024-10-25 18:57:38,186] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step30 is ready now!
|
86 |
+
Model saved!
|
87 |
+
Saving 16-bit model...
|
88 |
+
[2024-10-25 18:57:46,170] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step30 is about to be saved!
|
89 |
+
[2024-10-25 18:57:46,171] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin, tag: global_step30
|
90 |
+
[2024-10-25 18:57:46,171] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin...
|
91 |
+
[2024-10-25 18:58:04,694] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin.
|
92 |
+
[2024-10-25 18:58:04,694] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step30 is ready now!
|
93 |
+
Model saved!
|
94 |
+
Model saved!
|
95 |
+
Checkpoint saved.
|
96 |
+
[2024-10-25 18:59:12,139] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
97 |
+
[2024-10-25 18:59:15,443] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
98 |
+
[2024-10-25 18:59:35,560] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
99 |
+
[2024-10-25 18:59:38,857] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
100 |
+
[2024-10-25 19:01:48,333] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[7.656028585269017e-06, 7.656028585269017e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
101 |
+
[2024-10-25 19:01:48,334] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=18.027035548451977, CurrSamplesPerSec=29.34925562325487, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
|
102 |
+
[2024-10-25 19:01:51,645] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
103 |
+
[2024-10-25 19:01:51,646] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=18.74402073106821, CurrSamplesPerSec=30.575559928357755, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
|
104 |
+
[2024-10-25 19:02:12,030] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
105 |
+
[2024-10-25 19:02:15,280] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
106 |
+
[2024-10-25 19:02:58,678] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
107 |
+
[2024-10-25 19:03:01,948] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
108 |
+
[2024-10-25 19:03:22,304] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
109 |
+
[2024-10-25 19:03:25,588] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
110 |
+
[2024-10-25 19:03:45,571] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
111 |
+
[2024-10-25 19:03:48,886] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
112 |
+
Saving checkpoint at step 90 ...
|
113 |
+
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
|
114 |
+
Saving 16-bit model...
|
115 |
+
[2024-10-25 19:04:00,883] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step45 is about to be saved!
|
116 |
+
[2024-10-25 19:04:00,885] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin, tag: global_step45
|
117 |
+
[2024-10-25 19:04:00,885] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin...
|
118 |
+
[2024-10-25 19:04:18,000] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin.
|
119 |
+
[2024-10-25 19:04:18,001] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step45 is ready now!
|
120 |
+
Model saved!
|
121 |
+
Saving 16-bit model...
|
122 |
+
[2024-10-25 19:04:26,278] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step45 is about to be saved!
|
123 |
+
[2024-10-25 19:04:26,279] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin, tag: global_step45
|
124 |
+
[2024-10-25 19:04:26,279] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin...
|
125 |
+
[2024-10-25 19:04:45,735] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin.
|
126 |
+
[2024-10-25 19:04:45,737] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step45 is ready now!
|
127 |
+
Model saved!
|
128 |
+
Model saved!
|
129 |
+
Checkpoint saved.
|
130 |
+
[2024-10-25 19:06:15,770] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
131 |
+
[2024-10-25 19:06:19,105] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
132 |
+
[2024-10-25 19:06:39,673] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
133 |
+
[2024-10-25 19:06:39,674] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[6.41461888258465e-06, 6.41461888258465e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
134 |
+
[2024-10-25 19:06:39,675] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=18.196574738389252, CurrSamplesPerSec=18.16333305616454, MemAllocated=33.11GB, MaxMemAllocated=47.22GB
|
135 |
+
[2024-10-25 19:06:43,151] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
136 |
+
[2024-10-25 19:06:43,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
137 |
+
[2024-10-25 19:06:43,153] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=18.924891588094603, CurrSamplesPerSec=19.22042501406237, MemAllocated=33.11GB, MaxMemAllocated=47.22GB
|
138 |
+
[2024-10-25 19:08:14,506] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
139 |
+
[2024-10-25 19:08:17,754] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
140 |
+
[2024-10-25 19:09:25,071] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
141 |
+
[2024-10-25 19:09:28,498] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
142 |
+
[2024-10-25 19:10:35,663] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
143 |
+
[2024-10-25 19:10:35,664] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[5.068293368829755e-06, 5.068293368829755e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
144 |
+
[2024-10-25 19:10:35,665] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=18.322746746684054, CurrSamplesPerSec=19.371088818318672, MemAllocated=33.14GB, MaxMemAllocated=47.22GB
|
145 |
+
[2024-10-25 19:10:38,940] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
146 |
+
[2024-10-25 19:10:38,940] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
147 |
+
[2024-10-25 19:10:38,941] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=19.029045377480585, CurrSamplesPerSec=19.888041858045842, MemAllocated=33.14GB, MaxMemAllocated=47.22GB
|
148 |
+
Saving checkpoint at step 120 ...
|
149 |
+
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
|
150 |
+
Saving 16-bit model...
|
151 |
+
[2024-10-25 19:10:54,426] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step60 is about to be saved!
|
152 |
+
[2024-10-25 19:10:54,428] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin, tag: global_step60
|
153 |
+
[2024-10-25 19:10:54,428] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin...
|
154 |
+
[2024-10-25 19:11:13,388] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin.
|
155 |
+
[2024-10-25 19:11:13,390] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step60 is ready now!
|
156 |
+
Model saved!
|
157 |
+
Saving 16-bit model...
|
158 |
+
[2024-10-25 19:11:22,464] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step60 is about to be saved!
|
159 |
+
[2024-10-25 19:11:22,465] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin, tag: global_step60
|
160 |
+
[2024-10-25 19:11:22,466] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin...
|
161 |
+
[2024-10-25 19:11:39,535] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin.
|
162 |
+
[2024-10-25 19:11:39,535] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step60 is ready now!
|
163 |
+
Model saved!
|
164 |
+
Model saved!
|
165 |
+
Checkpoint saved.
|
166 |
+
[2024-10-25 19:11:59,374] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
167 |
+
[2024-10-25 19:12:02,660] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
168 |
+
[2024-10-25 19:12:22,755] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
169 |
+
[2024-10-25 19:12:26,038] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
170 |
+
[2024-10-25 19:12:46,474] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
171 |
+
[2024-10-25 19:12:49,933] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
172 |
+
[2024-10-25 19:13:10,102] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
173 |
+
[2024-10-25 19:13:13,472] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
174 |
+
[2024-10-25 19:14:20,491] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
175 |
+
[2024-10-25 19:14:23,905] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
176 |
+
[2024-10-25 19:15:07,328] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
177 |
+
[2024-10-25 19:15:10,628] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
178 |
+
[2024-10-25 19:15:30,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[3.7169028483301333e-06, 3.7169028483301333e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
179 |
+
[2024-10-25 19:15:30,619] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=18.421273695026542, CurrSamplesPerSec=19.229435871608736, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
|
180 |
+
[2024-10-25 19:15:33,910] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
181 |
+
[2024-10-25 19:15:33,911] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=19.12443035480107, CurrSamplesPerSec=19.765868285523876, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
|
182 |
+
[2024-10-25 19:15:53,825] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
183 |
+
[2024-10-25 19:15:57,143] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
184 |
+
[2024-10-25 19:16:17,587] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
185 |
+
[2024-10-25 19:16:20,871] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
186 |
+
[2024-10-25 19:16:40,921] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
187 |
+
[2024-10-25 19:16:44,213] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
188 |
+
[2024-10-25 19:17:04,343] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
189 |
+
[2024-10-25 19:17:07,629] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
190 |
+
Saving checkpoint at step 150 ...
|
191 |
+
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
|
192 |
+
Saving 16-bit model...
|
193 |
+
[2024-10-25 19:17:42,434] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step75 is about to be saved!
|
194 |
+
[2024-10-25 19:17:42,436] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin, tag: global_step75
|
195 |
+
[2024-10-25 19:17:42,436] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin...
|
196 |
+
[2024-10-25 19:18:02,484] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin.
|
197 |
+
[2024-10-25 19:18:02,486] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step75 is ready now!
|
198 |
+
Model saved!
|
199 |
+
Saving 16-bit model...
|
200 |
+
[2024-10-25 19:18:11,754] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step75 is about to be saved!
|
201 |
+
[2024-10-25 19:18:11,755] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin, tag: global_step75
|
202 |
+
[2024-10-25 19:18:11,755] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin...
|
203 |
+
[2024-10-25 19:18:28,942] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin.
|
204 |
+
[2024-10-25 19:18:28,944] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step75 is ready now!
|
205 |
+
Model saved!
|
206 |
+
Model saved!
|
207 |
+
Checkpoint saved.
|
208 |
+
[2024-10-25 19:18:48,635] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
209 |
+
[2024-10-25 19:18:51,897] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
210 |
+
[2024-10-25 19:20:14,068] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
211 |
+
[2024-10-25 19:20:14,069] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[2.4606737737909696e-06, 2.4606737737909696e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
212 |
+
[2024-10-25 19:20:14,070] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=18.583796025985635, CurrSamplesPerSec=18.851139953128662, MemAllocated=33.29GB, MaxMemAllocated=47.22GB
|
213 |
+
[2024-10-25 19:20:17,500] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
214 |
+
[2024-10-25 19:20:17,500] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
215 |
+
[2024-10-25 19:20:17,501] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=19.286477338971316, CurrSamplesPerSec=19.326753490145233, MemAllocated=33.29GB, MaxMemAllocated=47.22GB
|
216 |
+
[2024-10-25 19:21:24,498] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
217 |
+
[2024-10-25 19:21:27,888] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
218 |
+
[2024-10-25 19:21:47,842] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:21:51,137] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:22:11,123] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:22:14,405] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:22:57,887] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:23:01,180] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:23:44,745] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:23:48,058] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:24:08,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[1.3927749088052218e-06, 1.3927749088052218e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-25 19:24:08,921] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=18.631064887248623, CurrSamplesPerSec=18.182446863655244, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
[2024-10-25 19:24:12,425] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-25 19:24:12,426] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=19.325484075258544, CurrSamplesPerSec=18.68740115377941, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
Saving checkpoint at step 180 ...
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
Saving 16-bit model...
[2024-10-25 19:24:23,716] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step90 is about to be saved!
[2024-10-25 19:24:23,717] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin, tag: global_step90
[2024-10-25 19:24:23,717] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin...
[2024-10-25 19:24:41,475] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin.
[2024-10-25 19:24:41,476] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step90 is ready now!
Model saved!
Saving 16-bit model...
[2024-10-25 19:24:50,478] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step90 is about to be saved!
[2024-10-25 19:24:50,479] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin, tag: global_step90
[2024-10-25 19:24:50,480] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin...
[2024-10-25 19:25:12,036] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin.
[2024-10-25 19:25:12,039] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step90 is ready now!
Model saved!
Model saved!
Checkpoint saved.
[2024-10-25 19:27:06,252] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:27:09,573] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:28:40,360] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:28:43,635] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:29:03,608] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:29:03,610] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[5.924074268766422e-07, 5.924074268766422e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-25 19:29:03,610] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=18.66696592651746, CurrSamplesPerSec=19.246283042032882, MemAllocated=33.12GB, MaxMemAllocated=47.22GB
[2024-10-25 19:29:06,852] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:29:06,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-25 19:29:06,853] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=19.358645620408343, CurrSamplesPerSec=20.0045335178505, MemAllocated=33.12GB, MaxMemAllocated=47.22GB
[2024-10-25 19:30:37,368] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:30:40,629] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:31:00,782] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:31:04,118] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
Saving checkpoint at step 210 ...
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
Saving 16-bit model...
[2024-10-25 19:31:14,905] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step105 is about to be saved!
[2024-10-25 19:31:14,906] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin, tag: global_step105
[2024-10-25 19:31:14,907] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin...
[2024-10-25 19:31:30,468] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin.
[2024-10-25 19:31:30,471] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step105 is ready now!
Model saved!
Saving 16-bit model...
[2024-10-25 19:31:37,840] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step105 is about to be saved!
[2024-10-25 19:31:37,842] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin, tag: global_step105
[2024-10-25 19:31:37,842] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin...
[2024-10-25 19:31:59,787] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin.
[2024-10-25 19:31:59,790] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step105 is ready now!
Model saved!
Model saved!
Checkpoint saved.
[2024-10-25 19:33:53,254] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[1.1893092270227724e-07, 1.1893092270227724e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-25 19:33:53,255] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=18.708463115192874, CurrSamplesPerSec=19.308977655910134, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
[2024-10-25 19:33:56,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-25 19:33:56,504] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=19.393453277752876, CurrSamplesPerSec=19.924871782255472, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
[2024-10-25 19:34:40,368] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:34:43,678] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:35:03,709] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:35:07,004] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:35:50,234] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:35:53,480] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:36:13,581] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:36:16,864] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
Saving 16-bit model...
[2024-10-25 19:37:19,241] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step118 is about to be saved!
[2024-10-25 19:37:19,242] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin, tag: global_step118
[2024-10-25 19:37:19,243] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin...
[2024-10-25 19:37:40,063] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin.
[2024-10-25 19:37:40,065] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step118 is ready now!
Model saved!
Saving 16-bit model...
[2024-10-25 19:37:49,384] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step118 is about to be saved!
[2024-10-25 19:37:49,385] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin, tag: global_step118
[2024-10-25 19:37:49,386] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin...
[2024-10-25 19:38:13,508] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin.
[2024-10-25 19:38:13,511] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step118 is ready now!
Model saved!
Model saved!
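Note: the repeated stage3.py warning in the log above suggests one mitigation each time, namely flushing the allocator cache on all ranks at the same point in the loop via get_accelerator().empty_cache(). A minimal sketch of what that could look like is below; the loop structure and names (train, dataloader, the flush interval) are illustrative assumptions, not taken from this run's code.

    # Hypothetical sketch of the mitigation suggested by the DeepSpeed warning:
    # flush the CUDA allocator cache on every rank at the same step.
    import deepspeed
    from deepspeed.accelerator import get_accelerator

    def train(engine: deepspeed.DeepSpeedEngine, dataloader):
        for step, batch in enumerate(dataloader):
            loss = engine(**batch).loss   # forward pass; model-specific, illustrative
            engine.backward(loss)
            engine.step()
            # Empty the allocator cache periodically so all ranks flush together
            # instead of triggering mid-step flushes under memory pressure.
            if step % 10 == 0:
                get_accelerator().empty_cache()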
wandb/run-20241025_181518-qbvp2oju/files/requirements.txt
ADDED
@@ -0,0 +1,233 @@
align-anything==0.0.1.dev0
torch==2.4.0
pycparser==2.22
torchvision==0.19.0
multiprocess==0.70.16
braceexpand==0.1.7
lm-format-enforcer==0.10.6
Jinja2==3.1.4
scikit-learn==1.5.2
interegular==0.3.3
starlette==0.38.6
huggingface-hub==0.25.2
pyairports==2.1.1
protobuf==3.20.3
term-image==0.7.2
python-dateutil==2.9.0.post0
identify==2.6.1
tokenizers==0.19.1
tensorboard-data-server==0.7.2
numba==0.60.0
ninja==1.11.1.1
nvidia-cuda-cupti-cu12==12.1.105
diskcache==5.6.3
pycountry==24.6.1
py-cpuinfo==9.0.0
scipy==1.14.1
soxr==0.5.0.post1
prometheus-fastapi-instrumentator==7.0.0
align-anything==0.0.1.dev0
virtualenv==20.26.6
hjson==3.1.0
nvidia-cudnn-cu12==9.1.0.70
termcolor==2.5.0
grpcio==1.66.2
wheel==0.44.0
torchlibrosa==0.1.0
numpy==1.26.4
msgpack==1.1.0
rpds-py==0.20.0
annotated-types==0.7.0
pre_commit==4.0.1
aiohttp==3.10.10
audioread==3.0.1
lazy_loader==0.4
nvidia-cuda-runtime-cu12==12.1.105
filelock==3.16.1
timm==0.6.13
anyio==4.6.0
pydantic_core==2.23.4
idna==3.10
fastapi==0.115.0
wandb==0.18.3
packaging==24.1
yt-dlp==2024.8.6
matplotlib==3.9.2
websockets==12.0
triton==3.0.0
zipp==3.20.2
requests==2.32.3
xxhash==3.5.0
image-reward==1.5
pytorch-fid==0.3.0
imageio-ffmpeg==0.5.1
args==0.1.0
llvmlite==0.43.0
peft==0.13.2
openai==1.51.2
httpx==0.27.2
nvidia-cublas-cu12==12.1.3.1
pytest-split==0.8.0
ruff==0.6.9
sniffio==1.3.1
yarl==1.15.0
pandas==2.2.3
fsspec==2024.6.1
gguf==0.10.0
diffusers==0.30.3
platformdirs==4.3.6
nvidia-cuda-nvrtc-cu12==12.1.105
imageio==2.35.1
Brotli==1.1.0
bitsandbytes==0.44.1
hpsv2==1.2.0
lark==1.2.2
gradio==5.0.2
pydantic==2.9.2
pytz==2024.2
jsonschema-specifications==2024.10.1
deepspeed==0.15.2
cloudpickle==3.1.0
distro==1.9.0
aiohappyeyeballs==2.4.3
Markdown==3.7
docker-pycreds==0.4.0
semantic-version==2.10.0
resampy==0.4.3
urllib3==2.2.3
nodeenv==1.9.1
click==8.1.7
accelerate==1.0.1
dill==0.3.8
setproctitle==1.3.3
httpcore==1.0.6
pooch==1.8.2
importlib_metadata==8.5.0
cfgv==3.4.0
einops==0.8.0
shellingham==1.5.4
pytest==7.2.0
python-dotenv==1.0.1
pydub==0.25.1
kiwisolver==1.4.7
aiofiles==23.2.1
vllm==0.6.2
Werkzeug==3.0.4
tensorboard==2.18.0
joblib==1.4.2
pycryptodomex==3.21.0
moviepy==1.0.3
typing_extensions==4.12.2
mdurl==0.1.2
mistral_common==1.4.4
rich==13.9.2
aiosignal==1.3.1
mmsg==0.1.dev20+g585c63a.d20241012
pillow==10.4.0
prometheus_client==0.21.0
nvidia-cusolver-cu12==11.4.5.107
typer==0.12.5
pyzmq==26.2.0
h11==0.14.0
gitdb==4.0.11
transformers==4.44.0.dev0
nvidia-nccl-cu12==2.20.5
jsonschema==4.23.0
soundfile==0.12.1
contourpy==1.3.0
mutagen==1.47.0
regex==2024.9.11
orjson==3.10.7
fairscale==0.4.13
partial-json-parser==0.2.1.1.post4
outlines==0.1.1.dev4+ga2fd35c
nvidia-curand-cu12==10.3.2.106
pluggy==1.5.0
GitPython==3.1.43
tzdata==2024.2
uvicorn==0.31.1
sentencepiece==0.2.0
decorator==4.4.2
nvidia-nvjitlink-cu12==12.6.77
distlib==0.3.9
uvloop==0.20.0
networkx==3.4.1
wcwidth==0.2.13
opencv-python==4.6.0.66
six==1.16.0
httptools==0.6.1
safetensors==0.4.5
nvidia-nvtx-cu12==12.1.105
markdown-it-py==3.0.0
certifi==2024.8.30
sentry-sdk==2.16.0
outlines_core==0.1.0
threadpoolctl==3.5.0
nvidia-cufft-cu12==11.0.2.54
datasets==3.0.1
cycler==0.12.1
psutil==6.0.0
nvidia-cusparse-cu12==12.1.0.106
shortuuid==1.0.13
ffmpy==0.4.0
xformers==0.0.27.post2
MarkupSafe==2.1.5
tqdm==4.66.5
gradio_client==1.4.0
attrs==24.2.0
optree==0.13.0
PyYAML==6.0.2
clint==0.5.1
torchaudio==2.4.0
frechet-audio-distance==0.1.2
frozenlist==1.4.1
clip==0.2.0
multidict==6.1.0
propcache==0.2.0
librosa==0.10.2.post1
webdataset==0.2.100
ray==2.37.0
pyparsing==3.1.4
pyarrow==17.0.0
tiktoken==0.7.0
watchfiles==0.24.0
proglog==0.1.10
cachetools==5.5.0
fonttools==4.54.1
charset-normalizer==3.4.0
ftfy==6.3.0
referencing==0.35.1
mpmath==1.3.0
msgspec==0.18.6
nvidia-ml-py==12.535.161
smmap==5.0.1
absl-py==2.1.0
python-multipart==0.0.12
Pygments==2.18.0
iniconfig==2.0.0
sympy==1.13.3
pip==24.2
airportsdata==20241001
tomlkit==0.12.0
nest-asyncio==1.6.0
setuptools==75.1.0
jiter==0.6.1
cffi==1.17.1
nvitop==1.3.2
backports.tarfile==1.2.0
zipp==3.19.2
inflect==7.3.1
autocommand==2.2.2
importlib_resources==6.4.0
packaging==24.1
jaraco.context==5.3.0
typeguard==4.3.0
more-itertools==10.3.0
jaraco.text==3.12.1
platformdirs==4.2.2
wheel==0.43.0
typing_extensions==4.12.2
importlib_metadata==8.0.0
tomli==2.0.1
jaraco.collections==5.1.0
jaraco.functools==4.0.1
wandb/run-20241025_181518-qbvp2oju/files/wandb-metadata.json
ADDED
@@ -0,0 +1,106 @@
{
  "os": "Linux-5.4.0-198-generic-x86_64-with-glibc2.31",
  "python": "3.11.10",
  "startedAt": "2024-10-25T18:15:18.983727Z",
  "args": [
    "--local_rank=0",
    "--actor_model_name_or_path",
    "/data/align-anything/hantao/models/0916_ti_to_ti_sft",
    "--reward_model_name_or_path",
    "/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400",
    "--reward_critic_model_name_or_path",
    "/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400",
    "--train_datasets",
    "/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs",
    "--train_template",
    "spavl_ti2ti",
    "--train_data_files",
    "ti2ti_llf_prompt_only_tokenize.pt",
    "--ptx_template",
    "spavl_ti2ti",
    "--ptx_data_files",
    "ti2ti_ptx_27k.pt",
    "--output_dir",
    "../outputs/ppo_ti2ti_baseline_1025_with_eval",
    "--save_interval",
    "30"
  ],
  "program": "-m align_anything.trainers.text_image_to_text_image.ppo",
  "git": {
    "remote": "https://github.com/PKU-Alignment/align-anything.git",
    "commit": "6fde660afc9985323f147930eedf188a5699adc7"
  },
  "email": "[email protected]",
  "root": "../outputs/ppo_ti2ti_baseline_1025_with_eval",
  "host": "lyg0195",
  "username": "align-anything",
  "executable": "/home/align-anything/miniconda3/envs/hantao_cham/bin/python",
  "cpu_count": 64,
  "cpu_count_logical": 128,
  "gpu": "[NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB]",
  "gpu_count": 8,
  "disk": {
    "/": {
      "total": "938421047296",
      "used": "363102883840"
    }
  },
  "memory": {
    "total": "540647575552"
  },
  "cpu": {
    "count": 64,
    "countLogical": 128
  },
  "gpu_nvidia": [
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    }
  ],
  "cudaVersion": "12.4"
}
wandb/run-20241025_181518-qbvp2oju/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
{"train/reward_critic_lr":5e-06,"train/reward_with_kl_penalty":-0.1957259476184845,"_wandb":{"runtime":4974},"_timestamp":1.729885027561649e+09,"train/reward_value":-0.89080810546875,"train/mean_generated_length":1,"train/actor_lr":0,"_step":236,"train/actor_loss":-0.6950821280479431,"train/max_generated_length":1,"train/kl_divergence":4.3175482749938965,"_runtime":4974.609715617,"train/reward_advantage":0.6950821280479431,"train/reward_return":-0.1957259476184845,"train/reward":-0.109375,"train/step":236,"train/reward_critic_loss":0.5039339065551758}
wandb/run-20241025_181518-qbvp2oju/logs/debug-internal.log
ADDED
@@ -0,0 +1,20 @@
{"time":"2024-10-25T18:15:18.987617848Z","level":"INFO","msg":"using version","core version":"0.18.3"}
{"time":"2024-10-25T18:15:18.987649473Z","level":"INFO","msg":"created symlink","path":"../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug-core.log"}
{"time":"2024-10-25T18:15:18.991400712Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
{"time":"2024-10-25T18:15:19.015335603Z","level":"INFO","msg":"created new stream","id":"qbvp2oju"}
{"time":"2024-10-25T18:15:19.015397376Z","level":"INFO","msg":"stream: started","id":"qbvp2oju"}
{"time":"2024-10-25T18:15:19.015408377Z","level":"INFO","msg":"handler: started","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T18:15:19.015432033Z","level":"INFO","msg":"sender: started","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T18:15:19.015437112Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T18:15:20.634593869Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
{"time":"2024-10-25T18:15:20.637814914Z","level":"INFO","msg":"Starting system monitor"}
{"time":"2024-10-25T19:38:13.593466266Z","level":"INFO","msg":"Stopping system monitor"}
{"time":"2024-10-25T19:38:13.627014655Z","level":"INFO","msg":"Stopped system monitor"}
{"time":"2024-10-25T19:38:14.559855674Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
{"time":"2024-10-25T19:38:14.559906183Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
{"time":"2024-10-25T19:38:15.545457735Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
{"time":"2024-10-25T19:38:17.12240115Z","level":"INFO","msg":"stream: closing","id":"qbvp2oju"}
{"time":"2024-10-25T19:38:17.12243525Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T19:38:17.122460489Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T19:38:17.122575437Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T19:38:17.124870943Z","level":"INFO","msg":"stream: closed","id":"qbvp2oju"}
wandb/run-20241025_181518-qbvp2oju/logs/debug.log
ADDED
@@ -0,0 +1,33 @@
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Configure stats pid to 937440
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
2024-10-25 18:15:18,977 WARNING MainThread:937440 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.ppo
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.ppo'}
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Applying login settings: {}
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_init.py:_log_setup():532] Logging user logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug.log
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:_log_setup():533] Logging internal logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug-internal.log
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():617] calling init triggers
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_prompt_batch_size': 8, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 2, 'actor_gradient_checkpointing': True, 'critic_gradient_checkpointing': True, 'actor_lr': 1e-05, 'actor_lr_scheduler_type': 'cosine', 'actor_lr_warmup_ratio': 0.03, 'actor_weight_decay': 0.01, 'critic_lr': 5e-06, 'critic_lr_scheduler_type': 'constant', 'critic_lr_warmup_ratio': 0.03, 'critic_weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'kl_coeff': 0.02, 'clip_range_ratio': 0.2, 'clip_range_score': 50.0, 'clip_range_value': 5.0, 'ptx_coeff': 16.0, 'gamma': 1.0, 'gae_lambda': 0.95, 'normalize_reward': False, 'update_iters': 1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs', 'train_template': 'spavl_ti2ti', 'train_size': 5000, 'train_split': None, 'train_subset': None, 'train_data_files': 'ti2ti_llf_prompt_only_tokenize.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': [], 'ptx_datasets': None, 'ptx_template': 'spavl_ti2ti', 'ptx_size': None, 'ptx_subset': None, 'ptx_split': None, 'ptx_data_files': 'ti2ti_ptx_27k.pt', 'ptx_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'ppo', 'output_dir': '../outputs/ppo_ti2ti_baseline_1025_with_eval', 'cache_dir': None, 'save_interval': 30.0}, 'model_cfgs': {'actor_model_name_or_path': '/data/align-anything/hantao/models/0916_ti_to_ti_sft', 'reward_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'reward_critic_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'trust_remote_code': True, 'model_max_length': 2048, 'temperature': 1.0, 'top_p': 1.0, 'repetition_penalty': 1.0}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': None}
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():667] starting backend
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():671] sending inform_init request
2024-10-25 18:15:18,982 INFO MainThread:937440 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2024-10-25 18:15:18,983 INFO MainThread:937440 [wandb_init.py:init():684] backend started and connected
2024-10-25 18:15:18,986 INFO MainThread:937440 [wandb_init.py:init():779] updated telemetry
2024-10-25 18:15:18,996 INFO MainThread:937440 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
2024-10-25 18:15:20,628 INFO MainThread:937440 [wandb_init.py:init():863] starting run threads in backend
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_console_start():2465] atexit reg
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2313] redirect: wrap_raw
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2378] Wrapping output streams.
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2403] Redirects installed.
2024-10-25 18:15:20,776 INFO MainThread:937440 [wandb_init.py:init():907] run started, returning control to user process
2024-10-25 19:38:13,587 INFO MainThread:937440 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/qbvp2oju
2024-10-25 19:38:13,590 INFO MainThread:937440 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
2024-10-25 19:38:13,591 INFO MainThread:937440 [wandb_run.py:_restore():2410] restore
2024-10-25 19:38:13,592 INFO MainThread:937440 [wandb_run.py:_restore():2416] restore done
2024-10-25 19:38:17,104 INFO MainThread:937440 [wandb_run.py:_footer_history_summary_info():4049] rendering history
2024-10-25 19:38:17,107 INFO MainThread:937440 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
2024-10-25 19:38:17,119 INFO MainThread:937440 [wandb_run.py:_footer_sync_info():4008] logging synced files
wandb/run-20241025_181518-qbvp2oju/run-qbvp2oju.wandb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:76ef46f86b21cc7b1a13903cb2d0aa221a447ddcdb4aff5e95e115b373ce98a4
size 4642995