Upload folder using huggingface_hub
- .gitattributes +1 -0
- arguments.yaml +94 -0
- config.json +0 -0
- environ.txt +56 -0
- preprocessor_config.json +28 -0
- processor_config.json +5 -0
- pytorch_model.bin +3 -0
- script.sh +50 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +0 -0
- wandb/debug-internal.log +20 -0
- wandb/debug.log +33 -0
- wandb/run-20241025_180620-eoegk43l/files/output.log +0 -0
- wandb/run-20241025_180620-eoegk43l/files/requirements.txt +233 -0
- wandb/run-20241025_180620-eoegk43l/files/wandb-metadata.json +106 -0
- wandb/run-20241025_180620-eoegk43l/logs/debug-internal.log +10 -0
- wandb/run-20241025_180620-eoegk43l/logs/debug.log +26 -0
- wandb/run-20241025_180620-eoegk43l/run-eoegk43l.wandb +0 -0
- wandb/run-20241025_181518-qbvp2oju/files/config.yaml +143 -0
- wandb/run-20241025_181518-qbvp2oju/files/output.log +307 -0
- wandb/run-20241025_181518-qbvp2oju/files/requirements.txt +233 -0
- wandb/run-20241025_181518-qbvp2oju/files/wandb-metadata.json +106 -0
- wandb/run-20241025_181518-qbvp2oju/files/wandb-summary.json +1 -0
- wandb/run-20241025_181518-qbvp2oju/logs/debug-internal.log +20 -0
- wandb/run-20241025_181518-qbvp2oju/logs/debug.log +33 -0
- wandb/run-20241025_181518-qbvp2oju/run-qbvp2oju.wandb +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/run-20241025_181518-qbvp2oju/run-qbvp2oju.wandb filter=lfs diff=lfs merge=lfs -text
arguments.yaml
ADDED
@@ -0,0 +1,94 @@
bnb_cfgs:
  bnb_4bit_compute_dtype: float16
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: true
  load_in_4bit: true
  load_in_8bit: false
  use_bnb: false
data_cfgs:
  eval_data_files: null
  eval_datasets: null
  eval_optional_args: []
  eval_size: null
  eval_split: null
  eval_subset: null
  eval_template: null
  ptx_data_files: ti2ti_ptx_27k.pt
  ptx_datasets: null
  ptx_optional_args: []
  ptx_size: null
  ptx_split: null
  ptx_subset: null
  ptx_template: spavl_ti2ti
  train_data_files: ti2ti_llf_prompt_only_tokenize.pt
  train_datasets: /data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs
  train_optional_args: []
  train_size: 5000
  train_split: null
  train_subset: null
  train_template: spavl_ti2ti
logger_cfgs:
  cache_dir: null
  log_project: align-anything
  log_run_name: ppo
  log_type: wandb
  output_dir: ../outputs/ppo_ti2ti_baseline_1025_with_eval
  save_interval: 30.0
lora_cfgs:
  inference_mode: false
  lora_alpha: 16
  lora_dropout: 0.1
  r: 16
  save_full_model: true
  target_modules:
  - q_proj
  - v_proj
  task_type: TaskType.CAUSAL_LM
  use_lora: false
model_cfgs:
  actor_model_name_or_path: /data/align-anything/hantao/models/0916_ti_to_ti_sft
  model_max_length: 2048
  repetition_penalty: 1.0
  reward_critic_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400
  reward_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400
  temperature: 1.0
  top_p: 1.0
  trust_remote_code: true
special_tokens: null
train_cfgs:
  actor_gradient_checkpointing: true
  actor_lr: 1.0e-05
  actor_lr_scheduler_type: cosine
  actor_lr_warmup_ratio: 0.03
  actor_weight_decay: 0.01
  adam_betas:
  - 0.9
  - 0.95
  bf16: true
  clip_range_ratio: 0.2
  clip_range_score: 50.0
  clip_range_value: 5.0
  critic_gradient_checkpointing: true
  critic_lr: 5.0e-06
  critic_lr_scheduler_type: constant
  critic_lr_warmup_ratio: 0.03
  critic_weight_decay: 0.0
  ds_cfgs: ds_z3_config.json
  epochs: 3
  eval_interval: 10
  eval_strategy: epoch
  fp16: false
  freeze_language_model: true
  freeze_mm_proj: true
  freeze_vision_tower: false
  gae_lambda: 0.95
  gamma: 1.0
  gradient_accumulation_steps: 2
  kl_coeff: 0.02
  normalize_reward: false
  per_device_eval_batch_size: 8
  per_device_prompt_batch_size: 8
  per_device_train_batch_size: 8
  ptx_coeff: 16.0
  seed: 42
  update_iters: 1
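arguments.yaml is the resolved trainer configuration for this PPO run (actor/critic/reward model paths, data files, DeepSpeed ZeRO-3, batch sizes). Below is a minimal sketch, not part of the repository, of reading it with PyYAML and deriving the effective global batch size; the world size of 8 is taken from WORLD_SIZE=8 in environ.txt and is an assumption about how the run was launched.

```python
# Hypothetical helper: inspect arguments.yaml and compute the implied global batch size.
import yaml  # PyYAML, pinned in requirements.txt

with open("arguments.yaml") as f:
    cfg = yaml.safe_load(f)

train = cfg["train_cfgs"]
world_size = 8  # assumption: one rank per GPU, as recorded in environ.txt
global_batch = (
    train["per_device_train_batch_size"]    # 8
    * train["gradient_accumulation_steps"]  # 2
    * world_size                            # 8
)
print(cfg["model_cfgs"]["actor_model_name_or_path"])
print("global train batch size:", global_batch)  # 8 * 2 * 8 = 128
```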
config.json
ADDED
The diff for this file is too large to render.
environ.txt
ADDED
@@ -0,0 +1,56 @@
CONDA_DEFAULT_ENV=hantao_cham
CONDA_EXE=/home/align-anything/miniconda3/bin/conda
CONDA_PREFIX=/home/align-anything/miniconda3/envs/hantao_cham
CONDA_PREFIX_1=/home/align-anything/miniconda3
CONDA_PROMPT_MODIFIER=(hantao_cham)
CONDA_PYTHON_EXE=/home/align-anything/miniconda3/bin/python
CONDA_SHLVL=2
CRASHDIR=/etc/ShellCrash
CROSS_RANK=0
CROSS_SIZE=1
CUDA_MODULE_LOADING=LAZY
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/2000/bus
HOME=/home/align-anything
LANG=en_US.UTF-8
LD_LIBRARY_PATH=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/../../lib64:
LESSCLOSE=/usr/bin/lesspipe %s %s
LESSOPEN=| /usr/bin/lesspipe %s
LOCAL_RANK=0
LOCAL_SIZE=8
LOGLEVEL=WARNING
LOGNAME=align-anything
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
MASTER_ADDR=127.0.0.1
MASTER_PORT=63303
MOTD_SHOWN=pam
OLDPWD=/data/align-anything/hantao/align-anything/projects/text_image_to_text_image
PATH=/home/align-anything/miniconda3/envs/hantao_cham/bin:/home/align-anything/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
PWD=/data/align-anything/hantao/align-anything/scripts
PYGAME_HIDE_SUPPORT_PROMPT=1
PYTHONHASHSEED=42
PYTHONPATH=/data/align-anything/hantao/align-anything
QT_QPA_FONTDIR=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/qt/fonts
QT_QPA_PLATFORM_PLUGIN_PATH=/home/align-anything/miniconda3/envs/hantao_cham/lib/python3.11/site-packages/cv2/qt/plugins
RANK=0
SHELL=/bin/bash
SHLVL=3
SSH_CLIENT=111.205.230.212 28724 30500
SSH_CONNECTION=111.205.230.212 62683 10.10.212.195 30500
SSH_TTY=/dev/pts/2
TERM=screen
TMUX=/tmp/tmux-2000/default,90929,6
TMUX_PANE=%6
USER=align-anything
WANDB_API_KEY=7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33
WANDB_MODE=online
WANDB_SERVICE=2-937440-tcp-localhost-44607
WORLD_SIZE=8
XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
XDG_RUNTIME_DIR=/run/user/2000
XDG_SESSION_CLASS=user
XDG_SESSION_ID=4
XDG_SESSION_TYPE=tty
_=/home/align-anything/miniconda3/envs/hantao_cham/bin/deepspeed
_CE_CONDA=
_CE_M=
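Besides the conda and wandb variables, environ.txt captures the distributed-launch state for rank 0 (RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR/MASTER_PORT, CUDA_VISIBLE_DEVICES). A hedged sketch, not taken from align-anything, of how a launcher-spawned worker typically consumes these variables:

```python
# Illustrative only: how a worker process usually reads the variables dumped above.
import os
import torch
import torch.distributed as dist

local_rank = int(os.environ["LOCAL_RANK"])  # 0 in this dump; deepspeed sets one per GPU
torch.cuda.set_device(local_rank)

# MASTER_ADDR/MASTER_PORT (127.0.0.1:63303 above) tell every rank where rank 0 listens;
# init_process_group's default env:// init method picks them up automatically.
dist.init_process_group(
    backend="nccl",
    rank=int(os.environ["RANK"]),
    world_size=int(os.environ["WORLD_SIZE"]),
)
```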
preprocessor_config.json
ADDED
@@ -0,0 +1,28 @@
{
  "crop_size": {
    "height": 512,
    "width": 512
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    1.0,
    1.0,
    1.0
  ],
  "image_processor_type": "ChameleonImageProcessor",
  "image_std": [
    1.0,
    1.0,
    1.0
  ],
  "processor_class": "ChameleonProcessor",
  "resample": 1,
  "rescale_factor": 0.0078,
  "size": {
    "shortest_edge": 512
  }
}
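With rescale_factor 0.0078 (roughly 1/128) and per-channel mean and std of 1.0, this preprocessor maps uint8 pixels into approximately [-1, 1): x -> x * 0.0078 - 1. A small sketch of that arithmetic only (illustrative; the real ChameleonImageProcessor additionally resizes to shortest_edge 512, center-crops to 512x512, and converts to RGB):

```python
# Rescale + normalize arithmetic implied by preprocessor_config.json (illustrative only).
import numpy as np

pixels = np.array([0.0, 128.0, 255.0])
rescaled = pixels * 0.0078            # do_rescale with rescale_factor = 0.0078
normalized = (rescaled - 1.0) / 1.0   # do_normalize with image_mean = image_std = 1.0
print(normalized)                     # [-1.     -0.0016  0.989 ]
```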
processor_config.json
ADDED
@@ -0,0 +1,5 @@
{
  "image_seq_length": 1024,
  "image_token": "<image>",
  "processor_class": "ChameleonProcessor"
}
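processor_config.json pairs the image processor above with the tokenizer under a single ChameleonProcessor, reserving a 1024-token sequence per image behind the <image> placeholder. A hedged loading sketch (assumes "." is a local clone of this repository and a transformers build with Chameleon support, as pinned in requirements.txt):

```python
# Assumption: "." is a local checkout of this repository.
import json
from transformers import AutoProcessor

with open("processor_config.json") as f:
    print(json.load(f))  # {'image_seq_length': 1024, 'image_token': '<image>', 'processor_class': 'ChameleonProcessor'}

processor = AutoProcessor.from_pretrained(".")  # should resolve to ChameleonProcessor
print(type(processor).__name__)
```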
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4c201d7f9317f729765675c0940a46cd4b1675dd9ba6d5b9b5da3cdafb564faa
size 14165009930
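pytorch_model.bin is tracked with Git LFS, so the diff shows only the pointer file: the actual ~14.2 GB weight file is addressed by the sha256 oid above. A hedged sketch of verifying a fully downloaded copy against that oid:

```python
# Assumption: pytorch_model.bin has been materialized locally (e.g. via `git lfs pull`).
import hashlib

EXPECTED_OID = "4c201d7f9317f729765675c0940a46cd4b1675dd9ba6d5b9b5da3cdafb564faa"

sha = hashlib.sha256()
with open("pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        sha.update(chunk)
print(sha.hexdigest() == EXPECTED_OID)
```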
script.sh
ADDED
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
#
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Initialize variables
# For wandb online logging
export WANDB_API_KEY="7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33"
# Source the setup script
# source ./setup.sh

export WANDB_MODE=online

ACTOR_MODEL_NAME_OR_PATH="/data/align-anything/hantao/models/0916_ti_to_ti_sft"
CRITIC_MODEL_NAME_OR_PATH="/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400"
REWARD_MODEL_NAME_OR_PATH="/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400"
TRAIN_DATASETS="/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs"
PTX_DATASETS="/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs"
OUTPUT_DIR="../outputs/ppo_ti2ti_baseline_1025_with_eval"

# Source the setup script
source ./setup.sh

# Execute deepspeed command
deepspeed \
  --master_port ${MASTER_PORT} \
  --module align_anything.trainers.text_image_to_text_image.ppo \
  --actor_model_name_or_path ${ACTOR_MODEL_NAME_OR_PATH} \
  --reward_model_name_or_path ${REWARD_MODEL_NAME_OR_PATH} \
  --reward_critic_model_name_or_path ${CRITIC_MODEL_NAME_OR_PATH} \
  --train_datasets ${TRAIN_DATASETS} \
  --train_template spavl_ti2ti \
  --train_data_files ti2ti_llf_prompt_only_tokenize.pt \
  --ptx_template spavl_ti2ti \
  --ptx_data_files ti2ti_ptx_27k.pt \
  --output_dir ${OUTPUT_DIR} \
  --save_interval 30
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "<reserved08706>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
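special_tokens_map.json keeps the Llama-style <s>/</s>/<unk> tokens and adds a <pad> token plus a reserved separator. A hedged sketch of checking that the tokenizer shipped in this folder resolves them (assumes "." is a local clone):

```python
# Assumption: "." is a local checkout containing tokenizer.json and this map.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
print(tok.bos_token, tok.eos_token, tok.unk_token)  # <s> </s> <unk>
print(tok.pad_token, tok.sep_token)                 # <pad> <reserved08706>
```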
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
The diff for this file is too large to render.
wandb/debug-internal.log
ADDED
@@ -0,0 +1,20 @@
{"time":"2024-10-25T18:15:18.987617848Z","level":"INFO","msg":"using version","core version":"0.18.3"}
{"time":"2024-10-25T18:15:18.987649473Z","level":"INFO","msg":"created symlink","path":"../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug-core.log"}
{"time":"2024-10-25T18:15:18.991400712Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
{"time":"2024-10-25T18:15:19.015335603Z","level":"INFO","msg":"created new stream","id":"qbvp2oju"}
{"time":"2024-10-25T18:15:19.015397376Z","level":"INFO","msg":"stream: started","id":"qbvp2oju"}
{"time":"2024-10-25T18:15:19.015408377Z","level":"INFO","msg":"handler: started","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T18:15:19.015432033Z","level":"INFO","msg":"sender: started","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T18:15:19.015437112Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T18:15:20.634593869Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
{"time":"2024-10-25T18:15:20.637814914Z","level":"INFO","msg":"Starting system monitor"}
{"time":"2024-10-25T19:38:13.593466266Z","level":"INFO","msg":"Stopping system monitor"}
{"time":"2024-10-25T19:38:13.627014655Z","level":"INFO","msg":"Stopped system monitor"}
{"time":"2024-10-25T19:38:14.559855674Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
{"time":"2024-10-25T19:38:14.559906183Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
{"time":"2024-10-25T19:38:15.545457735Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
{"time":"2024-10-25T19:38:17.12240115Z","level":"INFO","msg":"stream: closing","id":"qbvp2oju"}
{"time":"2024-10-25T19:38:17.12243525Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T19:38:17.122460489Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T19:38:17.122575437Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T19:38:17.124870943Z","level":"INFO","msg":"stream: closed","id":"qbvp2oju"}
wandb/debug.log
ADDED
@@ -0,0 +1,33 @@
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Configure stats pid to 937440
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
2024-10-25 18:15:18,977 WARNING MainThread:937440 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.ppo
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.ppo'}
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Applying login settings: {}
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_init.py:_log_setup():532] Logging user logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug.log
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:_log_setup():533] Logging internal logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug-internal.log
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():617] calling init triggers
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_prompt_batch_size': 8, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 2, 'actor_gradient_checkpointing': True, 'critic_gradient_checkpointing': True, 'actor_lr': 1e-05, 'actor_lr_scheduler_type': 'cosine', 'actor_lr_warmup_ratio': 0.03, 'actor_weight_decay': 0.01, 'critic_lr': 5e-06, 'critic_lr_scheduler_type': 'constant', 'critic_lr_warmup_ratio': 0.03, 'critic_weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'kl_coeff': 0.02, 'clip_range_ratio': 0.2, 'clip_range_score': 50.0, 'clip_range_value': 5.0, 'ptx_coeff': 16.0, 'gamma': 1.0, 'gae_lambda': 0.95, 'normalize_reward': False, 'update_iters': 1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs', 'train_template': 'spavl_ti2ti', 'train_size': 5000, 'train_split': None, 'train_subset': None, 'train_data_files': 'ti2ti_llf_prompt_only_tokenize.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': [], 'ptx_datasets': None, 'ptx_template': 'spavl_ti2ti', 'ptx_size': None, 'ptx_subset': None, 'ptx_split': None, 'ptx_data_files': 'ti2ti_ptx_27k.pt', 'ptx_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'ppo', 'output_dir': '../outputs/ppo_ti2ti_baseline_1025_with_eval', 'cache_dir': None, 'save_interval': 30.0}, 'model_cfgs': {'actor_model_name_or_path': '/data/align-anything/hantao/models/0916_ti_to_ti_sft', 'reward_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'reward_critic_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'trust_remote_code': True, 'model_max_length': 2048, 'temperature': 1.0, 'top_p': 1.0, 'repetition_penalty': 1.0}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': None}
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():667] starting backend
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():671] sending inform_init request
2024-10-25 18:15:18,982 INFO MainThread:937440 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2024-10-25 18:15:18,983 INFO MainThread:937440 [wandb_init.py:init():684] backend started and connected
2024-10-25 18:15:18,986 INFO MainThread:937440 [wandb_init.py:init():779] updated telemetry
2024-10-25 18:15:18,996 INFO MainThread:937440 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
2024-10-25 18:15:20,628 INFO MainThread:937440 [wandb_init.py:init():863] starting run threads in backend
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_console_start():2465] atexit reg
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2313] redirect: wrap_raw
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2378] Wrapping output streams.
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2403] Redirects installed.
2024-10-25 18:15:20,776 INFO MainThread:937440 [wandb_init.py:init():907] run started, returning control to user process
2024-10-25 19:38:13,587 INFO MainThread:937440 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/qbvp2oju
2024-10-25 19:38:13,590 INFO MainThread:937440 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
2024-10-25 19:38:13,591 INFO MainThread:937440 [wandb_run.py:_restore():2410] restore
2024-10-25 19:38:13,592 INFO MainThread:937440 [wandb_run.py:_restore():2416] restore done
2024-10-25 19:38:17,104 INFO MainThread:937440 [wandb_run.py:_footer_history_summary_info():4049] rendering history
2024-10-25 19:38:17,107 INFO MainThread:937440 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
2024-10-25 19:38:17,119 INFO MainThread:937440 [wandb_run.py:_footer_sync_info():4008] logging synced files
wandb/run-20241025_180620-eoegk43l/files/output.log
ADDED
File without changes
wandb/run-20241025_180620-eoegk43l/files/requirements.txt
ADDED
@@ -0,0 +1,233 @@
align-anything==0.0.1.dev0
torch==2.4.0
pycparser==2.22
torchvision==0.19.0
multiprocess==0.70.16
braceexpand==0.1.7
lm-format-enforcer==0.10.6
Jinja2==3.1.4
scikit-learn==1.5.2
interegular==0.3.3
starlette==0.38.6
huggingface-hub==0.25.2
pyairports==2.1.1
protobuf==3.20.3
term-image==0.7.2
python-dateutil==2.9.0.post0
identify==2.6.1
tokenizers==0.19.1
tensorboard-data-server==0.7.2
numba==0.60.0
ninja==1.11.1.1
nvidia-cuda-cupti-cu12==12.1.105
diskcache==5.6.3
pycountry==24.6.1
py-cpuinfo==9.0.0
scipy==1.14.1
soxr==0.5.0.post1
prometheus-fastapi-instrumentator==7.0.0
align-anything==0.0.1.dev0
virtualenv==20.26.6
hjson==3.1.0
nvidia-cudnn-cu12==9.1.0.70
termcolor==2.5.0
grpcio==1.66.2
wheel==0.44.0
torchlibrosa==0.1.0
numpy==1.26.4
msgpack==1.1.0
rpds-py==0.20.0
annotated-types==0.7.0
pre_commit==4.0.1
aiohttp==3.10.10
audioread==3.0.1
lazy_loader==0.4
nvidia-cuda-runtime-cu12==12.1.105
filelock==3.16.1
timm==0.6.13
anyio==4.6.0
pydantic_core==2.23.4
idna==3.10
fastapi==0.115.0
wandb==0.18.3
packaging==24.1
yt-dlp==2024.8.6
matplotlib==3.9.2
websockets==12.0
triton==3.0.0
zipp==3.20.2
requests==2.32.3
xxhash==3.5.0
image-reward==1.5
pytorch-fid==0.3.0
imageio-ffmpeg==0.5.1
args==0.1.0
llvmlite==0.43.0
peft==0.13.2
openai==1.51.2
httpx==0.27.2
nvidia-cublas-cu12==12.1.3.1
pytest-split==0.8.0
ruff==0.6.9
sniffio==1.3.1
yarl==1.15.0
pandas==2.2.3
fsspec==2024.6.1
gguf==0.10.0
diffusers==0.30.3
platformdirs==4.3.6
nvidia-cuda-nvrtc-cu12==12.1.105
imageio==2.35.1
Brotli==1.1.0
bitsandbytes==0.44.1
hpsv2==1.2.0
lark==1.2.2
gradio==5.0.2
pydantic==2.9.2
pytz==2024.2
jsonschema-specifications==2024.10.1
deepspeed==0.15.2
cloudpickle==3.1.0
distro==1.9.0
aiohappyeyeballs==2.4.3
Markdown==3.7
docker-pycreds==0.4.0
semantic-version==2.10.0
resampy==0.4.3
urllib3==2.2.3
nodeenv==1.9.1
click==8.1.7
accelerate==1.0.1
dill==0.3.8
setproctitle==1.3.3
httpcore==1.0.6
pooch==1.8.2
importlib_metadata==8.5.0
cfgv==3.4.0
einops==0.8.0
shellingham==1.5.4
pytest==7.2.0
python-dotenv==1.0.1
pydub==0.25.1
kiwisolver==1.4.7
aiofiles==23.2.1
vllm==0.6.2
Werkzeug==3.0.4
tensorboard==2.18.0
joblib==1.4.2
pycryptodomex==3.21.0
moviepy==1.0.3
typing_extensions==4.12.2
mdurl==0.1.2
mistral_common==1.4.4
rich==13.9.2
aiosignal==1.3.1
mmsg==0.1.dev20+g585c63a.d20241012
pillow==10.4.0
prometheus_client==0.21.0
nvidia-cusolver-cu12==11.4.5.107
typer==0.12.5
pyzmq==26.2.0
h11==0.14.0
gitdb==4.0.11
transformers==4.44.0.dev0
nvidia-nccl-cu12==2.20.5
jsonschema==4.23.0
soundfile==0.12.1
contourpy==1.3.0
mutagen==1.47.0
regex==2024.9.11
orjson==3.10.7
fairscale==0.4.13
partial-json-parser==0.2.1.1.post4
outlines==0.1.1.dev4+ga2fd35c
nvidia-curand-cu12==10.3.2.106
pluggy==1.5.0
GitPython==3.1.43
tzdata==2024.2
uvicorn==0.31.1
sentencepiece==0.2.0
decorator==4.4.2
nvidia-nvjitlink-cu12==12.6.77
distlib==0.3.9
uvloop==0.20.0
networkx==3.4.1
wcwidth==0.2.13
opencv-python==4.6.0.66
six==1.16.0
httptools==0.6.1
safetensors==0.4.5
nvidia-nvtx-cu12==12.1.105
markdown-it-py==3.0.0
certifi==2024.8.30
sentry-sdk==2.16.0
outlines_core==0.1.0
threadpoolctl==3.5.0
nvidia-cufft-cu12==11.0.2.54
datasets==3.0.1
cycler==0.12.1
psutil==6.0.0
nvidia-cusparse-cu12==12.1.0.106
shortuuid==1.0.13
ffmpy==0.4.0
xformers==0.0.27.post2
MarkupSafe==2.1.5
tqdm==4.66.5
gradio_client==1.4.0
attrs==24.2.0
optree==0.13.0
PyYAML==6.0.2
clint==0.5.1
torchaudio==2.4.0
frechet-audio-distance==0.1.2
frozenlist==1.4.1
clip==0.2.0
multidict==6.1.0
propcache==0.2.0
librosa==0.10.2.post1
webdataset==0.2.100
ray==2.37.0
pyparsing==3.1.4
pyarrow==17.0.0
tiktoken==0.7.0
watchfiles==0.24.0
proglog==0.1.10
cachetools==5.5.0
fonttools==4.54.1
charset-normalizer==3.4.0
ftfy==6.3.0
referencing==0.35.1
mpmath==1.3.0
msgspec==0.18.6
nvidia-ml-py==12.535.161
smmap==5.0.1
absl-py==2.1.0
python-multipart==0.0.12
Pygments==2.18.0
iniconfig==2.0.0
sympy==1.13.3
pip==24.2
airportsdata==20241001
tomlkit==0.12.0
nest-asyncio==1.6.0
setuptools==75.1.0
jiter==0.6.1
cffi==1.17.1
nvitop==1.3.2
backports.tarfile==1.2.0
zipp==3.19.2
inflect==7.3.1
autocommand==2.2.2
importlib_resources==6.4.0
packaging==24.1
jaraco.context==5.3.0
typeguard==4.3.0
more-itertools==10.3.0
jaraco.text==3.12.1
platformdirs==4.2.2
wheel==0.43.0
typing_extensions==4.12.2
importlib_metadata==8.0.0
tomli==2.0.1
jaraco.collections==5.1.0
jaraco.functools==4.0.1
wandb/run-20241025_180620-eoegk43l/files/wandb-metadata.json
ADDED
@@ -0,0 +1,106 @@
{
  "os": "Linux-5.4.0-198-generic-x86_64-with-glibc2.31",
  "python": "3.11.10",
  "startedAt": "2024-10-25T18:06:20.375892Z",
  "args": [
    "--local_rank=0",
    "--actor_model_name_or_path",
    "/data/align-anything/hantao/models/0916_ti_to_ti_sft",
    "--reward_model_name_or_path",
    "/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400",
    "--reward_critic_model_name_or_path",
    "/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400",
    "--train_datasets",
    "/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs",
    "--train_template",
    "spavl_ti2ti",
    "--train_data_files",
    "ti2ti_preference_filtered_tokenize_full.pt",
    "--ptx_template",
    "spavl_ti2ti",
    "--ptx_data_files",
    "ti2ti_ptx_27k.pt",
    "--output_dir",
    "../outputs/ppo_ti2ti_baseline_1025_with_eval",
    "--save_interval",
    "30"
  ],
  "program": "-m align_anything.trainers.text_image_to_text_image.ppo",
  "git": {
    "remote": "https://github.com/PKU-Alignment/align-anything.git",
    "commit": "6fde660afc9985323f147930eedf188a5699adc7"
  },
  "email": "[email protected]",
  "root": "../outputs/ppo_ti2ti_baseline_1025_with_eval",
  "host": "lyg0195",
  "username": "align-anything",
  "executable": "/home/align-anything/miniconda3/envs/hantao_cham/bin/python",
  "cpu_count": 64,
  "cpu_count_logical": 128,
  "gpu": "[NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB]",
  "gpu_count": 8,
  "disk": {
    "/": {
      "total": "938421047296",
      "used": "363102785536"
    }
  },
  "memory": {
    "total": "540647575552"
  },
  "cpu": {
    "count": 64,
    "countLogical": 128
  },
  "gpu_nvidia": [
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    }
  ],
  "cudaVersion": "12.4"
}
wandb/run-20241025_180620-eoegk43l/logs/debug-internal.log
ADDED
@@ -0,0 +1,10 @@
{"time":"2024-10-25T18:06:20.381500476Z","level":"INFO","msg":"using version","core version":"0.18.3"}
{"time":"2024-10-25T18:06:20.381537039Z","level":"INFO","msg":"created symlink","path":"../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_180620-eoegk43l/logs/debug-core.log"}
{"time":"2024-10-25T18:06:20.386238205Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
{"time":"2024-10-25T18:06:20.408793678Z","level":"INFO","msg":"created new stream","id":"eoegk43l"}
{"time":"2024-10-25T18:06:20.408868821Z","level":"INFO","msg":"stream: started","id":"eoegk43l"}
{"time":"2024-10-25T18:06:20.408926558Z","level":"INFO","msg":"sender: started","stream_id":{"value":"eoegk43l"}}
{"time":"2024-10-25T18:06:20.408909461Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"eoegk43l"}}
{"time":"2024-10-25T18:06:20.408921169Z","level":"INFO","msg":"handler: started","stream_id":{"value":"eoegk43l"}}
{"time":"2024-10-25T18:06:21.029852323Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
{"time":"2024-10-25T18:06:21.033290741Z","level":"INFO","msg":"Starting system monitor"}
wandb/run-20241025_180620-eoegk43l/logs/debug.log
ADDED
@@ -0,0 +1,26 @@
2024-10-25 18:06:20,368 INFO MainThread:935352 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2024-10-25 18:06:20,368 INFO MainThread:935352 [wandb_setup.py:_flush():79] Configure stats pid to 935352
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
2024-10-25 18:06:20,369 WARNING MainThread:935352 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.ppo
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.ppo'}
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_setup.py:_flush():79] Applying login settings: {}
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_init.py:_log_setup():532] Logging user logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_180620-eoegk43l/logs/debug.log
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_init.py:_log_setup():533] Logging internal logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_180620-eoegk43l/logs/debug-internal.log
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_init.py:init():617] calling init triggers
2024-10-25 18:06:20,369 INFO MainThread:935352 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_prompt_batch_size': 8, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 2, 'actor_gradient_checkpointing': True, 'critic_gradient_checkpointing': True, 'actor_lr': 1e-05, 'actor_lr_scheduler_type': 'cosine', 'actor_lr_warmup_ratio': 0.03, 'actor_weight_decay': 0.01, 'critic_lr': 5e-06, 'critic_lr_scheduler_type': 'constant', 'critic_lr_warmup_ratio': 0.03, 'critic_weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'kl_coeff': 0.02, 'clip_range_ratio': 0.2, 'clip_range_score': 50.0, 'clip_range_value': 5.0, 'ptx_coeff': 16.0, 'gamma': 1.0, 'gae_lambda': 0.95, 'normalize_reward': False, 'update_iters': 1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs', 'train_template': 'spavl_ti2ti', 'train_size': 5000, 'train_split': None, 'train_subset': None, 'train_data_files': 'ti2ti_preference_filtered_tokenize_full.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': [], 'ptx_datasets': None, 'ptx_template': 'spavl_ti2ti', 'ptx_size': None, 'ptx_subset': None, 'ptx_split': None, 'ptx_data_files': 'ti2ti_ptx_27k.pt', 'ptx_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'ppo', 'output_dir': '../outputs/ppo_ti2ti_baseline_1025_with_eval', 'cache_dir': None, 'save_interval': 30.0}, 'model_cfgs': {'actor_model_name_or_path': '/data/align-anything/hantao/models/0916_ti_to_ti_sft', 'reward_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'reward_critic_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'trust_remote_code': True, 'model_max_length': 2048, 'temperature': 1.0, 'top_p': 1.0, 'repetition_penalty': 1.0}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': None}
2024-10-25 18:06:20,370 INFO MainThread:935352 [wandb_init.py:init():667] starting backend
2024-10-25 18:06:20,370 INFO MainThread:935352 [wandb_init.py:init():671] sending inform_init request
2024-10-25 18:06:20,374 INFO MainThread:935352 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2024-10-25 18:06:20,375 INFO MainThread:935352 [wandb_init.py:init():684] backend started and connected
2024-10-25 18:06:20,379 INFO MainThread:935352 [wandb_init.py:init():779] updated telemetry
2024-10-25 18:06:20,389 INFO MainThread:935352 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
2024-10-25 18:06:21,024 INFO MainThread:935352 [wandb_init.py:init():863] starting run threads in backend
2024-10-25 18:06:21,186 INFO MainThread:935352 [wandb_run.py:_console_start():2465] atexit reg
2024-10-25 18:06:21,186 INFO MainThread:935352 [wandb_run.py:_redirect():2313] redirect: wrap_raw
2024-10-25 18:06:21,186 INFO MainThread:935352 [wandb_run.py:_redirect():2378] Wrapping output streams.
2024-10-25 18:06:21,186 INFO MainThread:935352 [wandb_run.py:_redirect():2403] Redirects installed.
2024-10-25 18:06:21,189 INFO MainThread:935352 [wandb_init.py:init():907] run started, returning control to user process
wandb/run-20241025_180620-eoegk43l/run-eoegk43l.wandb
ADDED
File without changes
wandb/run-20241025_181518-qbvp2oju/files/config.yaml
ADDED
@@ -0,0 +1,143 @@
_wandb:
  value:
    cli_version: 0.18.3
    m: []
    python_version: 3.11.10
    t:
      "1":
      - 1
      - 11
      - 41
      - 49
      - 51
      - 55
      - 71
      - 83
      - 98
      - 105
      "2":
      - 1
      - 11
      - 41
      - 49
      - 51
      - 55
      - 71
      - 83
      - 98
      - 105
      "3":
      - 2
      - 13
      - 16
      - 23
      - 55
      - 61
      "4": 3.11.10
      "5": 0.18.3
      "6": 4.44.0.dev0
      "8":
      - 5
      "12": 0.18.3
      "13": linux-x86_64
bnb_cfgs:
  value:
    bnb_4bit_compute_dtype: float16
    bnb_4bit_quant_type: nf4
    bnb_4bit_use_double_quant: true
    load_in_4bit: true
    load_in_8bit: false
    use_bnb: false
data_cfgs:
  value:
    eval_data_files: null
    eval_datasets: null
    eval_optional_args: []
    eval_size: null
    eval_split: null
    eval_subset: null
    eval_template: null
    ptx_data_files: ti2ti_ptx_27k.pt
    ptx_datasets: null
    ptx_optional_args: []
    ptx_size: null
    ptx_split: null
    ptx_subset: null
    ptx_template: spavl_ti2ti
    train_data_files: ti2ti_llf_prompt_only_tokenize.pt
    train_datasets: /data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs
    train_optional_args: []
    train_size: 5000
    train_split: null
    train_subset: null
    train_template: spavl_ti2ti
logger_cfgs:
  value:
    cache_dir: null
    log_project: align-anything
    log_run_name: ppo
    log_type: wandb
    output_dir: ../outputs/ppo_ti2ti_baseline_1025_with_eval
    save_interval: 30
lora_cfgs:
  value:
    inference_mode: false
    lora_alpha: 16
    lora_dropout: 0.1
    r: 16
    save_full_model: true
    target_modules:
    - q_proj
    - v_proj
    task_type: TaskType.CAUSAL_LM
    use_lora: false
model_cfgs:
  value:
    actor_model_name_or_path: /data/align-anything/hantao/models/0916_ti_to_ti_sft
    model_max_length: 2048
    repetition_penalty: 1
    reward_critic_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400
    reward_model_name_or_path: /data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400
    temperature: 1
    top_p: 1
    trust_remote_code: true
special_tokens:
  value: null
train_cfgs:
  value:
    actor_gradient_checkpointing: true
    actor_lr: 1e-05
    actor_lr_scheduler_type: cosine
    actor_lr_warmup_ratio: 0.03
    actor_weight_decay: 0.01
    adam_betas:
    - 0.9
    - 0.95
    bf16: true
    clip_range_ratio: 0.2
    clip_range_score: 50
    clip_range_value: 5
    critic_gradient_checkpointing: true
    critic_lr: 5e-06
    critic_lr_scheduler_type: constant
    critic_lr_warmup_ratio: 0.03
    critic_weight_decay: 0
    ds_cfgs: ds_z3_config.json
    epochs: 3
    eval_interval: 10
    eval_strategy: epoch
    fp16: false
    freeze_language_model: true
    freeze_mm_proj: true
    freeze_vision_tower: false
    gae_lambda: 0.95
    gamma: 1
    gradient_accumulation_steps: 2
    kl_coeff: 0.02
    normalize_reward: false
    per_device_eval_batch_size: 8
    per_device_prompt_batch_size: 8
    per_device_train_batch_size: 8
    ptx_coeff: 16
    seed: 42
    update_iters: 1
wandb/run-20241025_181518-qbvp2oju/files/output.log
ADDED
@@ -0,0 +1,307 @@
***** Running training *****
Training 1/3 epoch: 0%| | 0/237 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
[2024-10-25 18:23:22,854] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:23:27,196] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:27:32,436] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:27:37,446] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:32:36,133] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:32:41,569] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:36:08,160] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:36:12,414] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:39:02,940] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:39:07,161] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:41:58,177] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:42:02,629] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:43:57,357] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 18:44:01,125] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
18 |
+
[2024-10-25 18:45:40,196] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
19 |
+
[2024-10-25 18:45:44,071] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
20 |
+
[2024-10-25 18:47:40,184] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
21 |
+
[2024-10-25 18:47:40,185] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[9.908858470377793e-06, 9.908858470377793e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
22 |
+
[2024-10-25 18:47:40,185] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=14.760689632781455, CurrSamplesPerSec=16.90950583092757, MemAllocated=33.11GB, MaxMemAllocated=47.22GB
|
23 |
+
[2024-10-25 18:47:43,802] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
24 |
+
[2024-10-25 18:47:43,802] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
25 |
+
[2024-10-25 18:47:43,803] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=15.436775313806873, CurrSamplesPerSec=17.42788205848213, MemAllocated=33.11GB, MaxMemAllocated=47.22GB
|
26 |
+
[2024-10-25 18:48:48,658] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
27 |
+
[2024-10-25 18:48:52,182] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
28 |
+
[2024-10-25 18:49:14,049] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
29 |
+
[2024-10-25 18:49:17,288] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
30 |
+
[2024-10-25 18:49:38,002] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
31 |
+
[2024-10-25 18:49:41,318] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
32 |
+
[2024-10-25 18:50:04,280] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
33 |
+
[2024-10-25 18:50:07,681] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
34 |
+
Saving checkpoint at step 30 ...
|
35 |
+
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
|
36 |
+
Saving 16-bit model...
|
37 |
+
[2024-10-25 18:50:21,078] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step15 is about to be saved!
|
38 |
+
[2024-10-25 18:50:21,079] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin, tag: global_step15
|
39 |
+
[2024-10-25 18:50:21,079] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin...
|
40 |
+
[2024-10-25 18:50:36,754] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin.
|
41 |
+
[2024-10-25 18:50:36,755] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step15 is ready now!
|
42 |
+
Model saved!
|
43 |
+
Saving 16-bit model...
|
44 |
+
[2024-10-25 18:50:47,513] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step15 is about to be saved!
|
45 |
+
[2024-10-25 18:50:47,514] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin, tag: global_step15
|
46 |
+
[2024-10-25 18:50:47,514] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin...
|
47 |
+
[2024-10-25 18:51:06,021] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_30.bin.
|
48 |
+
[2024-10-25 18:51:06,021] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step15 is ready now!
|
49 |
+
Model saved!
|
50 |
+
Model saved!
|
51 |
+
Checkpoint saved.
|
52 |
+
[2024-10-25 18:51:27,052] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
53 |
+
[2024-10-25 18:51:30,347] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
54 |
+
[2024-10-25 18:51:50,999] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
55 |
+
[2024-10-25 18:51:54,300] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
56 |
+
[2024-10-25 18:52:19,971] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
57 |
+
[2024-10-25 18:52:23,379] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
58 |
+
[2024-10-25 18:52:45,642] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
59 |
+
[2024-10-25 18:52:48,945] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
60 |
+
[2024-10-25 18:53:09,437] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
61 |
+
[2024-10-25 18:53:09,438] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[9.470431355738257e-06, 9.470431355738257e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
62 |
+
[2024-10-25 18:53:09,438] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=16.735259629101844, CurrSamplesPerSec=18.7639256774801, MemAllocated=33.18GB, MaxMemAllocated=47.22GB
|
63 |
+
[2024-10-25 18:53:12,725] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
64 |
+
[2024-10-25 18:53:12,725] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
65 |
+
[2024-10-25 18:53:12,726] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=17.458411197853028, CurrSamplesPerSec=19.786118059003716, MemAllocated=33.18GB, MaxMemAllocated=47.22GB
|
66 |
+
[2024-10-25 18:54:19,714] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
67 |
+
[2024-10-25 18:54:23,021] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
68 |
+
[2024-10-25 18:55:07,181] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
69 |
+
[2024-10-25 18:55:10,462] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
70 |
+
[2024-10-25 18:55:30,696] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
71 |
+
[2024-10-25 18:55:34,030] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
72 |
+
[2024-10-25 18:55:54,073] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
73 |
+
[2024-10-25 18:55:57,348] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
74 |
+
[2024-10-25 18:57:04,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[8.70045279830626e-06, 8.70045279830626e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
75 |
+
[2024-10-25 18:57:04,749] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=17.50081132260227, CurrSamplesPerSec=19.186032450205442, MemAllocated=33.15GB, MaxMemAllocated=47.22GB
|
76 |
+
[2024-10-25 18:57:08,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
77 |
+
[2024-10-25 18:57:08,051] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=18.204578219677924, CurrSamplesPerSec=19.780692782398315, MemAllocated=33.15GB, MaxMemAllocated=47.22GB
|
78 |
+
Saving checkpoint at step 60 ...
|
79 |
+
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
|
80 |
+
Saving 16-bit model...
|
81 |
+
[2024-10-25 18:57:21,740] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step30 is about to be saved!
|
82 |
+
[2024-10-25 18:57:21,741] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin, tag: global_step30
|
83 |
+
[2024-10-25 18:57:21,741] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin...
|
84 |
+
[2024-10-25 18:57:38,185] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin.
|
85 |
+
[2024-10-25 18:57:38,186] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step30 is ready now!
|
86 |
+
Model saved!
|
87 |
+
Saving 16-bit model...
|
88 |
+
[2024-10-25 18:57:46,170] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step30 is about to be saved!
|
89 |
+
[2024-10-25 18:57:46,171] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin, tag: global_step30
|
90 |
+
[2024-10-25 18:57:46,171] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin...
|
91 |
+
[2024-10-25 18:58:04,694] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_60.bin.
|
92 |
+
[2024-10-25 18:58:04,694] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step30 is ready now!
|
93 |
+
Model saved!
|
94 |
+
Model saved!
|
95 |
+
Checkpoint saved.
|
96 |
+
[2024-10-25 18:59:12,139] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
97 |
+
[2024-10-25 18:59:15,443] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
98 |
+
[2024-10-25 18:59:35,560] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
99 |
+
[2024-10-25 18:59:38,857] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
100 |
+
[2024-10-25 19:01:48,333] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[7.656028585269017e-06, 7.656028585269017e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
101 |
+
[2024-10-25 19:01:48,334] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=18.027035548451977, CurrSamplesPerSec=29.34925562325487, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
|
102 |
+
[2024-10-25 19:01:51,645] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
103 |
+
[2024-10-25 19:01:51,646] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=18.74402073106821, CurrSamplesPerSec=30.575559928357755, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
|
104 |
+
[2024-10-25 19:02:12,030] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
105 |
+
[2024-10-25 19:02:15,280] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
106 |
+
[2024-10-25 19:02:58,678] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
107 |
+
[2024-10-25 19:03:01,948] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
108 |
+
[2024-10-25 19:03:22,304] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
109 |
+
[2024-10-25 19:03:25,588] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
110 |
+
[2024-10-25 19:03:45,571] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
111 |
+
[2024-10-25 19:03:48,886] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
112 |
+
Saving checkpoint at step 90 ...
|
113 |
+
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
|
114 |
+
Saving 16-bit model...
|
115 |
+
[2024-10-25 19:04:00,883] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step45 is about to be saved!
|
116 |
+
[2024-10-25 19:04:00,885] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin, tag: global_step45
|
117 |
+
[2024-10-25 19:04:00,885] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin...
|
118 |
+
[2024-10-25 19:04:18,000] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin.
|
119 |
+
[2024-10-25 19:04:18,001] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step45 is ready now!
|
120 |
+
Model saved!
|
121 |
+
Saving 16-bit model...
|
122 |
+
[2024-10-25 19:04:26,278] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step45 is about to be saved!
|
123 |
+
[2024-10-25 19:04:26,279] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin, tag: global_step45
|
124 |
+
[2024-10-25 19:04:26,279] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin...
|
125 |
+
[2024-10-25 19:04:45,735] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_90.bin.
|
126 |
+
[2024-10-25 19:04:45,737] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step45 is ready now!
|
127 |
+
Model saved!
|
128 |
+
Model saved!
|
129 |
+
Checkpoint saved.
|
130 |
+
[2024-10-25 19:06:15,770] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
131 |
+
[2024-10-25 19:06:19,105] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
132 |
+
[2024-10-25 19:06:39,673] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
133 |
+
[2024-10-25 19:06:39,674] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[6.41461888258465e-06, 6.41461888258465e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
134 |
+
[2024-10-25 19:06:39,675] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=18.196574738389252, CurrSamplesPerSec=18.16333305616454, MemAllocated=33.11GB, MaxMemAllocated=47.22GB
|
135 |
+
[2024-10-25 19:06:43,151] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
136 |
+
[2024-10-25 19:06:43,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
137 |
+
[2024-10-25 19:06:43,153] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=18.924891588094603, CurrSamplesPerSec=19.22042501406237, MemAllocated=33.11GB, MaxMemAllocated=47.22GB
|
138 |
+
[2024-10-25 19:08:14,506] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
139 |
+
[2024-10-25 19:08:17,754] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
140 |
+
[2024-10-25 19:09:25,071] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
141 |
+
[2024-10-25 19:09:28,498] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
142 |
+
[2024-10-25 19:10:35,663] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
143 |
+
[2024-10-25 19:10:35,664] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[5.068293368829755e-06, 5.068293368829755e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
144 |
+
[2024-10-25 19:10:35,665] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=18.322746746684054, CurrSamplesPerSec=19.371088818318672, MemAllocated=33.14GB, MaxMemAllocated=47.22GB
|
145 |
+
[2024-10-25 19:10:38,940] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
146 |
+
[2024-10-25 19:10:38,940] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
147 |
+
[2024-10-25 19:10:38,941] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=19.029045377480585, CurrSamplesPerSec=19.888041858045842, MemAllocated=33.14GB, MaxMemAllocated=47.22GB
|
148 |
+
Saving checkpoint at step 120 ...
|
149 |
+
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
|
150 |
+
Saving 16-bit model...
|
151 |
+
[2024-10-25 19:10:54,426] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step60 is about to be saved!
|
152 |
+
[2024-10-25 19:10:54,428] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin, tag: global_step60
|
153 |
+
[2024-10-25 19:10:54,428] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin...
|
154 |
+
[2024-10-25 19:11:13,388] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin.
|
155 |
+
[2024-10-25 19:11:13,390] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step60 is ready now!
|
156 |
+
Model saved!
|
157 |
+
Saving 16-bit model...
|
158 |
+
[2024-10-25 19:11:22,464] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step60 is about to be saved!
|
159 |
+
[2024-10-25 19:11:22,465] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin, tag: global_step60
|
160 |
+
[2024-10-25 19:11:22,466] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin...
|
161 |
+
[2024-10-25 19:11:39,535] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_120.bin.
|
162 |
+
[2024-10-25 19:11:39,535] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step60 is ready now!
|
163 |
+
Model saved!
|
164 |
+
Model saved!
|
165 |
+
Checkpoint saved.
|
166 |
+
[2024-10-25 19:11:59,374] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
167 |
+
[2024-10-25 19:12:02,660] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
168 |
+
[2024-10-25 19:12:22,755] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
169 |
+
[2024-10-25 19:12:26,038] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
170 |
+
[2024-10-25 19:12:46,474] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
171 |
+
[2024-10-25 19:12:49,933] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
172 |
+
[2024-10-25 19:13:10,102] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
173 |
+
[2024-10-25 19:13:13,472] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
174 |
+
[2024-10-25 19:14:20,491] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
175 |
+
[2024-10-25 19:14:23,905] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
176 |
+
[2024-10-25 19:15:07,328] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
177 |
+
[2024-10-25 19:15:10,628] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
178 |
+
[2024-10-25 19:15:30,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[3.7169028483301333e-06, 3.7169028483301333e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
179 |
+
[2024-10-25 19:15:30,619] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=18.421273695026542, CurrSamplesPerSec=19.229435871608736, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
|
180 |
+
[2024-10-25 19:15:33,910] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
181 |
+
[2024-10-25 19:15:33,911] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=19.12443035480107, CurrSamplesPerSec=19.765868285523876, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
|
182 |
+
[2024-10-25 19:15:53,825] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
183 |
+
[2024-10-25 19:15:57,143] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
184 |
+
[2024-10-25 19:16:17,587] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
185 |
+
[2024-10-25 19:16:20,871] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
186 |
+
[2024-10-25 19:16:40,921] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
187 |
+
[2024-10-25 19:16:44,213] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
188 |
+
[2024-10-25 19:17:04,343] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
189 |
+
[2024-10-25 19:17:07,629] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
190 |
+
Saving checkpoint at step 150 ...
|
191 |
+
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
|
192 |
+
Saving 16-bit model...
|
193 |
+
[2024-10-25 19:17:42,434] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step75 is about to be saved!
|
194 |
+
[2024-10-25 19:17:42,436] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin, tag: global_step75
|
195 |
+
[2024-10-25 19:17:42,436] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin...
|
196 |
+
[2024-10-25 19:18:02,484] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin.
|
197 |
+
[2024-10-25 19:18:02,486] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step75 is ready now!
|
198 |
+
Model saved!
|
199 |
+
Saving 16-bit model...
|
200 |
+
[2024-10-25 19:18:11,754] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step75 is about to be saved!
|
201 |
+
[2024-10-25 19:18:11,755] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin, tag: global_step75
|
202 |
+
[2024-10-25 19:18:11,755] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin...
|
203 |
+
[2024-10-25 19:18:28,942] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_150.bin.
|
204 |
+
[2024-10-25 19:18:28,944] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step75 is ready now!
|
205 |
+
Model saved!
|
206 |
+
Model saved!
|
207 |
+
Checkpoint saved.
|
208 |
+
[2024-10-25 19:18:48,635] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
209 |
+
[2024-10-25 19:18:51,897] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
210 |
+
[2024-10-25 19:20:14,068] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
211 |
+
[2024-10-25 19:20:14,069] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[2.4606737737909696e-06, 2.4606737737909696e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
212 |
+
[2024-10-25 19:20:14,070] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=18.583796025985635, CurrSamplesPerSec=18.851139953128662, MemAllocated=33.29GB, MaxMemAllocated=47.22GB
|
213 |
+
[2024-10-25 19:20:17,500] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
214 |
+
[2024-10-25 19:20:17,500] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
|
215 |
+
[2024-10-25 19:20:17,501] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=19.286477338971316, CurrSamplesPerSec=19.326753490145233, MemAllocated=33.29GB, MaxMemAllocated=47.22GB
|
216 |
+
[2024-10-25 19:21:24,498] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
217 |
+
[2024-10-25 19:21:27,888] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
218 |
+
[2024-10-25 19:21:47,842] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:21:51,137] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:22:11,123] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:22:14,405] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:22:57,887] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:23:01,180] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:23:44,745] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:23:48,058] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:24:08,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[1.3927749088052218e-06, 1.3927749088052218e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-25 19:24:08,921] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=18.631064887248623, CurrSamplesPerSec=18.182446863655244, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
[2024-10-25 19:24:12,425] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-25 19:24:12,426] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=19.325484075258544, CurrSamplesPerSec=18.68740115377941, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
Saving checkpoint at step 180 ...
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
Saving 16-bit model...
[2024-10-25 19:24:23,716] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step90 is about to be saved!
[2024-10-25 19:24:23,717] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin, tag: global_step90
[2024-10-25 19:24:23,717] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin...
[2024-10-25 19:24:41,475] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin.
[2024-10-25 19:24:41,476] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step90 is ready now!
Model saved!
Saving 16-bit model...
[2024-10-25 19:24:50,478] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step90 is about to be saved!
[2024-10-25 19:24:50,479] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin, tag: global_step90
[2024-10-25 19:24:50,480] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin...
[2024-10-25 19:25:12,036] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_180.bin.
[2024-10-25 19:25:12,039] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step90 is ready now!
Model saved!
Model saved!
Checkpoint saved.
[2024-10-25 19:27:06,252] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:27:09,573] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:28:40,360] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:28:43,635] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:29:03,608] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:29:03,610] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[5.924074268766422e-07, 5.924074268766422e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-25 19:29:03,610] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=18.66696592651746, CurrSamplesPerSec=19.246283042032882, MemAllocated=33.12GB, MaxMemAllocated=47.22GB
[2024-10-25 19:29:06,852] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:29:06,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-25 19:29:06,853] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=19.358645620408343, CurrSamplesPerSec=20.0045335178505, MemAllocated=33.12GB, MaxMemAllocated=47.22GB
[2024-10-25 19:30:37,368] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:30:40,629] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:31:00,782] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:31:04,118] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
Saving checkpoint at step 210 ...
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
Saving 16-bit model...
[2024-10-25 19:31:14,905] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step105 is about to be saved!
[2024-10-25 19:31:14,906] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin, tag: global_step105
[2024-10-25 19:31:14,907] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin...
[2024-10-25 19:31:30,468] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin.
[2024-10-25 19:31:30,471] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step105 is ready now!
Model saved!
Saving 16-bit model...
[2024-10-25 19:31:37,840] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step105 is about to be saved!
[2024-10-25 19:31:37,842] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin, tag: global_step105
[2024-10-25 19:31:37,842] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin...
[2024-10-25 19:31:59,787] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model_210.bin.
[2024-10-25 19:31:59,790] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step105 is ready now!
Model saved!
Model saved!
Checkpoint saved.
[2024-10-25 19:33:53,254] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[1.1893092270227724e-07, 1.1893092270227724e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-25 19:33:53,255] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=18.708463115192874, CurrSamplesPerSec=19.308977655910134, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
[2024-10-25 19:33:56,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[5e-06, 5e-06], mom=[[0.9, 0.95], [0.9, 0.95]]
[2024-10-25 19:33:56,504] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=19.393453277752876, CurrSamplesPerSec=19.924871782255472, MemAllocated=33.13GB, MaxMemAllocated=47.22GB
[2024-10-25 19:34:40,368] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:34:43,678] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:35:03,709] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:35:07,004] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:35:50,234] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:35:53,480] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:36:13,581] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2024-10-25 19:36:16,864] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
Saving model to "../outputs/ppo_ti2ti_baseline_1025_with_eval" ...
Saving 16-bit model...
[2024-10-25 19:37:19,241] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step118 is about to be saved!
[2024-10-25 19:37:19,242] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin, tag: global_step118
[2024-10-25 19:37:19,243] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin...
[2024-10-25 19:37:40,063] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin.
[2024-10-25 19:37:40,065] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step118 is ready now!
Model saved!
Saving 16-bit model...
[2024-10-25 19:37:49,384] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step118 is about to be saved!
[2024-10-25 19:37:49,385] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin, tag: global_step118
[2024-10-25 19:37:49,386] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin...
[2024-10-25 19:38:13,508] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ../outputs/ppo_ti2ti_baseline_1025_with_eval/pytorch_model.bin.
[2024-10-25 19:38:13,511] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step118 is ready now!
Model saved!
Model saved!
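Note: the repeated stage3.py warning in the log above suggests one mitigation each time, namely flushing the allocator cache on all ranks at the same point in the loop via get_accelerator().empty_cache(). A minimal sketch of what that could look like is below; the loop structure and names (train, dataloader, the flush interval) are illustrative assumptions, not taken from this run's code.

    # Hypothetical sketch of the mitigation suggested by the DeepSpeed warning:
    # flush the CUDA allocator cache on every rank at the same step.
    import deepspeed
    from deepspeed.accelerator import get_accelerator

    def train(engine: deepspeed.DeepSpeedEngine, dataloader):
        for step, batch in enumerate(dataloader):
            loss = engine(**batch).loss   # forward pass; model-specific, illustrative
            engine.backward(loss)
            engine.step()
            # Empty the allocator cache periodically so all ranks flush together
            # instead of triggering mid-step flushes under memory pressure.
            if step % 10 == 0:
                get_accelerator().empty_cache()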
wandb/run-20241025_181518-qbvp2oju/files/requirements.txt
ADDED
@@ -0,0 +1,233 @@
align-anything==0.0.1.dev0
torch==2.4.0
pycparser==2.22
torchvision==0.19.0
multiprocess==0.70.16
braceexpand==0.1.7
lm-format-enforcer==0.10.6
Jinja2==3.1.4
scikit-learn==1.5.2
interegular==0.3.3
starlette==0.38.6
huggingface-hub==0.25.2
pyairports==2.1.1
protobuf==3.20.3
term-image==0.7.2
python-dateutil==2.9.0.post0
identify==2.6.1
tokenizers==0.19.1
tensorboard-data-server==0.7.2
numba==0.60.0
ninja==1.11.1.1
nvidia-cuda-cupti-cu12==12.1.105
diskcache==5.6.3
pycountry==24.6.1
py-cpuinfo==9.0.0
scipy==1.14.1
soxr==0.5.0.post1
prometheus-fastapi-instrumentator==7.0.0
align-anything==0.0.1.dev0
virtualenv==20.26.6
hjson==3.1.0
nvidia-cudnn-cu12==9.1.0.70
termcolor==2.5.0
grpcio==1.66.2
wheel==0.44.0
torchlibrosa==0.1.0
numpy==1.26.4
msgpack==1.1.0
rpds-py==0.20.0
annotated-types==0.7.0
pre_commit==4.0.1
aiohttp==3.10.10
audioread==3.0.1
lazy_loader==0.4
nvidia-cuda-runtime-cu12==12.1.105
filelock==3.16.1
timm==0.6.13
anyio==4.6.0
pydantic_core==2.23.4
idna==3.10
fastapi==0.115.0
wandb==0.18.3
packaging==24.1
yt-dlp==2024.8.6
matplotlib==3.9.2
websockets==12.0
triton==3.0.0
zipp==3.20.2
requests==2.32.3
xxhash==3.5.0
image-reward==1.5
pytorch-fid==0.3.0
imageio-ffmpeg==0.5.1
args==0.1.0
llvmlite==0.43.0
peft==0.13.2
openai==1.51.2
httpx==0.27.2
nvidia-cublas-cu12==12.1.3.1
pytest-split==0.8.0
ruff==0.6.9
sniffio==1.3.1
yarl==1.15.0
pandas==2.2.3
fsspec==2024.6.1
gguf==0.10.0
diffusers==0.30.3
platformdirs==4.3.6
nvidia-cuda-nvrtc-cu12==12.1.105
imageio==2.35.1
Brotli==1.1.0
bitsandbytes==0.44.1
hpsv2==1.2.0
lark==1.2.2
gradio==5.0.2
pydantic==2.9.2
pytz==2024.2
jsonschema-specifications==2024.10.1
deepspeed==0.15.2
cloudpickle==3.1.0
distro==1.9.0
aiohappyeyeballs==2.4.3
Markdown==3.7
docker-pycreds==0.4.0
semantic-version==2.10.0
resampy==0.4.3
urllib3==2.2.3
nodeenv==1.9.1
click==8.1.7
accelerate==1.0.1
dill==0.3.8
setproctitle==1.3.3
httpcore==1.0.6
pooch==1.8.2
importlib_metadata==8.5.0
cfgv==3.4.0
einops==0.8.0
shellingham==1.5.4
pytest==7.2.0
python-dotenv==1.0.1
pydub==0.25.1
kiwisolver==1.4.7
aiofiles==23.2.1
vllm==0.6.2
Werkzeug==3.0.4
tensorboard==2.18.0
joblib==1.4.2
pycryptodomex==3.21.0
moviepy==1.0.3
typing_extensions==4.12.2
mdurl==0.1.2
mistral_common==1.4.4
rich==13.9.2
aiosignal==1.3.1
mmsg==0.1.dev20+g585c63a.d20241012
pillow==10.4.0
prometheus_client==0.21.0
nvidia-cusolver-cu12==11.4.5.107
typer==0.12.5
pyzmq==26.2.0
h11==0.14.0
gitdb==4.0.11
transformers==4.44.0.dev0
nvidia-nccl-cu12==2.20.5
jsonschema==4.23.0
soundfile==0.12.1
contourpy==1.3.0
mutagen==1.47.0
regex==2024.9.11
orjson==3.10.7
fairscale==0.4.13
partial-json-parser==0.2.1.1.post4
outlines==0.1.1.dev4+ga2fd35c
nvidia-curand-cu12==10.3.2.106
pluggy==1.5.0
GitPython==3.1.43
tzdata==2024.2
uvicorn==0.31.1
sentencepiece==0.2.0
decorator==4.4.2
nvidia-nvjitlink-cu12==12.6.77
distlib==0.3.9
uvloop==0.20.0
networkx==3.4.1
wcwidth==0.2.13
opencv-python==4.6.0.66
six==1.16.0
httptools==0.6.1
safetensors==0.4.5
nvidia-nvtx-cu12==12.1.105
markdown-it-py==3.0.0
certifi==2024.8.30
sentry-sdk==2.16.0
outlines_core==0.1.0
threadpoolctl==3.5.0
nvidia-cufft-cu12==11.0.2.54
datasets==3.0.1
cycler==0.12.1
psutil==6.0.0
nvidia-cusparse-cu12==12.1.0.106
shortuuid==1.0.13
ffmpy==0.4.0
xformers==0.0.27.post2
MarkupSafe==2.1.5
tqdm==4.66.5
gradio_client==1.4.0
attrs==24.2.0
optree==0.13.0
PyYAML==6.0.2
clint==0.5.1
torchaudio==2.4.0
frechet-audio-distance==0.1.2
frozenlist==1.4.1
clip==0.2.0
multidict==6.1.0
propcache==0.2.0
librosa==0.10.2.post1
webdataset==0.2.100
ray==2.37.0
pyparsing==3.1.4
pyarrow==17.0.0
tiktoken==0.7.0
watchfiles==0.24.0
proglog==0.1.10
cachetools==5.5.0
fonttools==4.54.1
charset-normalizer==3.4.0
ftfy==6.3.0
referencing==0.35.1
mpmath==1.3.0
msgspec==0.18.6
nvidia-ml-py==12.535.161
smmap==5.0.1
absl-py==2.1.0
python-multipart==0.0.12
Pygments==2.18.0
iniconfig==2.0.0
sympy==1.13.3
pip==24.2
airportsdata==20241001
tomlkit==0.12.0
nest-asyncio==1.6.0
setuptools==75.1.0
jiter==0.6.1
cffi==1.17.1
nvitop==1.3.2
backports.tarfile==1.2.0
zipp==3.19.2
inflect==7.3.1
autocommand==2.2.2
importlib_resources==6.4.0
packaging==24.1
jaraco.context==5.3.0
typeguard==4.3.0
more-itertools==10.3.0
jaraco.text==3.12.1
platformdirs==4.2.2
wheel==0.43.0
typing_extensions==4.12.2
importlib_metadata==8.0.0
tomli==2.0.1
jaraco.collections==5.1.0
jaraco.functools==4.0.1
wandb/run-20241025_181518-qbvp2oju/files/wandb-metadata.json
ADDED
@@ -0,0 +1,106 @@
{
  "os": "Linux-5.4.0-198-generic-x86_64-with-glibc2.31",
  "python": "3.11.10",
  "startedAt": "2024-10-25T18:15:18.983727Z",
  "args": [
    "--local_rank=0",
    "--actor_model_name_or_path",
    "/data/align-anything/hantao/models/0916_ti_to_ti_sft",
    "--reward_model_name_or_path",
    "/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400",
    "--reward_critic_model_name_or_path",
    "/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400",
    "--train_datasets",
    "/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs",
    "--train_template",
    "spavl_ti2ti",
    "--train_data_files",
    "ti2ti_llf_prompt_only_tokenize.pt",
    "--ptx_template",
    "spavl_ti2ti",
    "--ptx_data_files",
    "ti2ti_ptx_27k.pt",
    "--output_dir",
    "../outputs/ppo_ti2ti_baseline_1025_with_eval",
    "--save_interval",
    "30"
  ],
  "program": "-m align_anything.trainers.text_image_to_text_image.ppo",
  "git": {
    "remote": "https://github.com/PKU-Alignment/align-anything.git",
    "commit": "6fde660afc9985323f147930eedf188a5699adc7"
  },
  "email": "[email protected]",
  "root": "../outputs/ppo_ti2ti_baseline_1025_with_eval",
  "host": "lyg0195",
  "username": "align-anything",
  "executable": "/home/align-anything/miniconda3/envs/hantao_cham/bin/python",
  "cpu_count": 64,
  "cpu_count_logical": 128,
  "gpu": "[NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB]",
  "gpu_count": 8,
  "disk": {
    "/": {
      "total": "938421047296",
      "used": "363102883840"
    }
  },
  "memory": {
    "total": "540647575552"
  },
  "cpu": {
    "count": 64,
    "countLogical": 128
  },
  "gpu_nvidia": [
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A100-SXM4-80GB",
      "memoryTotal": "85899345920",
      "cudaCores": 6912,
      "architecture": "Ampere"
    }
  ],
  "cudaVersion": "12.4"
}
wandb/run-20241025_181518-qbvp2oju/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
{"train/reward_critic_lr":5e-06,"train/reward_with_kl_penalty":-0.1957259476184845,"_wandb":{"runtime":4974},"_timestamp":1.729885027561649e+09,"train/reward_value":-0.89080810546875,"train/mean_generated_length":1,"train/actor_lr":0,"_step":236,"train/actor_loss":-0.6950821280479431,"train/max_generated_length":1,"train/kl_divergence":4.3175482749938965,"_runtime":4974.609715617,"train/reward_advantage":0.6950821280479431,"train/reward_return":-0.1957259476184845,"train/reward":-0.109375,"train/step":236,"train/reward_critic_loss":0.5039339065551758}
wandb/run-20241025_181518-qbvp2oju/logs/debug-internal.log
ADDED
@@ -0,0 +1,20 @@
{"time":"2024-10-25T18:15:18.987617848Z","level":"INFO","msg":"using version","core version":"0.18.3"}
{"time":"2024-10-25T18:15:18.987649473Z","level":"INFO","msg":"created symlink","path":"../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug-core.log"}
{"time":"2024-10-25T18:15:18.991400712Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
{"time":"2024-10-25T18:15:19.015335603Z","level":"INFO","msg":"created new stream","id":"qbvp2oju"}
{"time":"2024-10-25T18:15:19.015397376Z","level":"INFO","msg":"stream: started","id":"qbvp2oju"}
{"time":"2024-10-25T18:15:19.015408377Z","level":"INFO","msg":"handler: started","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T18:15:19.015432033Z","level":"INFO","msg":"sender: started","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T18:15:19.015437112Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T18:15:20.634593869Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
{"time":"2024-10-25T18:15:20.637814914Z","level":"INFO","msg":"Starting system monitor"}
{"time":"2024-10-25T19:38:13.593466266Z","level":"INFO","msg":"Stopping system monitor"}
{"time":"2024-10-25T19:38:13.627014655Z","level":"INFO","msg":"Stopped system monitor"}
{"time":"2024-10-25T19:38:14.559855674Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
{"time":"2024-10-25T19:38:14.559906183Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
{"time":"2024-10-25T19:38:15.545457735Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
{"time":"2024-10-25T19:38:17.12240115Z","level":"INFO","msg":"stream: closing","id":"qbvp2oju"}
{"time":"2024-10-25T19:38:17.12243525Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T19:38:17.122460489Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T19:38:17.122575437Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"qbvp2oju"}}
{"time":"2024-10-25T19:38:17.124870943Z","level":"INFO","msg":"stream: closed","id":"qbvp2oju"}
wandb/run-20241025_181518-qbvp2oju/logs/debug.log
ADDED
@@ -0,0 +1,33 @@
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Configure stats pid to 937440
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
2024-10-25 18:15:18,977 WARNING MainThread:937440 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.ppo
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.ppo'}
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_setup.py:_flush():79] Applying login settings: {}
2024-10-25 18:15:18,977 INFO MainThread:937440 [wandb_init.py:_log_setup():532] Logging user logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug.log
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:_log_setup():533] Logging internal logs to ../outputs/ppo_ti2ti_baseline_1025_with_eval/wandb/run-20241025_181518-qbvp2oju/logs/debug-internal.log
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():617] calling init triggers
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3, 'seed': 42, 'per_device_prompt_batch_size': 8, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 2, 'actor_gradient_checkpointing': True, 'critic_gradient_checkpointing': True, 'actor_lr': 1e-05, 'actor_lr_scheduler_type': 'cosine', 'actor_lr_warmup_ratio': 0.03, 'actor_weight_decay': 0.01, 'critic_lr': 5e-06, 'critic_lr_scheduler_type': 'constant', 'critic_lr_warmup_ratio': 0.03, 'critic_weight_decay': 0.0, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'kl_coeff': 0.02, 'clip_range_ratio': 0.2, 'clip_range_score': 50.0, 'clip_range_value': 5.0, 'ptx_coeff': 16.0, 'gamma': 1.0, 'gae_lambda': 0.95, 'normalize_reward': False, 'update_iters': 1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/align-anything/projects/text_image_to_text_image/outputs', 'train_template': 'spavl_ti2ti', 'train_size': 5000, 'train_split': None, 'train_subset': None, 'train_data_files': 'ti2ti_llf_prompt_only_tokenize.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': [], 'ptx_datasets': None, 'ptx_template': 'spavl_ti2ti', 'ptx_size': None, 'ptx_subset': None, 'ptx_split': None, 'ptx_data_files': 'ti2ti_ptx_27k.pt', 'ptx_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'ppo', 'output_dir': '../outputs/ppo_ti2ti_baseline_1025_with_eval', 'cache_dir': None, 'save_interval': 30.0}, 'model_cfgs': {'actor_model_name_or_path': '/data/align-anything/hantao/models/0916_ti_to_ti_sft', 'reward_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'reward_critic_model_name_or_path': '/data/align-anything/hantao/align-anything/outputs/rm_ti2ti_baseline_1025_with_eval/slice_2400', 'trust_remote_code': True, 'model_max_length': 2048, 'temperature': 1.0, 'top_p': 1.0, 'repetition_penalty': 1.0}, 'lora_cfgs': {'use_lora': False, 'task_type': 'TaskType.CAUSAL_LM', 'inference_mode': False, 'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.1, 'target_modules': ['q_proj', 'v_proj'], 'save_full_model': True}, 'bnb_cfgs': {'use_bnb': False, 'load_in_4bit': True, 'load_in_8bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'float16'}, 'special_tokens': None}
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():667] starting backend
2024-10-25 18:15:18,978 INFO MainThread:937440 [wandb_init.py:init():671] sending inform_init request
2024-10-25 18:15:18,982 INFO MainThread:937440 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2024-10-25 18:15:18,983 INFO MainThread:937440 [wandb_init.py:init():684] backend started and connected
2024-10-25 18:15:18,986 INFO MainThread:937440 [wandb_init.py:init():779] updated telemetry
2024-10-25 18:15:18,996 INFO MainThread:937440 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
2024-10-25 18:15:20,628 INFO MainThread:937440 [wandb_init.py:init():863] starting run threads in backend
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_console_start():2465] atexit reg
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2313] redirect: wrap_raw
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2378] Wrapping output streams.
2024-10-25 18:15:20,774 INFO MainThread:937440 [wandb_run.py:_redirect():2403] Redirects installed.
2024-10-25 18:15:20,776 INFO MainThread:937440 [wandb_init.py:init():907] run started, returning control to user process
2024-10-25 19:38:13,587 INFO MainThread:937440 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/qbvp2oju
2024-10-25 19:38:13,590 INFO MainThread:937440 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
2024-10-25 19:38:13,591 INFO MainThread:937440 [wandb_run.py:_restore():2410] restore
2024-10-25 19:38:13,592 INFO MainThread:937440 [wandb_run.py:_restore():2416] restore done
2024-10-25 19:38:17,104 INFO MainThread:937440 [wandb_run.py:_footer_history_summary_info():4049] rendering history
2024-10-25 19:38:17,107 INFO MainThread:937440 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
2024-10-25 19:38:17,119 INFO MainThread:937440 [wandb_run.py:_footer_sync_info():4008] logging synced files
wandb/run-20241025_181518-qbvp2oju/run-qbvp2oju.wandb
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:76ef46f86b21cc7b1a13903cb2d0aa221a447ddcdb4aff5e95e115b373ce98a4
size 4642995