yifanzhang114 committed on Jun 19

Commit

bfe2928

verified ·

1 Parent(s): 0e5943e

Upload folder using huggingface_hub

Browse files

Files changed (26)

args.json +11 -12
global_step190/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
global_step190/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
global_step190/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
global_step190/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
global_step190/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
global_step190/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
global_step190/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
global_step190/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
global_step190/mp_rank_00_model_states.pt +3 -0
latest +1 -1
model-00001-of-00004.safetensors +1 -1
model-00002-of-00004.safetensors +1 -1
model-00003-of-00004.safetensors +1 -1
model-00004-of-00004.safetensors +1 -1
rng_state_0.pth +2 -2
rng_state_1.pth +2 -2
rng_state_2.pth +2 -2
rng_state_3.pth +2 -2
rng_state_4.pth +2 -2
rng_state_5.pth +2 -2
rng_state_6.pth +2 -2
rng_state_7.pth +2 -2
scheduler.pt +1 -1
trainer_state.json +0 -0
training_args.bin +2 -2

args.json CHANGED Viewed

@@ -14,7 +14,7 @@
   "init_strategy": null,
   "template": "qwen2_5_vl",
   "system": "You are a helpful assistant.\n\nSolve the following problem step by step, and optionally write Python code for image manipulation to enhance your reasoning process. The Python code will be executed by an external sandbox, and the processed image or result (wrapped in <sandbox_output></sandbox_output>) can be returned to aid your reasoning and help you arrive at the final answer.\n\n**Reasoning & Image Manipulation (Optional but Encouraged):**\n    * You have the capability to write executable Python code to perform image manipulations (e.g., cropping to a Region of Interest (ROI), resizing, rotation, adjusting contrast) or perform calculation for better reasoning.\n    * The code will be executed in a secure sandbox, and its output will be provided back to you for further analysis.\n    * All Python code snippets **must** be wrapped as follows:\n    <code>\n    ```python\n    # your code.\n    ```\n    </code>\n    * At the end of the code, print the path of the processed image (processed_path) or the result for further processing in a sandbox environment.",
-  "max_length": 20480,
   "truncation_strategy": "delete",
   "max_pixels": null,
   "agent_template": null,
@@ -27,11 +27,10 @@
   "use_chat_template": true,
   "template_backend": "swift",
   "dataset": [
-    "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/wo_system_image_180k_filter_w_image_size_filter_wo_some_code_4o_filter_1.jsonl",
     "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/vstar_2step_nooverlap_training_filtered_70k.jsonl",
     "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/vstar_2step_zoomin_training_filtered_70k.jsonl",
-    "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/wo_system_image_180k_filter_w_image_size_filter_wo_some_code_2round.jsonl",
-    "/mllm_hdd/yfzhang/Agent-R1/construct_data/math_and_chart/mm-eumath/data_gemini_code_processed_swift_train_chunk0_of_1.jsonl"
   ],
   "val_dataset": [],
   "split_dataset_ratio": 0.01,
@@ -92,7 +91,7 @@
   "ddp_backend": null,
   "ignore_args_error": false,
   "use_swift_lora": false,
-  "output_dir": "/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v6-20250611-100511",
   "overwrite_output_dir": false,
   "do_train": false,
   "do_eval": false,
@@ -122,7 +121,7 @@
   "log_level": "passive",
   "log_level_replica": "warning",
   "log_on_each_node": true,
-  "logging_dir": "/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v6-20250611-100511/runs",
   "logging_strategy": "steps",
   "logging_first_step": true,
   "logging_steps": 5,
@@ -152,10 +151,10 @@
   "debug": null,
   "dataloader_drop_last": false,
   "eval_steps": null,
-  "dataloader_num_workers": 0,
   "dataloader_prefetch_factor": null,
   "past_index": -1,
-  "run_name": "/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v6-20250611-100511",
   "disable_tqdm": null,
   "label_names": null,
   "load_best_model_at_end": false,
@@ -209,7 +208,7 @@
   "group_by_length": false,
   "length_column_name": "length",
   "report_to": [
-    "none"
   ],
   "ddp_find_unused_parameters": null,
   "ddp_bucket_cap_mb": null,
@@ -219,7 +218,7 @@
   "skip_memory_metrics": true,
   "use_legacy_prediction_loop": false,
   "push_to_hub": false,
-  "resume_from_checkpoint": "/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v4-20250610-121125/checkpoint-2477",
   "hub_model_id": null,
   "hub_strategy": "every_save",
   "hub_private_repo": null,
@@ -363,9 +362,9 @@
   "local_world_size": 8,
   "model_suffix": "Qwen2.5-VL-7B-Instruct",
   "model_info": "ModelInfo(model_type='qwen2_5_vl', model_dir='/mllm_hdd/yfzhang/data/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/cc594898137f460bfe9f0759e9844b3ce807cfb5', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'type': 'default', 'mrope_section': [16, 24, 24], 'rope_type': 'default'}, config=None, task_type='causal_lm', num_labels=None)",
-  "model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x7fab5d5ace50>, model_arch='qwen2_vl', architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=[])",
   "model_dir": "/mllm_hdd/yfzhang/data/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/cc594898137f460bfe9f0759e9844b3ce807cfb5",
   "hub": "<class 'swift.hub.hub.MSHub'>",
   "evaluation_strategy": "epoch",
-  "training_args": "Seq2SeqTrainingArguments(output_dir='/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v6-20250611-100511', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=16, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v6-20250611-100511/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=5, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=None, dataloader_num_workers=0, dataloader_prefetch_factor=None, past_index=-1, run_name='/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v6-20250611-100511', disable_tqdm=False, 
remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, tp_size=0, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=[], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint='/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v4-20250610-121125/checkpoint-2477', hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, 
include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, optimizer=None, metric_warmup_step=0, fsdp_num=1, acc_steps=1, eval_use_evalscope=False, eval_datasets=[], eval_limit=None, eval_datasets_args=None, eval_generation_config=None, train_type='full', local_repo_path=None, galore_config=None)"
 }

   "init_strategy": null,
   "template": "qwen2_5_vl",
   "system": "You are a helpful assistant.\n\nSolve the following problem step by step, and optionally write Python code for image manipulation to enhance your reasoning process. The Python code will be executed by an external sandbox, and the processed image or result (wrapped in <sandbox_output></sandbox_output>) can be returned to aid your reasoning and help you arrive at the final answer.\n\n**Reasoning & Image Manipulation (Optional but Encouraged):**\n    * You have the capability to write executable Python code to perform image manipulations (e.g., cropping to a Region of Interest (ROI), resizing, rotation, adjusting contrast) or perform calculation for better reasoning.\n    * The code will be executed in a secure sandbox, and its output will be provided back to you for further analysis.\n    * All Python code snippets **must** be wrapped as follows:\n    <code>\n    ```python\n    # your code.\n    ```\n    </code>\n    * At the end of the code, print the path of the processed image (processed_path) or the result for further processing in a sandbox environment.",
+  "max_length": 10240,
   "truncation_strategy": "delete",
   "max_pixels": null,
   "agent_template": null,
   "use_chat_template": true,
   "template_backend": "swift",
   "dataset": [
+    "/mmu_mllm_hdd_2/yifanzhang/agent_latest_code/scripts/training_data/wo_system_image_180k_filter_w_image_size_filter_wo_some_code.jsonl",
     "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/vstar_2step_nooverlap_training_filtered_70k.jsonl",
     "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/vstar_2step_zoomin_training_filtered_70k.jsonl",
+    "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/wo_system_image_180k_filter_w_image_size_filter_wo_some_code_2round.jsonl"
   ],
   "val_dataset": [],
   "split_dataset_ratio": 0.01,
   "ddp_backend": null,
   "ignore_args_error": false,
   "use_swift_lora": false,
+  "output_dir": "/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052",
   "overwrite_output_dir": false,
   "do_train": false,
   "do_eval": false,
   "log_level": "passive",
   "log_level_replica": "warning",
   "log_on_each_node": true,
+  "logging_dir": "/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052/runs",
   "logging_strategy": "steps",
   "logging_first_step": true,
   "logging_steps": 5,
   "debug": null,
   "dataloader_drop_last": false,
   "eval_steps": null,
+  "dataloader_num_workers": 4,
   "dataloader_prefetch_factor": null,
   "past_index": -1,
+  "run_name": "/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052",
   "disable_tqdm": null,
   "label_names": null,
   "load_best_model_at_end": false,
   "group_by_length": false,
   "length_column_name": "length",
   "report_to": [
+    "wandb"
   ],
   "ddp_find_unused_parameters": null,
   "ddp_bucket_cap_mb": null,
   "skip_memory_metrics": true,
   "use_legacy_prediction_loop": false,
   "push_to_hub": false,
+  "resume_from_checkpoint": null,
   "hub_model_id": null,
   "hub_strategy": "every_save",
   "hub_private_repo": null,
   "local_world_size": 8,
   "model_suffix": "Qwen2.5-VL-7B-Instruct",
   "model_info": "ModelInfo(model_type='qwen2_5_vl', model_dir='/mllm_hdd/yfzhang/data/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/cc594898137f460bfe9f0759e9844b3ce807cfb5', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'type': 'default', 'mrope_section': [16, 24, 24], 'rope_type': 'default'}, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x7f24290f93f0>, model_arch='qwen2_vl', architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=[])",
   "model_dir": "/mllm_hdd/yfzhang/data/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/cc594898137f460bfe9f0759e9844b3ce807cfb5",
   "hub": "<class 'swift.hub.hub.MSHub'>",
   "evaluation_strategy": "epoch",
+  "training_args": "Seq2SeqTrainingArguments(output_dir='/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=16, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=5, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=None, dataloader_num_workers=4, dataloader_prefetch_factor=10, past_index=-1, run_name='/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, 
metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, tp_size=0, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', 
auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, optimizer=None, metric_warmup_step=0, fsdp_num=1, acc_steps=1, eval_use_evalscope=False, eval_datasets=[], eval_limit=None, eval_datasets_args=None, eval_generation_config=None, train_type='full', local_repo_path=None, galore_config=None)"
 }

global_step190/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a4903ad115f1781a3edb94ab79f40c6a3f93dd4ca21281cf24f3e88e63f3787
+size 11423432396

global_step190/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1ab83ce8fdb9ac4b4fc0ca1882154c9641aacf81f8a927303d8f58b11e55cd11
+size 11423433356

global_step190/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d94c8b1026b52a76bbc2b5374078fc3effe97c91fe9a005c8e59018a1f6c9e6a
+size 11423433484

global_step190/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:72a5c483c3d8b87af94a89e790569ef73775356b1b708f3389f404515d5c6f4c
+size 11423433484

global_step190/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f37f8310b08ff06d9e444b866683020d05af5b16922c124b402543986809e1c3
+size 11423433804

global_step190/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8aba1f366bef9718320eb5ea41a3adeb51c0faefafce2fe1279ca21e5398a9d0
+size 11423433484

global_step190/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15348b3d37035573222d7e44bc7b95f9c17645567c16d98a809d45c3b84a0481
+size 11423433548

global_step190/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8f1b280d0de63d145ed723c43c1dd68f7b8197c8f793ef24dcdf839e39725505
+size 11423432012

global_step190/mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66e8390a7702e571fa6029e8cdb5bb672474458b5ea3c25162f8cbcfcf4f6d2d
+size 17937826134

latest CHANGED Viewed

	@@ -1 +1 @@
-global_step4954

+global_step190

model-00001-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4c79dd91ba67f29d8710a0839b4df9ef9d3ecd4460e937884cac376012d319cd
 size 4968243304

 version https://git-lfs.github.com/spec/v1
+oid sha256:35dea09eb661006336ce549434a1307ff409722a6cf0a69c0e2fa6e6af2371df
 size 4968243304

model-00002-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:90da2b983f49ee3bb33be4815804e8bff2c994336232e03fed786a7c4f554322
 size 4991495816

 version https://git-lfs.github.com/spec/v1
+oid sha256:92f8355328bb494f9860ad8458b735e7b6db1ad2e0af409acb0f30586872a81b
 size 4991495816

model-00003-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6401cf110c22a98dcbcaa98a89baed6b1ffa9bb0c819094b821c59d1c48dc0a9
 size 4932751040

 version https://git-lfs.github.com/spec/v1
+oid sha256:9e90769ec71fd83d560abd732e2ca5d89c9df92145da2850d057c29a9fcffbce
 size 4932751040

model-00004-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:91d1a621ee97e55e06b1cdeb53880e3949611d02ccc480d7b213d46b5fd9a84f
 size 1691924384

 version https://git-lfs.github.com/spec/v1
+oid sha256:51eee4b151ea3d344bc810054c869d7fac286f9bff5ba4f2cd5deda76f0ccd2f
 size 1691924384

rng_state_0.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:36d2a2034ebb05cb71c510897f2795b31164e50f17b270bc25d2be3ad9a17b22
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:5ff22c72cb50d4b6353cf336b950d2c54115c739606544de9a8d3b0fab0ef188
+size 15920

rng_state_1.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:060dfdb1c49102cbdc8868a6031e68787601b4ccd782f3fb9b137e20c1fd2c7a
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:c253ad05d3ffa98dc1e5291ec640e6158218602254206e6fb97ea82185040775
+size 15920

rng_state_2.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:af01895cb66e616591f2e4baa8dcd8151530eab133c73571ccb31c74f35422ce
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:edcdbb53b869133655de3ceb14325bb161e7aa238640244125393ff4afcb7363
+size 15920

rng_state_3.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:677921992b1e0cef3aee776f245975003d22f51d9bd6ed20f248ded1deb72fa9
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:b095e9807a468e05241a1d389fd326265bee89778655d3c00298c2f6abcee791
+size 15920

rng_state_4.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d69353c629541c690c5471f8ec05fdab2bfecf3d37afaa436bc45939da6db68f
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:a4da3805c54028ef89f9cce32b640836c54b432b251fae2dea3d0182fa96f403
+size 15920

rng_state_5.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8e40ba6668cc03c9162c68a933d164bf38ae2d196a9a6fec03ae615491201185
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:08bb3c8d2c1053a6b93a1342acb589ea4695b3fdb4dfeaf3675191c7e3390d3d
+size 15920

rng_state_6.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:870968fea834e24b2e099cf3e4fe1e3fb8caf38d8f8e5b790d7d47386d4d05f5
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:3cfebda0f5a673869a40a8a82d7e308c843f9b55ec287861a6a0670b4c43c9d4
+size 15920

rng_state_7.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e9e19618bee7c6ef43256fea25abe19bca88535eb1e7dc213cde8929ae4e8180
-size 15984

 version https://git-lfs.github.com/spec/v1
+oid sha256:34bc6aae481e0bf889f93cc019a7692fbdaba7925f341ec202dd711aa1d2a724
+size 15920

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:60d4ee630da491260156b1eb0cb5f833bea3b6c7bc13b43d5f68f80d26b81175
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:d702a8a3f8a6bd01d9c0b9b47d00fb13493b6625b2b130d83744041615bf59ad
 size 1064

trainer_state.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b4724c8db9bbee65453bfd76967efff3e6f43c5d47668efd8102b96225089149
-size 8120

 version https://git-lfs.github.com/spec/v1
+oid sha256:dd32b1597af89117014632dc1c0dfbea5e1e857567df2a15713b7750a4acf718
+size 8056