yifanzhang114 commited on
Commit
bfe2928
·
verified ·
1 Parent(s): 0e5943e

Upload folder using huggingface_hub

Browse files
args.json CHANGED
@@ -14,7 +14,7 @@
14
  "init_strategy": null,
15
  "template": "qwen2_5_vl",
16
  "system": "You are a helpful assistant.\n\nSolve the following problem step by step, and optionally write Python code for image manipulation to enhance your reasoning process. The Python code will be executed by an external sandbox, and the processed image or result (wrapped in <sandbox_output></sandbox_output>) can be returned to aid your reasoning and help you arrive at the final answer.\n\n**Reasoning & Image Manipulation (Optional but Encouraged):**\n * You have the capability to write executable Python code to perform image manipulations (e.g., cropping to a Region of Interest (ROI), resizing, rotation, adjusting contrast) or perform calculation for better reasoning.\n * The code will be executed in a secure sandbox, and its output will be provided back to you for further analysis.\n * All Python code snippets **must** be wrapped as follows:\n <code>\n ```python\n # your code.\n ```\n </code>\n * At the end of the code, print the path of the processed image (processed_path) or the result for further processing in a sandbox environment.",
17
- "max_length": 20480,
18
  "truncation_strategy": "delete",
19
  "max_pixels": null,
20
  "agent_template": null,
@@ -27,11 +27,10 @@
27
  "use_chat_template": true,
28
  "template_backend": "swift",
29
  "dataset": [
30
- "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/wo_system_image_180k_filter_w_image_size_filter_wo_some_code_4o_filter_1.jsonl",
31
  "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/vstar_2step_nooverlap_training_filtered_70k.jsonl",
32
  "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/vstar_2step_zoomin_training_filtered_70k.jsonl",
33
- "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/wo_system_image_180k_filter_w_image_size_filter_wo_some_code_2round.jsonl",
34
- "/mllm_hdd/yfzhang/Agent-R1/construct_data/math_and_chart/mm-eumath/data_gemini_code_processed_swift_train_chunk0_of_1.jsonl"
35
  ],
36
  "val_dataset": [],
37
  "split_dataset_ratio": 0.01,
@@ -92,7 +91,7 @@
92
  "ddp_backend": null,
93
  "ignore_args_error": false,
94
  "use_swift_lora": false,
95
- "output_dir": "/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v6-20250611-100511",
96
  "overwrite_output_dir": false,
97
  "do_train": false,
98
  "do_eval": false,
@@ -122,7 +121,7 @@
122
  "log_level": "passive",
123
  "log_level_replica": "warning",
124
  "log_on_each_node": true,
125
- "logging_dir": "/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v6-20250611-100511/runs",
126
  "logging_strategy": "steps",
127
  "logging_first_step": true,
128
  "logging_steps": 5,
@@ -152,10 +151,10 @@
152
  "debug": null,
153
  "dataloader_drop_last": false,
154
  "eval_steps": null,
155
- "dataloader_num_workers": 0,
156
  "dataloader_prefetch_factor": null,
157
  "past_index": -1,
158
- "run_name": "/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v6-20250611-100511",
159
  "disable_tqdm": null,
160
  "label_names": null,
161
  "load_best_model_at_end": false,
@@ -209,7 +208,7 @@
209
  "group_by_length": false,
210
  "length_column_name": "length",
211
  "report_to": [
212
- "none"
213
  ],
214
  "ddp_find_unused_parameters": null,
215
  "ddp_bucket_cap_mb": null,
@@ -219,7 +218,7 @@
219
  "skip_memory_metrics": true,
220
  "use_legacy_prediction_loop": false,
221
  "push_to_hub": false,
222
- "resume_from_checkpoint": "/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v4-20250610-121125/checkpoint-2477",
223
  "hub_model_id": null,
224
  "hub_strategy": "every_save",
225
  "hub_private_repo": null,
@@ -363,9 +362,9 @@
363
  "local_world_size": 8,
364
  "model_suffix": "Qwen2.5-VL-7B-Instruct",
365
  "model_info": "ModelInfo(model_type='qwen2_5_vl', model_dir='/mllm_hdd/yfzhang/data/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/cc594898137f460bfe9f0759e9844b3ce807cfb5', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'type': 'default', 'mrope_section': [16, 24, 24], 'rope_type': 'default'}, config=None, task_type='causal_lm', num_labels=None)",
366
- "model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x7fab5d5ace50>, model_arch='qwen2_vl', architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=[])",
367
  "model_dir": "/mllm_hdd/yfzhang/data/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/cc594898137f460bfe9f0759e9844b3ce807cfb5",
368
  "hub": "<class 'swift.hub.hub.MSHub'>",
369
  "evaluation_strategy": "epoch",
370
- "training_args": "Seq2SeqTrainingArguments(output_dir='/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v6-20250611-100511', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=16, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v6-20250611-100511/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=5, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=None, dataloader_num_workers=0, dataloader_prefetch_factor=None, past_index=-1, run_name='/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v6-20250611-100511', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, tp_size=0, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=[], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint='/mmu_mllm_hdd_2/yifanzhang/models/qwen_tool_all_data_200k_w_imagesize_3epoch_4096_all_2round_code_4ofilter1/v4-20250610-121125/checkpoint-2477', hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, optimizer=None, metric_warmup_step=0, fsdp_num=1, acc_steps=1, eval_use_evalscope=False, eval_datasets=[], eval_limit=None, eval_datasets_args=None, eval_generation_config=None, train_type='full', local_repo_path=None, galore_config=None)"
371
  }
 
14
  "init_strategy": null,
15
  "template": "qwen2_5_vl",
16
  "system": "You are a helpful assistant.\n\nSolve the following problem step by step, and optionally write Python code for image manipulation to enhance your reasoning process. The Python code will be executed by an external sandbox, and the processed image or result (wrapped in <sandbox_output></sandbox_output>) can be returned to aid your reasoning and help you arrive at the final answer.\n\n**Reasoning & Image Manipulation (Optional but Encouraged):**\n * You have the capability to write executable Python code to perform image manipulations (e.g., cropping to a Region of Interest (ROI), resizing, rotation, adjusting contrast) or perform calculation for better reasoning.\n * The code will be executed in a secure sandbox, and its output will be provided back to you for further analysis.\n * All Python code snippets **must** be wrapped as follows:\n <code>\n ```python\n # your code.\n ```\n </code>\n * At the end of the code, print the path of the processed image (processed_path) or the result for further processing in a sandbox environment.",
17
+ "max_length": 10240,
18
  "truncation_strategy": "delete",
19
  "max_pixels": null,
20
  "agent_template": null,
 
27
  "use_chat_template": true,
28
  "template_backend": "swift",
29
  "dataset": [
30
+ "/mmu_mllm_hdd_2/yifanzhang/agent_latest_code/scripts/training_data/wo_system_image_180k_filter_w_image_size_filter_wo_some_code.jsonl",
31
  "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/vstar_2step_nooverlap_training_filtered_70k.jsonl",
32
  "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/vstar_2step_zoomin_training_filtered_70k.jsonl",
33
+ "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/wo_system_image_180k_filter_w_image_size_filter_wo_some_code_2round.jsonl"
 
34
  ],
35
  "val_dataset": [],
36
  "split_dataset_ratio": 0.01,
 
91
  "ddp_backend": null,
92
  "ignore_args_error": false,
93
  "use_swift_lora": false,
94
+ "output_dir": "/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052",
95
  "overwrite_output_dir": false,
96
  "do_train": false,
97
  "do_eval": false,
 
121
  "log_level": "passive",
122
  "log_level_replica": "warning",
123
  "log_on_each_node": true,
124
+ "logging_dir": "/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052/runs",
125
  "logging_strategy": "steps",
126
  "logging_first_step": true,
127
  "logging_steps": 5,
 
151
  "debug": null,
152
  "dataloader_drop_last": false,
153
  "eval_steps": null,
154
+ "dataloader_num_workers": 4,
155
  "dataloader_prefetch_factor": null,
156
  "past_index": -1,
157
+ "run_name": "/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052",
158
  "disable_tqdm": null,
159
  "label_names": null,
160
  "load_best_model_at_end": false,
 
208
  "group_by_length": false,
209
  "length_column_name": "length",
210
  "report_to": [
211
+ "wandb"
212
  ],
213
  "ddp_find_unused_parameters": null,
214
  "ddp_bucket_cap_mb": null,
 
218
  "skip_memory_metrics": true,
219
  "use_legacy_prediction_loop": false,
220
  "push_to_hub": false,
221
+ "resume_from_checkpoint": null,
222
  "hub_model_id": null,
223
  "hub_strategy": "every_save",
224
  "hub_private_repo": null,
 
362
  "local_world_size": 8,
363
  "model_suffix": "Qwen2.5-VL-7B-Instruct",
364
  "model_info": "ModelInfo(model_type='qwen2_5_vl', model_dir='/mllm_hdd/yfzhang/data/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/cc594898137f460bfe9f0759e9844b3ce807cfb5', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'type': 'default', 'mrope_section': [16, 24, 24], 'rope_type': 'default'}, config=None, task_type='causal_lm', num_labels=None)",
365
+ "model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x7f24290f93f0>, model_arch='qwen2_vl', architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=[])",
366
  "model_dir": "/mllm_hdd/yfzhang/data/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/cc594898137f460bfe9f0759e9844b3ce807cfb5",
367
  "hub": "<class 'swift.hub.hub.MSHub'>",
368
  "evaluation_strategy": "epoch",
369
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=16, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=5, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=None, dataloader_num_workers=4, dataloader_prefetch_factor=10, past_index=-1, run_name='/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, tp_size=0, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, optimizer=None, metric_warmup_step=0, fsdp_num=1, acc_steps=1, eval_use_evalscope=False, eval_datasets=[], eval_limit=None, eval_datasets_args=None, eval_generation_config=None, train_type='full', local_repo_path=None, galore_config=None)"
370
  }
global_step190/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a4903ad115f1781a3edb94ab79f40c6a3f93dd4ca21281cf24f3e88e63f3787
3
+ size 11423432396
global_step190/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ab83ce8fdb9ac4b4fc0ca1882154c9641aacf81f8a927303d8f58b11e55cd11
3
+ size 11423433356
global_step190/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d94c8b1026b52a76bbc2b5374078fc3effe97c91fe9a005c8e59018a1f6c9e6a
3
+ size 11423433484
global_step190/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72a5c483c3d8b87af94a89e790569ef73775356b1b708f3389f404515d5c6f4c
3
+ size 11423433484
global_step190/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f37f8310b08ff06d9e444b866683020d05af5b16922c124b402543986809e1c3
3
+ size 11423433804
global_step190/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8aba1f366bef9718320eb5ea41a3adeb51c0faefafce2fe1279ca21e5398a9d0
3
+ size 11423433484
global_step190/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15348b3d37035573222d7e44bc7b95f9c17645567c16d98a809d45c3b84a0481
3
+ size 11423433548
global_step190/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f1b280d0de63d145ed723c43c1dd68f7b8197c8f793ef24dcdf839e39725505
3
+ size 11423432012
global_step190/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66e8390a7702e571fa6029e8cdb5bb672474458b5ea3c25162f8cbcfcf4f6d2d
3
+ size 17937826134
latest CHANGED
@@ -1 +1 @@
1
- global_step4954
 
1
+ global_step190
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c79dd91ba67f29d8710a0839b4df9ef9d3ecd4460e937884cac376012d319cd
3
  size 4968243304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35dea09eb661006336ce549434a1307ff409722a6cf0a69c0e2fa6e6af2371df
3
  size 4968243304
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90da2b983f49ee3bb33be4815804e8bff2c994336232e03fed786a7c4f554322
3
  size 4991495816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92f8355328bb494f9860ad8458b735e7b6db1ad2e0af409acb0f30586872a81b
3
  size 4991495816
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6401cf110c22a98dcbcaa98a89baed6b1ffa9bb0c819094b821c59d1c48dc0a9
3
  size 4932751040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e90769ec71fd83d560abd732e2ca5d89c9df92145da2850d057c29a9fcffbce
3
  size 4932751040
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91d1a621ee97e55e06b1cdeb53880e3949611d02ccc480d7b213d46b5fd9a84f
3
  size 1691924384
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51eee4b151ea3d344bc810054c869d7fac286f9bff5ba4f2cd5deda76f0ccd2f
3
  size 1691924384
rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36d2a2034ebb05cb71c510897f2795b31164e50f17b270bc25d2be3ad9a17b22
3
- size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ff22c72cb50d4b6353cf336b950d2c54115c739606544de9a8d3b0fab0ef188
3
+ size 15920
rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:060dfdb1c49102cbdc8868a6031e68787601b4ccd782f3fb9b137e20c1fd2c7a
3
- size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c253ad05d3ffa98dc1e5291ec640e6158218602254206e6fb97ea82185040775
3
+ size 15920
rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:af01895cb66e616591f2e4baa8dcd8151530eab133c73571ccb31c74f35422ce
3
- size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edcdbb53b869133655de3ceb14325bb161e7aa238640244125393ff4afcb7363
3
+ size 15920
rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:677921992b1e0cef3aee776f245975003d22f51d9bd6ed20f248ded1deb72fa9
3
- size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b095e9807a468e05241a1d389fd326265bee89778655d3c00298c2f6abcee791
3
+ size 15920
rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d69353c629541c690c5471f8ec05fdab2bfecf3d37afaa436bc45939da6db68f
3
- size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4da3805c54028ef89f9cce32b640836c54b432b251fae2dea3d0182fa96f403
3
+ size 15920
rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e40ba6668cc03c9162c68a933d164bf38ae2d196a9a6fec03ae615491201185
3
- size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08bb3c8d2c1053a6b93a1342acb589ea4695b3fdb4dfeaf3675191c7e3390d3d
3
+ size 15920
rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:870968fea834e24b2e099cf3e4fe1e3fb8caf38d8f8e5b790d7d47386d4d05f5
3
- size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cfebda0f5a673869a40a8a82d7e308c843f9b55ec287861a6a0670b4c43c9d4
3
+ size 15920
rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e9e19618bee7c6ef43256fea25abe19bca88535eb1e7dc213cde8929ae4e8180
3
- size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34bc6aae481e0bf889f93cc019a7692fbdaba7925f341ec202dd711aa1d2a724
3
+ size 15920
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60d4ee630da491260156b1eb0cb5f833bea3b6c7bc13b43d5f68f80d26b81175
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d702a8a3f8a6bd01d9c0b9b47d00fb13493b6625b2b130d83744041615bf59ad
3
  size 1064
trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4724c8db9bbee65453bfd76967efff3e6f43c5d47668efd8102b96225089149
3
- size 8120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd32b1597af89117014632dc1c0dfbea5e1e857567df2a15713b7750a4acf718
3
+ size 8056