Upload folder using huggingface_hub
- args.json +11 -12
- global_step190/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- global_step190/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
- global_step190/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
- global_step190/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
- global_step190/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
- global_step190/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
- global_step190/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
- global_step190/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
- global_step190/mp_rank_00_model_states.pt +3 -0
- latest +1 -1
- model-00001-of-00004.safetensors +1 -1
- model-00002-of-00004.safetensors +1 -1
- model-00003-of-00004.safetensors +1 -1
- model-00004-of-00004.safetensors +1 -1
- rng_state_0.pth +2 -2
- rng_state_1.pth +2 -2
- rng_state_2.pth +2 -2
- rng_state_3.pth +2 -2
- rng_state_4.pth +2 -2
- rng_state_5.pth +2 -2
- rng_state_6.pth +2 -2
- rng_state_7.pth +2 -2
- scheduler.pt +1 -1
- trainer_state.json +0 -0
- training_args.bin +2 -2
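
The commit title indicates the folder was pushed with the `huggingface_hub` client. A minimal sketch of how such an upload is typically produced; the local path and repository id below are hypothetical placeholders, not values recorded in this commit.

```python
from huggingface_hub import HfApi

# Assumes you are already authenticated (e.g. via `huggingface-cli login`).
api = HfApi()

# Hypothetical local checkpoint folder and target repo id.
api.upload_folder(
    folder_path="./output/checkpoint-190",
    repo_id="your-username/your-checkpoint-repo",
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```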
args.json
CHANGED
@@ -14,7 +14,7 @@
 "init_strategy": null,
 "template": "qwen2_5_vl",
 "system": "You are a helpful assistant.\n\nSolve the following problem step by step, and optionally write Python code for image manipulation to enhance your reasoning process. The Python code will be executed by an external sandbox, and the processed image or result (wrapped in <sandbox_output></sandbox_output>) can be returned to aid your reasoning and help you arrive at the final answer.\n\n**Reasoning & Image Manipulation (Optional but Encouraged):**\n * You have the capability to write executable Python code to perform image manipulations (e.g., cropping to a Region of Interest (ROI), resizing, rotation, adjusting contrast) or perform calculation for better reasoning.\n * The code will be executed in a secure sandbox, and its output will be provided back to you for further analysis.\n * All Python code snippets **must** be wrapped as follows:\n <code>\n ```python\n # your code.\n ```\n </code>\n * At the end of the code, print the path of the processed image (processed_path) or the result for further processing in a sandbox environment.",
-"max_length":
+"max_length": 10240,
 "truncation_strategy": "delete",
 "max_pixels": null,
 "agent_template": null,
@@ -27,11 +27,10 @@
 "use_chat_template": true,
 "template_backend": "swift",
 "dataset": [
-"/
+"/mmu_mllm_hdd_2/yifanzhang/agent_latest_code/scripts/training_data/wo_system_image_180k_filter_w_image_size_filter_wo_some_code.jsonl",
 "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/vstar_2step_nooverlap_training_filtered_70k.jsonl",
 "/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/vstar_2step_zoomin_training_filtered_70k.jsonl",
-"/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/wo_system_image_180k_filter_w_image_size_filter_wo_some_code_2round.jsonl"
-"/mllm_hdd/yfzhang/Agent-R1/construct_data/math_and_chart/mm-eumath/data_gemini_code_processed_swift_train_chunk0_of_1.jsonl"
+"/mllm_hdd/yfzhang/Agent-R1/agent_latest_code/scripts/training_data/wo_system_image_180k_filter_w_image_size_filter_wo_some_code_2round.jsonl"
 ],
 "val_dataset": [],
 "split_dataset_ratio": 0.01,
@@ -92,7 +91,7 @@
 "ddp_backend": null,
 "ignore_args_error": false,
 "use_swift_lora": false,
-"output_dir": "/mmu_mllm_hdd_2/yifanzhang/models/
+"output_dir": "/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052",
 "overwrite_output_dir": false,
 "do_train": false,
 "do_eval": false,
@@ -122,7 +121,7 @@
 "log_level": "passive",
 "log_level_replica": "warning",
 "log_on_each_node": true,
-"logging_dir": "/mmu_mllm_hdd_2/yifanzhang/models/
+"logging_dir": "/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052/runs",
 "logging_strategy": "steps",
 "logging_first_step": true,
 "logging_steps": 5,
@@ -152,10 +151,10 @@
 "debug": null,
 "dataloader_drop_last": false,
 "eval_steps": null,
-"dataloader_num_workers":
+"dataloader_num_workers": 4,
 "dataloader_prefetch_factor": null,
 "past_index": -1,
-"run_name": "/mmu_mllm_hdd_2/yifanzhang/models/
+"run_name": "/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052",
 "disable_tqdm": null,
 "label_names": null,
 "load_best_model_at_end": false,
@@ -209,7 +208,7 @@
 "group_by_length": false,
 "length_column_name": "length",
 "report_to": [
-"
+"wandb"
 ],
 "ddp_find_unused_parameters": null,
 "ddp_bucket_cap_mb": null,
@@ -219,7 +218,7 @@
 "skip_memory_metrics": true,
 "use_legacy_prediction_loop": false,
 "push_to_hub": false,
-"resume_from_checkpoint":
+"resume_from_checkpoint": null,
 "hub_model_id": null,
 "hub_strategy": "every_save",
 "hub_private_repo": null,
@@ -363,9 +362,9 @@
 "local_world_size": 8,
 "model_suffix": "Qwen2.5-VL-7B-Instruct",
 "model_info": "ModelInfo(model_type='qwen2_5_vl', model_dir='/mllm_hdd/yfzhang/data/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/cc594898137f460bfe9f0759e9844b3ce807cfb5', torch_dtype=torch.bfloat16, max_model_len=128000, quant_method=None, quant_bits=None, rope_scaling={'type': 'default', 'mrope_section': [16, 24, 24], 'rope_type': 'default'}, config=None, task_type='causal_lm', num_labels=None)",
-"model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at
+"model_meta": "ModelMeta(model_type='qwen2_5_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-VL-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen2_5_vl', get_function=<function get_model_tokenizer_qwen2_5_vl at 0x7f24290f93f0>, model_arch='qwen2_vl', architectures=['Qwen2_5_VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'], tags=[])",
 "model_dir": "/mllm_hdd/yfzhang/data/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/cc594898137f460bfe9f0759e9844b3ce807cfb5",
 "hub": "<class 'swift.hub.hub.MSHub'>",
 "evaluation_strategy": "epoch",
-"training_args": "Seq2SeqTrainingArguments(output_dir='/mmu_mllm_hdd_2/yifanzhang/models/
+"training_args": "Seq2SeqTrainingArguments(output_dir='/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=16, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=500, save_total_limit=5, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=None, dataloader_num_workers=4, dataloader_prefetch_factor=10, past_index=-1, run_name='/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, tp_size=0, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, max_epochs=None, aligner_lr=None, vit_lr=None, optimizer=None, metric_warmup_step=0, fsdp_num=1, acc_steps=1, eval_use_evalscope=False, eval_datasets=[], eval_limit=None, eval_datasets_args=None, eval_generation_config=None, train_type='full', local_repo_path=None, galore_config=None)"
 }
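
For context, the `system` prompt shown in the args.json diff above requires every Python snippet to be wrapped in `<code>` tags around a fenced block and to end by printing the processed image path (`processed_path`). A minimal illustration of that format, assuming a hypothetical input image and crop region:

<code>
```python
from PIL import Image

# Hypothetical input image and region of interest (left, upper, right, lower).
image = Image.open("input.jpg")
roi = image.crop((100, 50, 400, 300))

# Save the crop and print its path so the sandbox can return it.
processed_path = "roi.jpg"
roi.save(processed_path)
print(processed_path)
```
</code>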
global_step190/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a4903ad115f1781a3edb94ab79f40c6a3f93dd4ca21281cf24f3e88e63f3787
+size 11423432396

global_step190/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ab83ce8fdb9ac4b4fc0ca1882154c9641aacf81f8a927303d8f58b11e55cd11
+size 11423433356

global_step190/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d94c8b1026b52a76bbc2b5374078fc3effe97c91fe9a005c8e59018a1f6c9e6a
+size 11423433484

global_step190/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72a5c483c3d8b87af94a89e790569ef73775356b1b708f3389f404515d5c6f4c
+size 11423433484

global_step190/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f37f8310b08ff06d9e444b866683020d05af5b16922c124b402543986809e1c3
+size 11423433804

global_step190/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8aba1f366bef9718320eb5ea41a3adeb51c0faefafce2fe1279ca21e5398a9d0
+size 11423433484

global_step190/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15348b3d37035573222d7e44bc7b95f9c17645567c16d98a809d45c3b84a0481
+size 11423433548

global_step190/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f1b280d0de63d145ed723c43c1dd68f7b8197c8f793ef24dcdf839e39725505
+size 11423432012

global_step190/mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:66e8390a7702e571fa6029e8cdb5bb672474458b5ea3c25162f8cbcfcf4f6d2d
+size 17937826134
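
The newly added `global_step190/` files are DeepSpeed ZeRO stage-2 checkpoint shards: one `bf16_zero_pp_rank_*` optimizer-state file per rank of the 8-GPU run recorded in `args.json`, plus the module states in `mp_rank_00_model_states.pt`. If a single fp32 state dict is needed outside DeepSpeed, DeepSpeed's consolidation helper can merge the shards; a sketch assuming DeepSpeed is installed and the checkpoint directory has been downloaded locally (the path is hypothetical):

```python
import torch
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

# Hypothetical local copy of this repository's checkpoint directory,
# i.e. the folder containing `global_step190/` and the `latest` tag file.
checkpoint_dir = "./checkpoint"

# Merge the per-rank ZeRO shards into one fp32 state dict on CPU.
state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag="global_step190")
torch.save(state_dict, "pytorch_model_fp32.bin")
```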
latest
CHANGED
@@ -1 +1 @@
-
+global_step190
model-00001-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:35dea09eb661006336ce549434a1307ff409722a6cf0a69c0e2fa6e6af2371df
 size 4968243304

model-00002-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:92f8355328bb494f9860ad8458b735e7b6db1ad2e0af409acb0f30586872a81b
 size 4991495816

model-00003-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9e90769ec71fd83d560abd732e2ca5d89c9df92145da2850d057c29a9fcffbce
 size 4932751040

model-00004-of-00004.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:51eee4b151ea3d344bc810054c869d7fac286f9bff5ba4f2cd5deda76f0ccd2f
 size 1691924384
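
The four updated `model-*.safetensors` shards carry the fine-tuned weights; per the `model_meta` entry in `args.json`, the architecture is `Qwen2_5_VLForConditionalGeneration` and the model expects `transformers>=4.49` plus `qwen_vl_utils`. A sketch of loading the shards from this repository, with a hypothetical repo id standing in for the actual one:

```python
import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

# Hypothetical Hub repo id for this upload; replace with the real one.
repo_id = "your-username/qwen2_5-vl-7b-tool-sft"

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # matches the bf16 training setup in args.json
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(repo_id)
```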
rng_state_0.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:5ff22c72cb50d4b6353cf336b950d2c54115c739606544de9a8d3b0fab0ef188
+size 15920

rng_state_1.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c253ad05d3ffa98dc1e5291ec640e6158218602254206e6fb97ea82185040775
+size 15920

rng_state_2.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:edcdbb53b869133655de3ceb14325bb161e7aa238640244125393ff4afcb7363
+size 15920

rng_state_3.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b095e9807a468e05241a1d389fd326265bee89778655d3c00298c2f6abcee791
+size 15920

rng_state_4.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:a4da3805c54028ef89f9cce32b640836c54b432b251fae2dea3d0182fa96f403
+size 15920

rng_state_5.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:08bb3c8d2c1053a6b93a1342acb589ea4695b3fdb4dfeaf3675191c7e3390d3d
+size 15920

rng_state_6.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3cfebda0f5a673869a40a8a82d7e308c843f9b55ec287861a6a0670b4c43c9d4
+size 15920

rng_state_7.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:34bc6aae481e0bf889f93cc019a7692fbdaba7925f341ec202dd711aa1d2a724
+size 15920
scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d702a8a3f8a6bd01d9c0b9b47d00fb13493b6625b2b130d83744041615bf59ad
 size 1064
trainer_state.json
CHANGED
The diff for this file is too large to render; see the raw diff.
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:dd32b1597af89117014632dc1c0dfbea5e1e857567df2a15713b7750a4acf718
+size 8056
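
Each binary above is stored as a Git LFS pointer (`version` / `oid sha256:` / `size` lines), so a downloaded blob can be checked against its pointer. A minimal verification sketch using the `training_args.bin` pointer values shown in this diff; the local path is a hypothetical download location:

```python
import hashlib
import os

# Values from the LFS pointer above; adjust the path to where the file was downloaded.
path = "training_args.bin"
expected_oid = "dd32b1597af89117014632dc1c0dfbea5e1e857567df2a15713b7750a4acf718"
expected_size = 8056

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert sha.hexdigest() == expected_oid, "sha256 mismatch"
print("LFS pointer verified")
```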